Merge pull request #191 from peastman/master

Further improvements to CPU platform

Merge pull request #191 from peastman/master
Further improvements to CPU platform
eda69200 · peastman · 9c656d86 · a69227ca · eda69200 · eda69200
Commit eda69200 authored Oct 30, 2013 by peastman
11 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -296,9 +296,9 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
 # On Linux need to link to libdl
 FIND_LIBRARY(DL_LIBRARY dl)
 IF(DL_LIBRARY)
-  TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${DL_LIBRARY})
+  TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${DL_LIBRARY} ${PTHREADS_LIB})
  IF(OPENMM_BUILD_STATIC_LIB)
-    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${DL_LIBRARY})
+    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${DL_LIBRARY} ${PTHREADS_LIB})
  ENDIF(OPENMM_BUILD_STATIC_LIB)
 ENDIF(DL_LIBRARY)
 IF(WIN32)

--- a/openmmapi/include/openmm/internal/ThreadPool.h
+++ b/openmmapi/include/openmm/internal/ThreadPool.h
+#ifndef OPENMM_THREAD_POOL_H_
+#define OPENMM_THREAD_POOL_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "windowsExport.h"
+#include <pthread.h>
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * A ThreadPool creates a set of worker threads that can be used to execute tasks in parallel.
+ * After creating a ThreadPool, call execute() to start a task running then waitForThreads()
+ * to block until all threads have finished.  You also can synchronize the threads in the middle
+ * of the task by having them call syncThreads().  In this case, the parent thread should call
+ * waitForThreads() an additional time; each call waits until all worker threads have reached the
+ * next syncThreads(), and the final call waits until they exit from the Task's execute() method.
+ * After calling waitForThreads() to block at a synchronization point, the parent thread should
+ * call resumeThreads() to instruct the worker threads to resume.
+ */
+class OPENMM_EXPORT ThreadPool {
+public:
+    class Task;
+    class ThreadData;
+    ThreadPool();
+    ~ThreadPool();
+    /**
+     * Get the number of worker threads in the pool.
+     */
+    int getNumThreads() const;
+    /**
+     * Execute a Task in parallel on the worker threads.
+     */
+    void execute(Task& task);
+    /**
+     * This is called by the worker threads to block until all threads have reached the same point
+     * and the master thread instructs them to continue by calling resumeThreads().
+     */
+    void syncThreads();
+    /**
+     * This is called by the master thread to wait until all threads have completed the Task.  Alternatively,
+     * if the threads call syncThreads(), this blocks until all threads have reached the synchronization point.
+     */
+    void waitForThreads();
+    /**
+     * Instruct the threads to resume running after blocking at a synchronization point.
+     */
+    void resumeThreads();
+private:
+    bool isDeleted;
+    int numThreads, waitCount;
+    std::vector<pthread_t> thread;
+    std::vector<ThreadData*> threadData;
+    pthread_cond_t startCondition, endCondition;
+    pthread_mutex_t lock;
+};
+
+/**
+ * This defines a task that can be executed in parallel by the worker threads.
+ */
+class OPENMM_EXPORT ThreadPool::Task {
+public:
+    /**
+     * Execute the task on each thread.
+     * 
+     * @param pool         the ThreadPool being used to execute the task
+     * @param threadIndex  the index of the thread invoking this method
+     */
+    virtual void execute(ThreadPool& pool, int threadIndex) = 0;
+};
+
+} // namespace OpenMM
+
+#endif // OPENMM_THREAD_POOL_H_
--- a/openmmapi/include/openmm/internal/vectorize.h
+++ b/openmmapi/include/openmm/internal/vectorize.h
@@ -212,6 +212,10 @@ static inline float dot4(fvec4 v1, fvec4 v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
 }

+static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
+    _MM_TRANSPOSE4_PS(v1, v2, v3, v4);
+}
+
 // Functions that operate on ivec4s.

 static inline ivec4 min(ivec4 v1, ivec4 v2) {

--- a/openmmapi/src/ThreadPool.cpp
+++ b/openmmapi/src/ThreadPool.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/internal/ThreadPool.h"
+#include "openmm/internal/hardware.h"
+
+using namespace std;
+
+namespace OpenMM {
+
+class ThreadPool::ThreadData {
+public:
+    ThreadData(ThreadPool& owner, int index) : owner(owner), index(index), isDeleted(false) {
+    }
+    ThreadPool& owner;
+    int index;
+    bool isDeleted;
+    Task* currentTask;
+};
+
+static void* threadBody(void* args) {
+    ThreadPool::ThreadData& data = *reinterpret_cast<ThreadPool::ThreadData*>(args);
+    while (true) {
+        // Wait for the signal to start running.
+        
+        data.owner.syncThreads();
+        if (data.isDeleted)
+            break;
+        data.currentTask->execute(data.owner, data.index);
+    }
+    delete &data;
+    return 0;
+}
+
+ThreadPool::ThreadPool() {
+    numThreads = getNumProcessors();
+    pthread_cond_init(&startCondition, NULL);
+    pthread_cond_init(&endCondition, NULL);
+    pthread_mutex_init(&lock, NULL);
+    thread.resize(numThreads);
+    pthread_mutex_lock(&lock);
+    waitCount = 0;
+    for (int i = 0; i < numThreads; i++) {
+        ThreadData* data = new ThreadData(*this, i);
+        data->isDeleted = false;
+        threadData.push_back(data);
+        pthread_create(&thread[i], NULL, threadBody, data);
+    }
+    while (waitCount < numThreads)
+        pthread_cond_wait(&endCondition, &lock);
+    pthread_mutex_unlock(&lock);
+}
+
+ThreadPool::~ThreadPool() {
+    for (int i = 0; i < (int) threadData.size(); i++)
+        threadData[i]->isDeleted = true;
+    pthread_mutex_lock(&lock);
+    pthread_cond_broadcast(&startCondition);
+    pthread_mutex_unlock(&lock);
+    for (int i = 0; i < (int) thread.size(); i++)
+        pthread_join(thread[i], NULL);
+    pthread_mutex_destroy(&lock);
+    pthread_cond_destroy(&startCondition);
+    pthread_cond_destroy(&endCondition);
+}
+
+int ThreadPool::getNumThreads() const {
+    return numThreads;
+}
+
+void ThreadPool::execute(Task& task) {
+    for (int i = 0; i < (int) threadData.size(); i++)
+        threadData[i]->currentTask = &task;
+    resumeThreads();
+}
+
+void ThreadPool::syncThreads() {
+    pthread_mutex_lock(&lock);
+    waitCount++;
+    pthread_cond_signal(&endCondition);
+    pthread_cond_wait(&startCondition, &lock);
+    pthread_mutex_unlock(&lock);
+}
+
+void ThreadPool::waitForThreads() {
+    pthread_mutex_lock(&lock);
+    while (waitCount < numThreads)
+        pthread_cond_wait(&endCondition, &lock);
+    pthread_mutex_unlock(&lock);
+}
+
+void ThreadPool::resumeThreads() {
+    pthread_mutex_lock(&lock);
+    waitCount = 0;
+    pthread_cond_broadcast(&startCondition);
+    pthread_mutex_unlock(&lock);
+}
+
+} // namespace OpenMM
--- a/platforms/cpu/include/CpuKernels.h
+++ b/platforms/cpu/include/CpuKernels.h
@@ -37,6 +37,7 @@
 #include "CpuNonbondedForce.h"
 #include "openmm/kernels.h"
 #include "openmm/System.h"
+#include "openmm/internal/ThreadPool.h"

 namespace OpenMM {

@@ -90,6 +91,7 @@ private:
    NonbondedMethod nonbondedMethod;
    CpuNeighborList neighborList;
    CpuNonbondedForce nonbonded;
+    ThreadPool threads;
    Kernel optimizedPme;
 };


--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
 #ifndef OPENMM_CPU_NEIGHBORLIST_H_
 #define OPENMM_CPU_NEIGHBORLIST_H_

+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
 #include "windowsExportCpu.h"
-#include <pthread.h>
+#include "openmm/internal/ThreadPool.h"
 #include <set>
 #include <utility>
 #include <vector>
@@ -11,13 +42,12 @@ namespace OpenMM {
    
 class OPENMM_EXPORT_CPU CpuNeighborList {
 public:
-    class ThreadData;
+    class ThreadTask;
    class Voxels;
    static const int BlockSize;
    CpuNeighborList();
-    ~CpuNeighborList();
    void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
-            const float* periodicBoxSize, bool usePeriodic, float maxDistance);
+            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
    int getNumBlocks() const;
    const std::vector<int>& getSortedAtoms() const;
    const std::vector<int>& getBlockNeighbors(int blockIndex) const;
@@ -25,25 +55,12 @@ public:
    /**
     * This routine contains the code executed by each thread.
     */
+    void threadComputeNeighborList(ThreadPool& threads, int threadIndex);
    void runThread(int index);
 private:
-    /**
-     * This is called by the worker threads to wait until the master thread instructs them to advance.
-     */
-    void threadWait();
-    /**
-     * This is called by the master thread to instruct all the worker threads to advance.
-     */
-    void advanceThreads();
-    bool isDeleted;
-    int numThreads, waitCount;
    std::vector<int> sortedAtoms;
    std::vector<std::vector<int> > blockNeighbors;
    std::vector<std::vector<char> > blockExclusions;
-    std::vector<pthread_t> thread;
-    std::vector<ThreadData*> threadData;
-    pthread_cond_t startCondition, endCondition;
-    pthread_mutex_t lock;
    // The following variables are used to make information accessible to the individual threads.
    float minx, maxx, miny, maxy, minz, maxz;
    std::vector<std::pair<int, int> > atomBins;
@@ -58,4 +75,4 @@ private:

 } // namespace OpenMM

-#endif // OPENMM_REFERENCE_NEIGHBORLIST_H_
+#endif // OPENMM_CPU_NEIGHBORLIST_H_
--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -27,8 +27,8 @@

 #include "CpuNeighborList.h"
 #include "ReferencePairIxn.h"
+#include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
-#include <pthread.h>
 #include <set>
 #include <utility>
 #include <vector>
@@ -38,7 +38,7 @@ namespace OpenMM {

 class CpuNonbondedForce {
    public:
-        class ThreadData;
+        class ComputeDirectTask;

      /**---------------------------------------------------------------------------------------
      
@@ -48,14 +48,6 @@ class CpuNonbondedForce {

       CpuNonbondedForce();

-      /**---------------------------------------------------------------------------------------
-      
-         Destructor
-      
-         --------------------------------------------------------------------------------------- */
-
-       ~CpuNonbondedForce();
-
      /**---------------------------------------------------------------------------------------
      
         Set the force to use a cutoff.
@@ -145,16 +137,17 @@ class CpuNonbondedForce {
                                 exclusions[atomIndex] contains the list of exclusions for that atom
         @param forces           force array (forces added)
         @param totalEnergy      total energy
+         @param threads          the thread pool to use
      
         --------------------------------------------------------------------------------------- */
          
      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<std::pair<float, float> >& atomParameters,
-            const std::vector<std::set<int> >& exclusions, float* forces, float* totalEnergy);
+            const std::vector<std::set<int> >& exclusions, float* forces, float* totalEnergy, ThreadPool& threads);

    /**
     * This routine contains the code executed by each thread.
     */
-    void runThread(int index, std::vector<float>& threadForce, double& threadEnergy);
+    void threadComputeDirect(ThreadPool& threads, int threadIndex);

 private:
        bool cutoff;
@@ -171,12 +164,8 @@ private:
        int meshDim[3];
        std::vector<float> ewaldScaleTable;
        float ewaldDX, ewaldDXInv;
-        bool isDeleted;
-        int numThreads, waitCount;
-        std::vector<pthread_t> thread;
-        std::vector<ThreadData*> threadData;
-        pthread_cond_t startCondition, endCondition;
-        pthread_mutex_t lock;
+        std::vector<std::vector<float> > threadForce;
+        std::vector<double> threadEnergy;
        // The following variables are used to make information accessible to the individual threads.
        int numberOfAtoms;
        float* posq;
@@ -230,6 +219,12 @@ private:
       */
      void getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;

+      /**
+       * Compute the displacement and squared distance between a collection of points, optionally using
+       * periodic boundary conditions.
+       */
+      void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+
      /**
       * Compute a fast approximation to erfc(x).
       */

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -260,7 +260,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
                }
        }
        if (needRecompute) {
-            neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding);
+            neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding, threads);
            lastPositions = posData;
        }
        nonbonded.setUseCutoff(nonbondedCutoff, neighborList, rfDielectric);
@@ -279,7 +279,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
        nonbonded.setUseSwitchingFunction(switchingDistance);
    float nonbondedEnergy = 0;
    if (includeDirect)
-        nonbonded.calculateDirectIxn(numParticles, &posq[0], particleParams, exclusions, &forces[0], includeEnergy ? &nonbondedEnergy : NULL);
+        nonbonded.calculateDirectIxn(numParticles, &posq[0], particleParams, exclusions, &forces[0], includeEnergy ? &nonbondedEnergy : NULL, threads);
    if (includeReciprocal) {
        if (useOptimizedPme) {
            PmeIO io(&posq[0], &forces[0], numParticles);

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
 #include "CpuNeighborList.h"
 #include "openmm/internal/hardware.h"
 #include "openmm/internal/vectorize.h"
@@ -255,54 +286,21 @@ private:
    vector<vector<vector<pair<float, int> > > > bins;
 };

-class CpuNeighborList::ThreadData {
+class CpuNeighborList::ThreadTask : public ThreadPool::Task {
 public:
-    ThreadData(int index, CpuNeighborList& owner) : index(index), owner(owner) {
+    ThreadTask(CpuNeighborList& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        owner.threadComputeNeighborList(threads, threadIndex);
    }
-    int index;
    CpuNeighborList& owner;
 };

-static void* threadBody(void* args) {
-    CpuNeighborList::ThreadData& data = *reinterpret_cast<CpuNeighborList::ThreadData*>(args);
-    data.owner.runThread(data.index);
-    delete &data;
-    return 0;
-}
-
 CpuNeighborList::CpuNeighborList() {
-    isDeleted = false;
-    numThreads = getNumProcessors();
-    pthread_cond_init(&startCondition, NULL);
-    pthread_cond_init(&endCondition, NULL);
-    pthread_mutex_init(&lock, NULL);
-    thread.resize(numThreads);
-    pthread_mutex_lock(&lock);
-    waitCount = 0;
-    for (int i = 0; i < numThreads; i++) {
-        ThreadData* data = new ThreadData(i, *this);
-        threadData.push_back(data);
-        pthread_create(&thread[i], NULL, threadBody, data);
-    }
-    while (waitCount < numThreads)
-        pthread_cond_wait(&endCondition, &lock);
-    pthread_mutex_unlock(&lock);
-}
-
-CpuNeighborList::~CpuNeighborList() {
-    isDeleted = true;
-    pthread_mutex_lock(&lock);
-    pthread_cond_broadcast(&startCondition);
-    pthread_mutex_unlock(&lock);
-    for (int i = 0; i < (int) thread.size(); i++)
-        pthread_join(thread[i], NULL);
-    pthread_mutex_destroy(&lock);
-    pthread_cond_destroy(&startCondition);
-    pthread_cond_destroy(&endCondition);
 }

 void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions,
-            const float* periodicBoxSize, bool usePeriodic, float maxDistance) {
+            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
    int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
    blockNeighbors.resize(numBlocks);
    blockExclusions.resize(numBlocks);
@@ -336,8 +334,9 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
    // Sort the atoms based on a Hilbert curve.
    
    atomBins.resize(numAtoms);
-    pthread_mutex_lock(&lock);
-    advanceThreads();
+    ThreadTask task(*this);
+    threads.execute(task);
+    threads.waitForThreads();
    sort(atomBins.begin(), atomBins.end());

    // Build the voxel hash.
@@ -360,8 +359,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
    
    // Signal the threads to start running and wait for them to finish.
    
-    advanceThreads();
-    pthread_mutex_unlock(&lock);
+    threads.resumeThreads();
+    threads.waitForThreads();
    
    // Add padding atoms to fill up the last block.
    
@@ -393,82 +392,55 @@ const std::vector<char>& CpuNeighborList::getBlockExclusions(int blockIndex) con
    
 }

-void CpuNeighborList::runThread(int index) {
-    while (true) {
-        // Wait for the signal to start running.
-        
-        threadWait();
-        if (isDeleted)
-            break;
-        
-        // Compute the positions of atoms along the Hilbert curve.
+void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadIndex) {
+    // Compute the positions of atoms along the Hilbert curve.

-        float binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0f;
-        float invBinWidth = 1.0f/binWidth;
-        bitmask_t coords[3];
-        for (int i = index; i < numAtoms; i += numThreads) {
-            const float* pos = &atomLocations[4*i];
-            coords[0] = (bitmask_t) ((pos[0]-minx)*invBinWidth);
-            coords[1] = (bitmask_t) ((pos[1]-miny)*invBinWidth);
-            coords[2] = (bitmask_t) ((pos[2]-minz)*invBinWidth);
-            int bin = (int) hilbert_c2i(3, 8, coords);
-            atomBins[i] = pair<int, int>(bin, i);
-        }
-        threadWait();
-        
-        // Compute this thread's subset of neighbors.
-        
-        int numBlocks = blockNeighbors.size();
-        vector<int> blockAtoms;
-        for (int i = index; i < numBlocks; i += numThreads) {
-            {
-            int firstIndex = BlockSize*i;
-            int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
-            blockAtoms.resize(atomsInBlock);
-            for (int j = 0; j < atomsInBlock; j++)
-                blockAtoms[j] = sortedAtoms[firstIndex+j];
-            }
+    float binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0f;
+    float invBinWidth = 1.0f/binWidth;
+    bitmask_t coords[3];
+    int numThreads = threads.getNumThreads();
+    for (int i = threadIndex; i < numAtoms; i += numThreads) {
+        const float* pos = &atomLocations[4*i];
+        coords[0] = (bitmask_t) ((pos[0]-minx)*invBinWidth);
+        coords[1] = (bitmask_t) ((pos[1]-miny)*invBinWidth);
+        coords[2] = (bitmask_t) ((pos[2]-minz)*invBinWidth);
+        int bin = (int) hilbert_c2i(3, 8, coords);
+        atomBins[i] = pair<int, int>(bin, i);
+    }
+    threads.syncThreads();

-                        
-            int firstIndex = BlockSize*i;
-            fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
-            fvec4 maxPos = minPos;
-            int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
-            for (int j = 1; j < atomsInBlock; j++) {
-                fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]);
-                minPos = min(minPos, pos);
-                maxPos = max(maxPos, pos);
-            }
-            voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations);
-            
-            // Record the exclusions for this block.
-            
-            for (int j = 0; j < atomsInBlock; j++) {
-                const set<int>& atomExclusions = (*exclusions)[sortedAtoms[firstIndex+j]];
-                char mask = 1<<j;
-                for (int k = 0; k < (int) blockNeighbors[i].size(); k++) {
-                    int atomIndex = blockNeighbors[i][k];
-                    if (atomExclusions.find(atomIndex) != atomExclusions.end())
-                        blockExclusions[i][k] |= mask;
-                }
-            }
+    // Compute this thread's subset of neighbors.
+
+    int numBlocks = blockNeighbors.size();
+    vector<int> blockAtoms;
+    for (int i = threadIndex; i < numBlocks; i += numThreads) {
+        // Find the atoms in this block and compute their bounding box.
+        
+        int firstIndex = BlockSize*i;
+        int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
+        blockAtoms.resize(atomsInBlock);
+        for (int j = 0; j < atomsInBlock; j++)
+            blockAtoms[j] = sortedAtoms[firstIndex+j];
+        fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
+        fvec4 maxPos = minPos;
+        for (int j = 1; j < atomsInBlock; j++) {
+            fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]);
+            minPos = min(minPos, pos);
+            maxPos = max(maxPos, pos);
        }
-    }
-}
+        voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations);

-void CpuNeighborList::threadWait() {
-    pthread_mutex_lock(&lock);
-    waitCount++;
-    pthread_cond_signal(&endCondition);
-    pthread_cond_wait(&startCondition, &lock);
-    pthread_mutex_unlock(&lock);
-}
+        // Record the exclusions for this block.

-void CpuNeighborList::advanceThreads() {
-    waitCount = 0;
-    pthread_cond_broadcast(&startCondition);
-    while (waitCount < numThreads) {
-        pthread_cond_wait(&endCondition, &lock);
+        for (int j = 0; j < atomsInBlock; j++) {
+            const set<int>& atomExclusions = (*exclusions)[sortedAtoms[firstIndex+j]];
+            char mask = 1<<j;
+            for (int k = 0; k < (int) blockNeighbors[i].size(); k++) {
+                int atomIndex = blockNeighbors[i][k];
+                if (atomExclusions.find(atomIndex) != atomExclusions.end())
+                    blockExclusions[i][k] |= mask;
+            }
+        }
    }
 }


--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -30,7 +30,6 @@
 #include "CpuNonbondedForce.h"
 #include "ReferenceForce.h"
 #include "ReferencePME.h"
-#include "openmm/internal/hardware.h"
 #include "openmm/internal/SplineFitter.h"
 #include "openmm/internal/vectorize.h"

@@ -44,23 +43,16 @@ using namespace OpenMM;
 const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
 const int CpuNonbondedForce::NUM_TABLE_POINTS = 1025;

-class CpuNonbondedForce::ThreadData {
+class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
 public:
-    ThreadData(int index, CpuNonbondedForce& owner) : index(index), owner(owner) {
+    ComputeDirectTask(CpuNonbondedForce& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        owner.threadComputeDirect(threads, threadIndex);
    }
-    int index;
    CpuNonbondedForce& owner;
-    vector<float> threadForce;
-    double threadEnergy;
 };

-static void* threadBody(void* args) {
-    CpuNonbondedForce::ThreadData& data = *reinterpret_cast<CpuNonbondedForce::ThreadData*>(args);
-    data.owner.runThread(data.index, data.threadForce, data.threadEnergy);
-    delete &data;
-    return 0;
-}
-
 /**---------------------------------------------------------------------------------------

   CpuNonbondedForce constructor
@@ -68,40 +60,6 @@ static void* threadBody(void* args) {
   --------------------------------------------------------------------------------------- */

 CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false) {
-    isDeleted = false;
-    numThreads = getNumProcessors();
-    pthread_cond_init(&startCondition, NULL);
-    pthread_cond_init(&endCondition, NULL);
-    pthread_mutex_init(&lock, NULL);
-    thread.resize(numThreads);
-    pthread_mutex_lock(&lock);
-    waitCount = 0;
-    for (int i = 0; i < numThreads; i++) {
-        ThreadData* data = new ThreadData(i, *this);
-        threadData.push_back(data);
-        pthread_create(&thread[i], NULL, threadBody, data);
-    }
-    while (waitCount < numThreads)
-        pthread_cond_wait(&endCondition, &lock);
-    pthread_mutex_unlock(&lock);
-}
-
-/**---------------------------------------------------------------------------------------
-
-   CpuNonbondedForce destructor
-
-   --------------------------------------------------------------------------------------- */
-
-CpuNonbondedForce::~CpuNonbondedForce(){
-    isDeleted = true;
-    pthread_mutex_lock(&lock);
-    pthread_cond_broadcast(&startCondition);
-    pthread_mutex_unlock(&lock);
-    for (int i = 0; i < (int) thread.size(); i++)
-        pthread_join(thread[i], NULL);
-    pthread_mutex_destroy(&lock);
-    pthread_cond_destroy(&startCondition);
-    pthread_cond_destroy(&endCondition);
 }

  /**---------------------------------------------------------------------------------------
@@ -334,7 +292,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, v


 void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<pair<float, float> >& atomParameters,
-                const vector<set<int> >& exclusions, float* forces, float* totalEnergy) {
+                const vector<set<int> >& exclusions, float* forces, float* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    
    this->numberOfAtoms = numberOfAtoms;
@@ -342,25 +300,25 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    this->atomParameters = &atomParameters[0];
    this->exclusions = &exclusions[0];
    includeEnergy = (totalEnergy != NULL);
+    threadEnergy.resize(threads.getNumThreads());
+    threadForce.resize(threads.getNumThreads());
    
    // Signal the threads to start running and wait for them to finish.
    
-    pthread_mutex_lock(&lock);
-    waitCount = 0;
-    pthread_cond_broadcast(&startCondition);
-    while (waitCount < numThreads)
-        pthread_cond_wait(&endCondition, &lock);
-    pthread_mutex_unlock(&lock);
+    ComputeDirectTask task(*this);
+    threads.execute(task);
+    threads.waitForThreads();
    
    // Combine the results from all the threads.
    
    double directEnergy = 0;
+    int numThreads = threads.getNumThreads();
    for (int i = 0; i < numThreads; i++)
-        directEnergy += threadData[i]->threadEnergy;
+        directEnergy += threadEnergy[i];
    for (int i = 0; i < numberOfAtoms; i++) {
        fvec4 f(forces+4*i);
        for (int j = 0; j < numThreads; j++)
-            f += fvec4(&threadData[j]->threadForce[4*i]);
+            f += fvec4(&threadForce[j][4*i]);
        f.store(forces+4*i);
    }

@@ -368,76 +326,65 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
        *totalEnergy += (float) directEnergy;
 }

+void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex) {
+    // Compute this thread's subset of interactions.
+
+    int numThreads = threads.getNumThreads();
+    threadEnergy[threadIndex] = 0;
+    double* energyPtr = (includeEnergy ? &threadEnergy[threadIndex] : NULL);
+    threadForce[threadIndex].resize(4*numberOfAtoms, 0.0f);
+    float* forces = &threadForce[threadIndex][0];
+    for (int i = 0; i < 4*numberOfAtoms; i++)
+        forces[i] = 0.0f;
+    fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
+    fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
+    if (ewald || pme) {
+        // Compute the interactions from the neighbor list.
+
+        for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads)
+            calculateBlockEwaldIxn(i, forces, energyPtr, boxSize, invBoxSize);
+
+        // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.

-void CpuNonbondedForce::runThread(int index, vector<float>& threadForce, double& threadEnergy) {
-    while (true) {
-        // Wait for the signal to start running.
-        
-        pthread_mutex_lock(&lock);
-        waitCount++;
-        pthread_cond_signal(&endCondition);
-        pthread_cond_wait(&startCondition, &lock);
-        pthread_mutex_unlock(&lock);
-        if (isDeleted)
-            break;
-        
-        // Compute this thread's subset of interactions.
-        
-        threadEnergy = 0;
-        double* energyPtr = (includeEnergy ? &threadEnergy : NULL);
-        threadForce.resize(4*numberOfAtoms, 0.0f);
-        for (int i = 0; i < 4*numberOfAtoms; i++)
-            threadForce[i] = 0.0f;
        fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
        fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
-        if (ewald || pme) {
-            // Compute the interactions from the neighbor list.
-
-            for (int i = index; i < neighborList->getNumBlocks(); i += numThreads)
-                calculateBlockEwaldIxn(i, &threadForce[0], energyPtr, boxSize, invBoxSize);
-            
-            // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
-
-            fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
-            fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
-            for (int i = index; i < numberOfAtoms; i += numThreads)
-                for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
-                    if (*iter > i) {
-                        int j = *iter;
-                        fvec4 deltaR;
-                        fvec4 posI(posq+4*i);
-                        fvec4 posJ(posq+4*j);
-                        float r2;
-                        getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
-                        float r = sqrtf(r2);
-                        float inverseR = 1/r;
-                        float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
-                        float alphaR = alphaEwald*r;
-                        float erfcAlphaR = erfcApprox(alphaR)[0];
-                        float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
-                        dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
-                        fvec4 result = deltaR*dEdR;
-                        (fvec4(&threadForce[4*i])-result).store(&threadForce[4*i]);
-                        (fvec4(&threadForce[4*j])+result).store(&threadForce[4*j]);
-                        if (includeEnergy)
-                            threadEnergy -= chargeProd*inverseR*(1.0f-erfcAlphaR);
-                    }
+        for (int i = threadIndex; i < numberOfAtoms; i += numThreads)
+            for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
+                if (*iter > i) {
+                    int j = *iter;
+                    fvec4 deltaR;
+                    fvec4 posI(posq+4*i);
+                    fvec4 posJ(posq+4*j);
+                    float r2;
+                    getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
+                    float r = sqrtf(r2);
+                    float inverseR = 1/r;
+                    float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
+                    float alphaR = alphaEwald*r;
+                    float erfcAlphaR = erfcApprox(alphaR)[0];
+                    float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
+                    dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
+                    fvec4 result = deltaR*dEdR;
+                    (fvec4(forces+4*i)-result).store(forces+4*i);
+                    (fvec4(forces+4*j)+result).store(forces+4*j);
+                    if (includeEnergy)
+                        threadEnergy[threadIndex] -= chargeProd*inverseR*(1.0f-erfcAlphaR);
                }
-        }
-        else if (cutoff) {
-            // Compute the interactions from the neighbor list.
+            }
+    }
+    else if (cutoff) {
+        // Compute the interactions from the neighbor list.

-            for (int i = index; i < neighborList->getNumBlocks(); i += numThreads)
-                calculateBlockIxn(i, &threadForce[0], energyPtr, boxSize, invBoxSize);
-        }
-        else {
-            // Loop over all atom pairs
+        for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads)
+            calculateBlockIxn(i, forces, energyPtr, boxSize, invBoxSize);
+    }
+    else {
+        // Loop over all atom pairs

-            for (int i = index; i < numberOfAtoms; i += numThreads){
-                for (int j = i+1; j < numberOfAtoms; j++)
-                    if (exclusions[j].find(i) == exclusions[j].end())
-                        calculateOneIxn(i, j, &threadForce[0], energyPtr, boxSize, invBoxSize);
-            }
+        for (int i = threadIndex; i < numberOfAtoms; i += numThreads){
+            for (int j = i+1; j < numberOfAtoms; j++)
+                if (exclusions[j].find(i) == exclusions[j].end())
+                    calculateOneIxn(i, j, forces, energyPtr, boxSize, invBoxSize);
        }
    }
 }
@@ -507,6 +454,9 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
        blockAtomForce[i] = fvec4(0.0f);
    }
+    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
+    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
+    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
@@ -515,9 +465,7 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
    
    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    float blockAtomR2[4];
    bool include[4];
-    fvec4 blockAtomDelta[4];
    for (int i = 0; i < (int) neighbors.size(); i++) {
        // Load the next neighbor.
        
@@ -527,9 +475,10 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
        // Compute the distances to the block atoms.
        
        bool any = false;
+        fvec4 dx, dy, dz, r2;
+        getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, periodic, boxSize, invBoxSize);
        for (int j = 0; j < 4; j++) {
-            getDeltaR(atomPosq, blockAtomPosq[j], blockAtomDelta[j], blockAtomR2[j], periodic, boxSize, invBoxSize);
-            include[j] = (((exclusions[i]>>j)&1) == 0 && (!cutoff || blockAtomR2[j] < cutoffDistance*cutoffDistance));
+            include[j] = (((exclusions[i]>>j)&1) == 0 && (!cutoff || r2[j] < cutoffDistance*cutoffDistance));
            any |= include[j];
        }
        if (!any)
@@ -537,7 +486,6 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
        
        // Compute the interactions.
        
-        fvec4 r2(blockAtomR2);
        fvec4 r = sqrt(r2);
        fvec4 inverseR = fvec4(1.0f)/r;
        fvec4 switchValue(1.0f), switchDeriv(0.0f);
@@ -578,12 +526,13 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*

        // Accumulate forces.

+        fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
+        transpose(result[0], result[1], result[2], result[3]);
        fvec4 atomForce(forces+4*atom);
        for (int j = 0; j < 4; j++) {
            if (include[j]) {
-                fvec4 result = blockAtomDelta[j]*dEdR[j];
-                blockAtomForce[j] += result;
-                atomForce -= result;
+                blockAtomForce[j] += result[j];
+                atomForce -= result[j];
            }
        }
        atomForce.store(forces+4*atom);
@@ -606,6 +555,9 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
        blockAtomForce[i] = fvec4(0.0f);
    }
+    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
+    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
+    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
@@ -614,9 +566,7 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
    
    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    float blockAtomR2[4];
    bool include[4];
-    fvec4 blockAtomDelta[4];
    for (int i = 0; i < (int) neighbors.size(); i++) {
        // Load the next neighbor.
        
@@ -626,9 +576,10 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
        // Compute the distances to the block atoms.
        
        bool any = false;
+        fvec4 dx, dy, dz, r2;
+        getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, periodic, boxSize, invBoxSize);
        for (int j = 0; j < 4; j++) {
-            getDeltaR(atomPosq, blockAtomPosq[j], blockAtomDelta[j], blockAtomR2[j], periodic, boxSize, invBoxSize);
-            include[j] = (((exclusions[i]>>j)&1) == 0 && blockAtomR2[j] < cutoffDistance*cutoffDistance);
+            include[j] = (((exclusions[i]>>j)&1) == 0 && r2[j] < cutoffDistance*cutoffDistance);
            any |= include[j];
        }
        if (!any)
@@ -636,7 +587,6 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
        
        // Compute the interactions.
        
-        fvec4 r2(blockAtomR2);
        fvec4 r = sqrt(r2);
        fvec4 inverseR = fvec4(1.0f)/r;
        fvec4 switchValue(1.0f), switchDeriv(0.0f);
@@ -671,12 +621,13 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do

        // Accumulate forces.

+        fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
+        transpose(result[0], result[1], result[2], result[3]);
        fvec4 atomForce(forces+4*atom);
        for (int j = 0; j < 4; j++) {
            if (include[j]) {
-                fvec4 result = blockAtomDelta[j]*dEdR[j];
-                blockAtomForce[j] += result;
-                atomForce -= result;
+                blockAtomForce[j] += result[j];
+                atomForce -= result[j];
            }
        }
        atomForce.store(forces+4*atom);
@@ -697,6 +648,18 @@ void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& d
    r2 = dot3(deltaR, deltaR);
 }

+void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+    dx = x-posI[0];
+    dy = y-posI[1];
+    dz = z-posI[2];
+    if (periodic) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
+    }
+    r2 = dx*dx + dy*dy + dz*dz;
+}
+
 fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
@@ -720,7 +683,7 @@ fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
    coeff[0] = 1.0f-coeff[1];
    coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
    coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
-    _MM_TRANSPOSE4_PS(coeff[0], coeff[1], coeff[2], coeff[3]);
+    transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
    for (int i = 0; i < 4; i++) {
        if (index[i] < NUM_TABLE_POINTS)
            y[i] = dot4(coeff[i], fvec4(&ewaldScaleTable[4*index[i]]));

--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -34,6 +34,7 @@
 */

 #include "openmm/internal/AssertionUtilities.h"
+#include "openmm/internal/ThreadPool.h"
 #include "CpuNeighborList.h"
 #include "CpuPlatform.h"
 #include "sfmt/SFMT.h"
@@ -63,8 +64,9 @@ void testNeighborList(bool periodic) {
            exclusions[i-j].insert(i);
        }
    }
+    ThreadPool threads;
    CpuNeighborList neighborList;
-    neighborList.computeNeighborList(numParticles, positions, exclusions, boxSize, periodic, cutoff);
+    neighborList.computeNeighborList(numParticles, positions, exclusions, boxSize, periodic, cutoff, threads);
    
    // Convert the neighbor list to a set for faster lookup.