Merge branch 'master' of github.com:SimTk/openmm

59854c5e · leeping · 8167c79b · 8e11ed9c · 59854c5e · 59854c5e
Commit 59854c5e authored Oct 30, 2013 by leeping
19 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -296,9 +296,9 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
 # On Linux need to link to libdl
 FIND_LIBRARY(DL_LIBRARY dl)
 IF(DL_LIBRARY)
-  TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${DL_LIBRARY})
+  TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${DL_LIBRARY} ${PTHREADS_LIB})
  IF(OPENMM_BUILD_STATIC_LIB)
-    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${DL_LIBRARY})
+    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${DL_LIBRARY} ${PTHREADS_LIB})
  ENDIF(OPENMM_BUILD_STATIC_LIB)
 ENDIF(DL_LIBRARY)
 IF(WIN32)

--- a/openmmapi/include/openmm/internal/ThreadPool.h
+++ b/openmmapi/include/openmm/internal/ThreadPool.h
+#ifndef OPENMM_THREAD_POOL_H_
+#define OPENMM_THREAD_POOL_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "windowsExport.h"
+#include <pthread.h>
+#include <vector>
+namespace OpenMM {
+/**
+ * A ThreadPool creates a set of worker threads that can be used to execute tasks in parallel.
+ * After creating a ThreadPool, call execute() to start a task running then waitForThreads()
+ * to block until all threads have finished.  You also can synchronize the threads in the middle
+ * of the task by having them call syncThreads().  In this case, the parent thread should call
+ * waitForThreads() an additional time; each call waits until all worker threads have reached the
+ * next syncThreads(), and the final call waits until they exit from the Task's execute() method.
+ * After calling waitForThreads() to block at a synchronization point, the parent thread should
+ * call resumeThreads() to instruct the worker threads to resume.
+ */
+class OPENMM_EXPORT ThreadPool {
+public:
+    class Task;
+    class ThreadData;
+    ThreadPool();
+    ~ThreadPool();
+    /**
+     * Get the number of worker threads in the pool.
+     */
+    int getNumThreads() const;
+    /**
+     * Execute a Task in parallel on the worker threads.
+     */
+    void execute(Task& task);
+    /**
+     * This is called by the worker threads to block until all threads have reached the same point
+     * and the master thread instructs them to continue by calling resumeThreads().
+     */
+    void syncThreads();
+    /**
+     * This is called by the master thread to wait until all threads have completed the Task.  Alternatively,
+     * if the threads call syncThreads(), this blocks until all threads have reached the synchronization point.
+     */
+    void waitForThreads();
+    /**
+     * Instruct the threads to resume running after blocking at a synchronization point.
+     */
+    void resumeThreads();
+private:
+    bool isDeleted;
+    int numThreads, waitCount;
+    std::vector<pthread_t> thread;
+    std::vector<ThreadData*> threadData;
+    pthread_cond_t startCondition, endCondition;
+    pthread_mutex_t lock;
+};
+/**
+ * This defines a task that can be executed in parallel by the worker threads.
+ */
+class OPENMM_EXPORT ThreadPool::Task {
+public:
+    /**
+     * Execute the task on each thread.
+     * 
+     * @param pool         the ThreadPool being used to execute the task
+     * @param threadIndex  the index of the thread invoking this method
+     */
+    virtual void execute(ThreadPool& pool, int threadIndex) = 0;
+};
+} // namespace OpenMM
+#endif // OPENMM_THREAD_POOL_H_
--- a/openmmapi/include/openmm/internal/vectorize.h
+++ b/openmmapi/include/openmm/internal/vectorize.h
@@ -212,6 +212,10 @@ static inline float dot4(fvec4 v1, fvec4 v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
 }
+static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
+    _MM_TRANSPOSE4_PS(v1, v2, v3, v4);
+}
 // Functions that operate on ivec4s.
 static inline ivec4 min(ivec4 v1, ivec4 v2) {

--- a/openmmapi/src/ThreadPool.cpp
+++ b/openmmapi/src/ThreadPool.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "openmm/internal/ThreadPool.h"
+#include "openmm/internal/hardware.h"
+using namespace std;
+namespace OpenMM {
+class ThreadPool::ThreadData {
+public:
+    ThreadData(ThreadPool& owner, int index) : owner(owner), index(index), isDeleted(false) {
+    }
+    ThreadPool& owner;
+    int index;
+    bool isDeleted;
+    Task* currentTask;
+};
+static void* threadBody(void* args) {
+    ThreadPool::ThreadData& data = *reinterpret_cast<ThreadPool::ThreadData*>(args);
+    while (true) {
+        // Wait for the signal to start running.
+        data.owner.syncThreads();
+        if (data.isDeleted)
+            break;
+        data.currentTask->execute(data.owner, data.index);
+    }
+    delete &data;
+    return 0;
+}
+ThreadPool::ThreadPool() {
+    numThreads = getNumProcessors();
+    pthread_cond_init(&startCondition, NULL);
+    pthread_cond_init(&endCondition, NULL);
+    pthread_mutex_init(&lock, NULL);
+    thread.resize(numThreads);
+    pthread_mutex_lock(&lock);
+    waitCount = 0;
+    for (int i = 0; i < numThreads; i++) {
+        ThreadData* data = new ThreadData(*this, i);
+        data->isDeleted = false;
+        threadData.push_back(data);
+        pthread_create(&thread[i], NULL, threadBody, data);
+    }
+    while (waitCount < numThreads)
+        pthread_cond_wait(&endCondition, &lock);
+    pthread_mutex_unlock(&lock);
+}
+ThreadPool::~ThreadPool() {
+    for (int i = 0; i < (int) threadData.size(); i++)
+        threadData[i]->isDeleted = true;
+    pthread_mutex_lock(&lock);
+    pthread_cond_broadcast(&startCondition);
+    pthread_mutex_unlock(&lock);
+    for (int i = 0; i < (int) thread.size(); i++)
+        pthread_join(thread[i], NULL);
+    pthread_mutex_destroy(&lock);
+    pthread_cond_destroy(&startCondition);
+    pthread_cond_destroy(&endCondition);
+}
+int ThreadPool::getNumThreads() const {
+    return numThreads;
+}
+void ThreadPool::execute(Task& task) {
+    for (int i = 0; i < (int) threadData.size(); i++)
+        threadData[i]->currentTask = &task;
+    resumeThreads();
+}
+void ThreadPool::syncThreads() {
+    pthread_mutex_lock(&lock);
+    waitCount++;
+    pthread_cond_signal(&endCondition);
+    pthread_cond_wait(&startCondition, &lock);
+    pthread_mutex_unlock(&lock);
+}
+void ThreadPool::waitForThreads() {
+    pthread_mutex_lock(&lock);
+    while (waitCount < numThreads)
+        pthread_cond_wait(&endCondition, &lock);
+    pthread_mutex_unlock(&lock);
+}
+void ThreadPool::resumeThreads() {
+    pthread_mutex_lock(&lock);
+    waitCount = 0;
+    pthread_cond_broadcast(&startCondition);
+    pthread_mutex_unlock(&lock);
+}
+} // namespace OpenMM
--- a/platforms/cpu/include/CpuKernels.h
+++ b/platforms/cpu/include/CpuKernels.h
@@ -37,6 +37,7 @@
 #include "CpuNonbondedForce.h"
 #include "openmm/kernels.h"
 #include "openmm/System.h"
+#include "openmm/internal/ThreadPool.h"
 namespace OpenMM {
@@ -90,6 +91,7 @@ private:
    NonbondedMethod nonbondedMethod;
    CpuNeighborList neighborList;
    CpuNonbondedForce nonbonded;
+    ThreadPool threads;
    Kernel optimizedPme;
 };

--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
 #ifndef OPENMM_CPU_NEIGHBORLIST_H_
 #define OPENMM_CPU_NEIGHBORLIST_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
 #include "windowsExportCpu.h"
-#include <pthread.h>
+#include "openmm/internal/ThreadPool.h"
 #include <set>
 #include <utility>
 #include <vector>
@@ -11,13 +42,12 @@ namespace OpenMM {
 class OPENMM_EXPORT_CPU CpuNeighborList {
 public:
-    class ThreadData;
+    class ThreadTask;
    class Voxels;
    static const int BlockSize;
    CpuNeighborList();
-    ~CpuNeighborList();
    void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
-            const float* periodicBoxSize, bool usePeriodic, float maxDistance);
+            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
    int getNumBlocks() const;
    const std::vector<int>& getSortedAtoms() const;
    const std::vector<int>& getBlockNeighbors(int blockIndex) const;
@@ -25,25 +55,12 @@ public:
    /**
     * This routine contains the code executed by each thread.
     */
+    void threadComputeNeighborList(ThreadPool& threads, int threadIndex);
    void runThread(int index);
 private:
-    /**
-     * This is called by the worker threads to wait until the master thread instructs them to advance.
-     */
-    void threadWait();
-    /**
-     * This is called by the master thread to instruct all the worker threads to advance.
-     */
-    void advanceThreads();
-    bool isDeleted;
-    int numThreads, waitCount;
    std::vector<int> sortedAtoms;
    std::vector<std::vector<int> > blockNeighbors;
    std::vector<std::vector<char> > blockExclusions;
-    std::vector<pthread_t> thread;
-    std::vector<ThreadData*> threadData;
-    pthread_cond_t startCondition, endCondition;
-    pthread_mutex_t lock;
    // The following variables are used to make information accessible to the individual threads.
    float minx, maxx, miny, maxy, minz, maxz;
    std::vector<std::pair<int, int> > atomBins;
@@ -58,4 +75,4 @@ private:
 } // namespace OpenMM
-#endif // OPENMM_REFERENCE_NEIGHBORLIST_H_
+#endif // OPENMM_CPU_NEIGHBORLIST_H_
--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -27,8 +27,8 @@
 #include "CpuNeighborList.h"
 #include "ReferencePairIxn.h"
+#include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
-#include <pthread.h>
 #include <set>
 #include <utility>
 #include <vector>
@@ -38,7 +38,7 @@ namespace OpenMM {
 class CpuNonbondedForce {
    public:
-        class ThreadData;
+        class ComputeDirectTask;
      /**---------------------------------------------------------------------------------------
@@ -48,14 +48,6 @@ class CpuNonbondedForce {
       CpuNonbondedForce();
-      /**---------------------------------------------------------------------------------------
-         Destructor
-         --------------------------------------------------------------------------------------- */
-       ~CpuNonbondedForce();
      /**---------------------------------------------------------------------------------------
         Set the force to use a cutoff.
@@ -130,7 +122,7 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */
-      void calculateReciprocalIxn(int numberOfAtoms, float* posq, std::vector<RealVec>& atomCoordinates,
+      void calculateReciprocalIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates,
                            const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
                            std::vector<RealVec>& forces, float* totalEnergy) const;
@@ -140,21 +132,23 @@ class CpuNonbondedForce {
         @param numberOfAtoms    number of atoms
         @param posq             atom coordinates and charges
+         @param atomCoordinates  atom coordinates (periodic boundary conditions not applied)
         @param atomParameters   atom parameters (sigma/2, 2*sqrt(epsilon))
         @param exclusions       atom exclusion indices
                                 exclusions[atomIndex] contains the list of exclusions for that atom
         @param forces           force array (forces added)
         @param totalEnergy      total energy
+         @param threads          the thread pool to use
         --------------------------------------------------------------------------------------- */
-      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<std::pair<float, float> >& atomParameters,
+      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
-            const std::vector<std::set<int> >& exclusions, float* forces, float* totalEnergy);
+            const std::vector<std::set<int> >& exclusions, float* forces, float* totalEnergy, ThreadPool& threads);
    /**
     * This routine contains the code executed by each thread.
     */
-    void runThread(int index, std::vector<float>& threadForce, double& threadEnergy);
+    void threadComputeDirect(ThreadPool& threads, int threadIndex);
 private:
        bool cutoff;
@@ -171,15 +165,12 @@ private:
        int meshDim[3];
        std::vector<float> ewaldScaleTable;
        float ewaldDX, ewaldDXInv;
-        bool isDeleted;
+        std::vector<std::vector<float> > threadForce;
-        int numThreads, waitCount;
+        std::vector<double> threadEnergy;
-        std::vector<pthread_t> thread;
-        std::vector<ThreadData*> threadData;
-        pthread_cond_t startCondition, endCondition;
-        pthread_mutex_t lock;
        // The following variables are used to make information accessible to the individual threads.
        int numberOfAtoms;
        float* posq;
+        RealVec const* atomCoordinates;
        std::pair<float, float> const* atomParameters;        
        std::set<int> const* exclusions;
        bool includeEnergy;
@@ -230,6 +221,12 @@ private:
       */
      void getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+      /**
+       * Compute the displacement and squared distance between a collection of points, optionally using
+       * periodic boundary conditions.
+       */
+      void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
      /**
       * Compute a fast approximation to erfc(x).
       */

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -203,11 +203,11 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
    // Convert the positions to single precision.
-    if (periodic)
+    if (periodic || ewald || pme)
        for (int i = 0; i < numParticles; i++)
            for (int j = 0; j < 3; j++) {
                RealOpenMM x = posData[i][j];
-                double base = floor(x/boxSize[j]+0.5)*boxSize[j];
+                double base = floor(x/boxSize[j])*boxSize[j];
                posq[4*i+j] = (float) (x-base);
            }
    else
@@ -244,6 +244,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            int numMoved = moved.size();
            double cutoff2 = nonbondedCutoff*nonbondedCutoff;
+            double paddedCutoff2 = (nonbondedCutoff+padding)*(nonbondedCutoff+padding);
            for (int i = 1; i < numMoved && !needRecompute; i++)
                for (int j = 0; j < i; j++) {
                    RealVec delta = posData[moved[i]]-posData[moved[j]];
@@ -251,7 +252,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
                        // These particles should interact.  See if they are in the neighbor list.
                        RealVec oldDelta = lastPositions[moved[i]]-lastPositions[moved[j]];
-                        if (oldDelta.dot(oldDelta) > cutoff2) {
+                        if (oldDelta.dot(oldDelta) > paddedCutoff2) {
                            needRecompute = true;
                            break;
                        }
@@ -259,7 +260,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
                }
        }
        if (needRecompute) {
-            neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding);
+            neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding, threads);
            lastPositions = posData;
        }
        nonbonded.setUseCutoff(nonbondedCutoff, neighborList, rfDielectric);
@@ -278,7 +279,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
        nonbonded.setUseSwitchingFunction(switchingDistance);
    float nonbondedEnergy = 0;
    if (includeDirect)
-        nonbonded.calculateDirectIxn(numParticles, &posq[0], particleParams, exclusions, &forces[0], includeEnergy ? &nonbondedEnergy : NULL);
+        nonbonded.calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, &forces[0], includeEnergy ? &nonbondedEnergy : NULL, threads);
    if (includeReciprocal) {
        if (useOptimizedPme) {
            PmeIO io(&posq[0], &forces[0], numParticles);

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
 #include "CpuNeighborList.h"
 #include "openmm/internal/hardware.h"
 #include "openmm/internal/vectorize.h"
@@ -23,8 +54,6 @@ public:
    int y;
 };
-typedef pair<const float*, int> VoxelItem;
 /**
 * This data structure organizes the particles spatially.  It divides them into bins along the x and y axes,
 * then sorts each bin along the z axis so ranges can be identified quickly with a binary search.
@@ -60,7 +89,7 @@ public:
     */
    void insert(const int& atom, const float* location) {
        VoxelIndex voxelIndex = getVoxelIndex(location);
-        bins[voxelIndex.x][voxelIndex.y].push_back(make_pair(location[2], VoxelItem(location, atom)));
+        bins[voxelIndex.x][voxelIndex.y].push_back(make_pair(location[2], atom));
    }
    /**
@@ -76,7 +105,7 @@ public:
     * Find the index of the first particle in voxel (x,y) whose z coordinate in >= the specified value.
     */
    int findLowerBound(int x, int y, double z) const {
-        const vector<pair<float, VoxelItem> >& bin = bins[x][y];
+        const vector<pair<float, int> >& bin = bins[x][y];
        int lower = 0;
        int upper = bin.size();
        while (lower < upper) {
@@ -93,7 +122,7 @@ public:
     * Find the index of the first particle in voxel (x,y) whose z coordinate in greater than the specified value.
     */
    int findUpperBound(int x, int y, double z) const {
-        const vector<pair<float, VoxelItem> >& bin = bins[x][y];
+        const vector<pair<float, int> >& bin = bins[x][y];
        int lower = 0;
        int upper = bin.size();
        while (lower < upper) {
@@ -148,14 +177,12 @@ public:
        if (usePeriodic) {
            endx = min(endx, centerVoxelIndex.x-dIndexX+nx-1);
            endy = min(endy, centerVoxelIndex.y-dIndexY+ny-1);
-            numRanges = 2;
        }
        else {
            startx = max(startx, 0);
            starty = max(starty, 0);
            endx = min(endx, nx-1);
            endy = min(endy, ny-1);
-            numRanges = 1;
        }
        int lastSortedIndex = BlockSize*(blockIndex+1);
        VoxelIndex voxelIndex(0, 0);
@@ -175,10 +202,12 @@ public:
                float dz = maxDistance+blockWidth[2];
                dz = sqrtf(max(0.0f, dz*dz-dx*dx-dy*dy));
+                bool needPeriodic = (voxelIndex.x != x || voxelIndex.y != y || centerPos[2]-dz < 0.0f || centerPos[2]+dz > periodicBoxSize[2]);
                int rangeStart[2];
                int rangeEnd[2];
                rangeStart[0] = findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz);
-                if (usePeriodic) {
+                if (needPeriodic) {
+                    numRanges = 2;
                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
                    if (rangeStart[0] > 0) {
                        rangeStart[1] = 0;
@@ -189,22 +218,24 @@ public:
                        rangeEnd[1] = bins[voxelIndex.x][voxelIndex.y].size();
                    }
                }
-                else
+                else {
+                    numRanges = 1;
                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
+                }
                // Loop over atoms and check to see if they are neighbors of this block.
                for (int range = 0; range < numRanges; range++) {
                    for (int item = rangeStart[range]; item < rangeEnd[range]; item++) {
-                        const int sortedIndex = bins[voxelIndex.x][voxelIndex.y][item].second.second;
+                        const int sortedIndex = bins[voxelIndex.x][voxelIndex.y][item].second;
                        // Avoid duplicate entries.
                        if (sortedIndex >= lastSortedIndex)
                            continue;
-                        fvec4 atomPos(bins[voxelIndex.x][voxelIndex.y][item].second.first);
+                        fvec4 atomPos(atomLocations+4*sortedAtoms[sortedIndex]);
                        fvec4 delta = atomPos-centerPos;
-                        if (usePeriodic) {
+                        if (needPeriodic) {
                            fvec4 base = round(delta*invBoxSize)*boxSize;
                            delta = delta-base;
                        }
@@ -221,7 +252,7 @@ public:
                            for (int k = 0; k < (int) blockAtoms.size(); k++) {
                                fvec4 pos1(&atomLocations[4*blockAtoms[k]]);
                                delta = atomPos-pos1;
-                                if (usePeriodic) {
+                                if (needPeriodic) {
                                    fvec4 base = round(delta*invBoxSize)*boxSize;
                                    delta = delta-base;
                                }
@@ -254,57 +285,24 @@ private:
    int nx, ny;
    const float* periodicBoxSize;
    const bool usePeriodic;
-    vector<vector<vector<pair<float, VoxelItem> > > > bins;
+    vector<vector<vector<pair<float, int> > > > bins;
 };
-class CpuNeighborList::ThreadData {
+class CpuNeighborList::ThreadTask : public ThreadPool::Task {
 public:
-    ThreadData(int index, CpuNeighborList& owner) : index(index), owner(owner) {
+    ThreadTask(CpuNeighborList& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        owner.threadComputeNeighborList(threads, threadIndex);
    }
-    int index;
    CpuNeighborList& owner;
 };
-static void* threadBody(void* args) {
-    CpuNeighborList::ThreadData& data = *reinterpret_cast<CpuNeighborList::ThreadData*>(args);
-    data.owner.runThread(data.index);
-    delete &data;
-    return 0;
-}
 CpuNeighborList::CpuNeighborList() {
-    isDeleted = false;
-    numThreads = getNumProcessors();
-    pthread_cond_init(&startCondition, NULL);
-    pthread_cond_init(&endCondition, NULL);
-    pthread_mutex_init(&lock, NULL);
-    thread.resize(numThreads);
-    pthread_mutex_lock(&lock);
-    waitCount = 0;
-    for (int i = 0; i < numThreads; i++) {
-        ThreadData* data = new ThreadData(i, *this);
-        threadData.push_back(data);
-        pthread_create(&thread[i], NULL, threadBody, data);
-    }
-    while (waitCount < numThreads)
-        pthread_cond_wait(&endCondition, &lock);
-    pthread_mutex_unlock(&lock);
-}
-CpuNeighborList::~CpuNeighborList() {
-    isDeleted = true;
-    pthread_mutex_lock(&lock);
-    pthread_cond_broadcast(&startCondition);
-    pthread_mutex_unlock(&lock);
-    for (int i = 0; i < (int) thread.size(); i++)
-        pthread_join(thread[i], NULL);
-    pthread_mutex_destroy(&lock);
-    pthread_cond_destroy(&startCondition);
-    pthread_cond_destroy(&endCondition);
 }
 void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions,
-            const float* periodicBoxSize, bool usePeriodic, float maxDistance) {
+            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
    int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
    blockNeighbors.resize(numBlocks);
    blockExclusions.resize(numBlocks);
@@ -338,8 +336,9 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
    // Sort the atoms based on a Hilbert curve.
    atomBins.resize(numAtoms);
-    pthread_mutex_lock(&lock);
+    ThreadTask task(*this);
-    advanceThreads();
+    threads.execute(task);
+    threads.waitForThreads();
    sort(atomBins.begin(), atomBins.end());
    // Build the voxel hash.
@@ -362,8 +361,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
    // Signal the threads to start running and wait for them to finish.
-    advanceThreads();
+    threads.resumeThreads();
-    pthread_mutex_unlock(&lock);
+    threads.waitForThreads();
    // Add padding atoms to fill up the last block.
@@ -395,20 +394,14 @@ const std::vector<char>& CpuNeighborList::getBlockExclusions(int blockIndex) con
 }
-void CpuNeighborList::runThread(int index) {
+void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadIndex) {
-    while (true) {
-        // Wait for the signal to start running.
-        threadWait();
-        if (isDeleted)
-            break;
    // Compute the positions of atoms along the Hilbert curve.
    float binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0f;
    float invBinWidth = 1.0f/binWidth;
    bitmask_t coords[3];
-        for (int i = index; i < numAtoms; i += numThreads) {
+    int numThreads = threads.getNumThreads();
+    for (int i = threadIndex; i < numAtoms; i += numThreads) {
        const float* pos = &atomLocations[4*i];
        coords[0] = (bitmask_t) ((pos[0]-minx)*invBinWidth);
        coords[1] = (bitmask_t) ((pos[1]-miny)*invBinWidth);
@@ -416,26 +409,22 @@ void CpuNeighborList::runThread(int index) {
        int bin = (int) hilbert_c2i(3, 8, coords);
        atomBins[i] = pair<int, int>(bin, i);
    }
-        threadWait();
+    threads.syncThreads();
    // Compute this thread's subset of neighbors.
    int numBlocks = blockNeighbors.size();
    vector<int> blockAtoms;
-        for (int i = index; i < numBlocks; i += numThreads) {
+    for (int i = threadIndex; i < numBlocks; i += numThreads) {
-            {
+        // Find the atoms in this block and compute their bounding box.
        int firstIndex = BlockSize*i;
        int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
        blockAtoms.resize(atomsInBlock);
        for (int j = 0; j < atomsInBlock; j++)
            blockAtoms[j] = sortedAtoms[firstIndex+j];
-            }
-            int firstIndex = BlockSize*i;
        fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
        fvec4 maxPos = minPos;
-            int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
        for (int j = 1; j < atomsInBlock; j++) {
            fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]);
            minPos = min(minPos, pos);
@@ -455,23 +444,6 @@ void CpuNeighborList::runThread(int index) {
            }
        }
    }
-    }
-}
-void CpuNeighborList::threadWait() {
-    pthread_mutex_lock(&lock);
-    waitCount++;
-    pthread_cond_signal(&endCondition);
-    pthread_cond_wait(&startCondition, &lock);
-    pthread_mutex_unlock(&lock);
-}
-void CpuNeighborList::advanceThreads() {
-    waitCount = 0;
-    pthread_cond_broadcast(&startCondition);
-    while (waitCount < numThreads) {
-        pthread_cond_wait(&endCondition, &lock);
-    }
 }
 } // namespace OpenMM
--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -30,7 +30,6 @@
 #include "CpuNonbondedForce.h"
 #include "ReferenceForce.h"
 #include "ReferencePME.h"
-#include "openmm/internal/hardware.h"
 #include "openmm/internal/SplineFitter.h"
 #include "openmm/internal/vectorize.h"
@@ -44,23 +43,16 @@ using namespace OpenMM;
 const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
 const int CpuNonbondedForce::NUM_TABLE_POINTS = 1025;
-class CpuNonbondedForce::ThreadData {
+class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
 public:
-    ThreadData(int index, CpuNonbondedForce& owner) : index(index), owner(owner) {
+    ComputeDirectTask(CpuNonbondedForce& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        owner.threadComputeDirect(threads, threadIndex);
    }
-    int index;
    CpuNonbondedForce& owner;
-    vector<float> threadForce;
-    double threadEnergy;
 };
-static void* threadBody(void* args) {
-    CpuNonbondedForce::ThreadData& data = *reinterpret_cast<CpuNonbondedForce::ThreadData*>(args);
-    data.owner.runThread(data.index, data.threadForce, data.threadEnergy);
-    delete &data;
-    return 0;
-}
 /**---------------------------------------------------------------------------------------
   CpuNonbondedForce constructor
@@ -68,40 +60,6 @@ static void* threadBody(void* args) {
   --------------------------------------------------------------------------------------- */
 CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false) {
-    isDeleted = false;
-    numThreads = getNumProcessors();
-    pthread_cond_init(&startCondition, NULL);
-    pthread_cond_init(&endCondition, NULL);
-    pthread_mutex_init(&lock, NULL);
-    thread.resize(numThreads);
-    pthread_mutex_lock(&lock);
-    waitCount = 0;
-    for (int i = 0; i < numThreads; i++) {
-        ThreadData* data = new ThreadData(i, *this);
-        threadData.push_back(data);
-        pthread_create(&thread[i], NULL, threadBody, data);
-    }
-    while (waitCount < numThreads)
-        pthread_cond_wait(&endCondition, &lock);
-    pthread_mutex_unlock(&lock);
-}
-/**---------------------------------------------------------------------------------------
-   CpuNonbondedForce destructor
-   --------------------------------------------------------------------------------------- */
-CpuNonbondedForce::~CpuNonbondedForce(){
-    isDeleted = true;
-    pthread_mutex_lock(&lock);
-    pthread_cond_broadcast(&startCondition);
-    pthread_mutex_unlock(&lock);
-    for (int i = 0; i < (int) thread.size(); i++)
-        pthread_join(thread[i], NULL);
-    pthread_mutex_destroy(&lock);
-    pthread_cond_destroy(&startCondition);
-    pthread_cond_destroy(&endCondition);
 }
  /**---------------------------------------------------------------------------------------
@@ -219,7 +177,7 @@ void CpuNonbondedForce::tabulateEwaldScaleFactor() {
    }
 }
-void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, vector<RealVec>& atomCoordinates,
+void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates,
                                             const vector<pair<float, float> >& atomParameters, const vector<set<int> >& exclusions,
                                             vector<RealVec>& forces, float* totalEnergy) const {
    typedef std::complex<float> d_complex;
@@ -333,34 +291,35 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, v
 }
-void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<pair<float, float> >& atomParameters,
+void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
-                const vector<set<int> >& exclusions, float* forces, float* totalEnergy) {
+                const vector<set<int> >& exclusions, float* forces, float* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    this->numberOfAtoms = numberOfAtoms;
    this->posq = posq;
+    this->atomCoordinates = &atomCoordinates[0];
    this->atomParameters = &atomParameters[0];
    this->exclusions = &exclusions[0];
    includeEnergy = (totalEnergy != NULL);
+    threadEnergy.resize(threads.getNumThreads());
+    threadForce.resize(threads.getNumThreads());
    // Signal the threads to start running and wait for them to finish.
-    pthread_mutex_lock(&lock);
+    ComputeDirectTask task(*this);
-    waitCount = 0;
+    threads.execute(task);
-    pthread_cond_broadcast(&startCondition);
+    threads.waitForThreads();
-    while (waitCount < numThreads)
-        pthread_cond_wait(&endCondition, &lock);
-    pthread_mutex_unlock(&lock);
    // Combine the results from all the threads.
    double directEnergy = 0;
+    int numThreads = threads.getNumThreads();
    for (int i = 0; i < numThreads; i++)
-        directEnergy += threadData[i]->threadEnergy;
+        directEnergy += threadEnergy[i];
    for (int i = 0; i < numberOfAtoms; i++) {
        fvec4 f(forces+4*i);
        for (int j = 0; j < numThreads; j++)
-            f += fvec4(&threadData[j]->threadForce[4*i]);
+            f += fvec4(&threadForce[j][4*i]);
        f.store(forces+4*i);
    }
@@ -368,45 +327,35 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
        *totalEnergy += (float) directEnergy;
 }
+void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex) {
-void CpuNonbondedForce::runThread(int index, vector<float>& threadForce, double& threadEnergy) {
-    while (true) {
-        // Wait for the signal to start running.
-        pthread_mutex_lock(&lock);
-        waitCount++;
-        pthread_cond_signal(&endCondition);
-        pthread_cond_wait(&startCondition, &lock);
-        pthread_mutex_unlock(&lock);
-        if (isDeleted)
-            break;
    // Compute this thread's subset of interactions.
-        threadEnergy = 0;
+    int numThreads = threads.getNumThreads();
-        double* energyPtr = (includeEnergy ? &threadEnergy : NULL);
+    threadEnergy[threadIndex] = 0;
-        threadForce.resize(4*numberOfAtoms, 0.0f);
+    double* energyPtr = (includeEnergy ? &threadEnergy[threadIndex] : NULL);
+    threadForce[threadIndex].resize(4*numberOfAtoms, 0.0f);
+    float* forces = &threadForce[threadIndex][0];
    for (int i = 0; i < 4*numberOfAtoms; i++)
-            threadForce[i] = 0.0f;
+        forces[i] = 0.0f;
    fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
    fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
    if (ewald || pme) {
        // Compute the interactions from the neighbor list.
-            for (int i = index; i < neighborList->getNumBlocks(); i += numThreads)
+        for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads)
-                calculateBlockEwaldIxn(i, &threadForce[0], energyPtr, boxSize, invBoxSize);
+            calculateBlockEwaldIxn(i, forces, energyPtr, boxSize, invBoxSize);
        // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
        fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
        fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
-            for (int i = index; i < numberOfAtoms; i += numThreads)
+        for (int i = threadIndex; i < numberOfAtoms; i += numThreads) {
+            fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
            for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
                if (*iter > i) {
                    int j = *iter;
                    fvec4 deltaR;
-                        fvec4 posI(posq+4*i);
+                    fvec4 posJ((float) atomCoordinates[j][0], (float) atomCoordinates[j][1], (float) atomCoordinates[j][2], 0.0f);
-                        fvec4 posJ(posq+4*j);
                    float r2;
                    getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
                    float r = sqrtf(r2);
@@ -417,27 +366,27 @@ void CpuNonbondedForce::runThread(int index, vector<float>& threadForce, double&
                    float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
                    dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
                    fvec4 result = deltaR*dEdR;
-                        (fvec4(&threadForce[4*i])-result).store(&threadForce[4*i]);
+                    (fvec4(forces+4*i)-result).store(forces+4*i);
-                        (fvec4(&threadForce[4*j])+result).store(&threadForce[4*j]);
+                    (fvec4(forces+4*j)+result).store(forces+4*j);
                    if (includeEnergy)
-                            threadEnergy -= chargeProd*inverseR*(1.0f-erfcAlphaR);
+                        threadEnergy[threadIndex] -= chargeProd*inverseR*(1.0f-erfcAlphaR);
+                }
            }
        }
    }
    else if (cutoff) {
        // Compute the interactions from the neighbor list.
-            for (int i = index; i < neighborList->getNumBlocks(); i += numThreads)
+        for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads)
-                calculateBlockIxn(i, &threadForce[0], energyPtr, boxSize, invBoxSize);
+            calculateBlockIxn(i, forces, energyPtr, boxSize, invBoxSize);
    }
    else {
        // Loop over all atom pairs
-            for (int i = index; i < numberOfAtoms; i += numThreads){
+        for (int i = threadIndex; i < numberOfAtoms; i += numThreads){
            for (int j = i+1; j < numberOfAtoms; j++)
                if (exclusions[j].find(i) == exclusions[j].end())
-                        calculateOneIxn(i, j, &threadForce[0], energyPtr, boxSize, invBoxSize);
+                    calculateOneIxn(i, j, forces, energyPtr, boxSize, invBoxSize);
-            }
        }
    }
 }
@@ -507,17 +456,27 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
        blockAtomForce[i] = fvec4(0.0f);
    }
+    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
+    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
+    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
+    bool needPeriodic = false;
+    if (periodic) {
+        for (int i = 0; i < 4 && !needPeriodic; i++)
+            for (int j = 0; j < 3; j++)
+                if (blockAtomPosq[i][j]-cutoffDistance < 0.0 || blockAtomPosq[i][j]+cutoffDistance > boxSize[j]) {
+                    needPeriodic = true;
+                    break;
+                }
+    }
    // Loop over neighbors for this block.
    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    float blockAtomR2[4];
    bool include[4];
-    fvec4 blockAtomDelta[4];
    for (int i = 0; i < (int) neighbors.size(); i++) {
        // Load the next neighbor.
@@ -527,9 +486,10 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
        // Compute the distances to the block atoms.
        bool any = false;
+        fvec4 dx, dy, dz, r2;
+        getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
        for (int j = 0; j < 4; j++) {
-            getDeltaR(atomPosq, blockAtomPosq[j], blockAtomDelta[j], blockAtomR2[j], periodic, boxSize, invBoxSize);
+            include[j] = (((exclusions[i]>>j)&1) == 0 && (!cutoff || r2[j] < cutoffDistance*cutoffDistance));
-            include[j] = (((exclusions[i]>>j)&1) == 0 && (!cutoff || blockAtomR2[j] < cutoffDistance*cutoffDistance));
            any |= include[j];
        }
        if (!any)
@@ -537,7 +497,6 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
        // Compute the interactions.
-        fvec4 r2(blockAtomR2);
        fvec4 r = sqrt(r2);
        fvec4 inverseR = fvec4(1.0f)/r;
        fvec4 switchValue(1.0f), switchDeriv(0.0f);
@@ -578,12 +537,13 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
        // Accumulate forces.
+        fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
+        transpose(result[0], result[1], result[2], result[3]);
        fvec4 atomForce(forces+4*atom);
        for (int j = 0; j < 4; j++) {
            if (include[j]) {
-                fvec4 result = blockAtomDelta[j]*dEdR[j];
+                blockAtomForce[j] += result[j];
-                blockAtomForce[j] += result;
+                atomForce -= result[j];
-                atomForce -= result;
            }
        }
        atomForce.store(forces+4*atom);
@@ -606,17 +566,25 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
        blockAtomForce[i] = fvec4(0.0f);
    }
+    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
+    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
+    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
+    bool needPeriodic = false;
+    for (int i = 0; i < 4 && !needPeriodic; i++)
+        for (int j = 0; j < 3; j++)
+            if (blockAtomPosq[i][j]-cutoffDistance < 0.0 || blockAtomPosq[i][j]+cutoffDistance > boxSize[j]) {
+                needPeriodic = true;
+                break;
+            }
    // Loop over neighbors for this block.
    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    float blockAtomR2[4];
    bool include[4];
-    fvec4 blockAtomDelta[4];
    for (int i = 0; i < (int) neighbors.size(); i++) {
        // Load the next neighbor.
@@ -626,9 +594,10 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
        // Compute the distances to the block atoms.
        bool any = false;
+        fvec4 dx, dy, dz, r2;
+        getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
        for (int j = 0; j < 4; j++) {
-            getDeltaR(atomPosq, blockAtomPosq[j], blockAtomDelta[j], blockAtomR2[j], periodic, boxSize, invBoxSize);
+            include[j] = (((exclusions[i]>>j)&1) == 0 && r2[j] < cutoffDistance*cutoffDistance);
-            include[j] = (((exclusions[i]>>j)&1) == 0 && blockAtomR2[j] < cutoffDistance*cutoffDistance);
            any |= include[j];
        }
        if (!any)
@@ -636,7 +605,6 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
        // Compute the interactions.
-        fvec4 r2(blockAtomR2);
        fvec4 r = sqrt(r2);
        fvec4 inverseR = fvec4(1.0f)/r;
        fvec4 switchValue(1.0f), switchDeriv(0.0f);
@@ -671,12 +639,13 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
        // Accumulate forces.
+        fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
+        transpose(result[0], result[1], result[2], result[3]);
        fvec4 atomForce(forces+4*atom);
        for (int j = 0; j < 4; j++) {
            if (include[j]) {
-                fvec4 result = blockAtomDelta[j]*dEdR[j];
+                blockAtomForce[j] += result[j];
-                blockAtomForce[j] += result;
+                atomForce -= result[j];
-                atomForce -= result;
            }
        }
        atomForce.store(forces+4*atom);
@@ -697,6 +666,18 @@ void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& d
    r2 = dot3(deltaR, deltaR);
 }
+void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+    dx = x-posI[0];
+    dy = y-posI[1];
+    dz = z-posI[2];
+    if (periodic) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
+    }
+    r2 = dx*dx + dy*dy + dz*dz;
+}
 fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
@@ -720,7 +701,7 @@ fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
    coeff[0] = 1.0f-coeff[1];
    coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
    coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
-    _MM_TRANSPOSE4_PS(coeff[0], coeff[1], coeff[2], coeff[3]);
+    transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
    for (int i = 0; i < 4; i++) {
        if (index[i] < NUM_TABLE_POINTS)
            y[i] = dot4(coeff[i], fvec4(&ewaldScaleTable[4*index[i]]));

--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -34,6 +34,7 @@
 */
 #include "openmm/internal/AssertionUtilities.h"
+#include "openmm/internal/ThreadPool.h"
 #include "CpuNeighborList.h"
 #include "CpuPlatform.h"
 #include "sfmt/SFMT.h"
@@ -63,8 +64,9 @@ void testNeighborList(bool periodic) {
            exclusions[i-j].insert(i);
        }
    }
+    ThreadPool threads;
    CpuNeighborList neighborList;
-    neighborList.computeNeighborList(numParticles, positions, exclusions, boxSize, periodic, cutoff);
+    neighborList.computeNeighborList(numParticles, positions, exclusions, boxSize, periodic, cutoff, threads);
    // Convert the neighbor list to a set for faster lookup.

--- a/platforms/cpu/tests/TestCpuNonbondedForce.cpp
+++ b/platforms/cpu/tests/TestCpuNonbondedForce.cpp
@@ -397,44 +397,44 @@ void testLargeSystem() {
    system.addForce(bonds);
    VerletIntegrator integrator1(0.01);
    VerletIntegrator integrator2(0.01);
-    Context cuContext(system, integrator1, platform);
+    Context cpuContext(system, integrator1, platform);
    Context referenceContext(system, integrator2, reference);
-    cuContext.setPositions(positions);
+    cpuContext.setPositions(positions);
-    cuContext.setVelocities(velocities);
+    cpuContext.setVelocities(velocities);
    referenceContext.setPositions(positions);
    referenceContext.setVelocities(velocities);
-    State cuState = cuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
+    State cpuState = cpuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
    State referenceState = referenceContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
    for (int i = 0; i < numParticles; i++) {
-        ASSERT_EQUAL_VEC(cuState.getPositions()[i], referenceState.getPositions()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getPositions()[i], referenceState.getPositions()[i], tol);
-        ASSERT_EQUAL_VEC(cuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
-        ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
    }
-    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
+    ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
    // Now do the same thing with periodic boundary conditions.
    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
-    cuContext.reinitialize();
+    cpuContext.reinitialize();
    referenceContext.reinitialize();
-    cuContext.setPositions(positions);
+    cpuContext.setPositions(positions);
-    cuContext.setVelocities(velocities);
+    cpuContext.setVelocities(velocities);
    referenceContext.setPositions(positions);
    referenceContext.setVelocities(velocities);
-    cuState = cuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
+    cpuState = cpuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
    referenceState = referenceContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
    for (int i = 0; i < numParticles; i++) {
-        double dx = cuState.getPositions()[i][0]-referenceState.getPositions()[i][0];
+        double dx = cpuState.getPositions()[i][0]-referenceState.getPositions()[i][0];
-        double dy = cuState.getPositions()[i][1]-referenceState.getPositions()[i][1];
+        double dy = cpuState.getPositions()[i][1]-referenceState.getPositions()[i][1];
-        double dz = cuState.getPositions()[i][2]-referenceState.getPositions()[i][2];
+        double dz = cpuState.getPositions()[i][2]-referenceState.getPositions()[i][2];
-        ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][0]-referenceState.getPositions()[i][0], boxSize), 0, tol);
+        ASSERT_EQUAL_TOL(fmod(cpuState.getPositions()[i][0]-referenceState.getPositions()[i][0], boxSize), 0, tol);
-        ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][1]-referenceState.getPositions()[i][1], boxSize), 0, tol);
+        ASSERT_EQUAL_TOL(fmod(cpuState.getPositions()[i][1]-referenceState.getPositions()[i][1], boxSize), 0, tol);
-        ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][2]-referenceState.getPositions()[i][2], boxSize), 0, tol);
+        ASSERT_EQUAL_TOL(fmod(cpuState.getPositions()[i][2]-referenceState.getPositions()[i][2], boxSize), 0, tol);
-        ASSERT_EQUAL_VEC(cuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
-        ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
    }
-    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
+    ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
 }
 void testDispersionCorrection() {
@@ -542,15 +542,15 @@ void testChangingParameters() {
    VerletIntegrator integrator1(0.01);
    VerletIntegrator integrator2(0.01);
-    Context cuContext(system, integrator1, platform);
+    Context cpuContext(system, integrator1, platform);
    Context referenceContext(system, integrator2, reference);
-    cuContext.setPositions(positions);
+    cpuContext.setPositions(positions);
    referenceContext.setPositions(positions);
-    State cuState = cuContext.getState(State::Forces | State::Energy);
+    State cpuState = cpuContext.getState(State::Forces | State::Energy);
    State referenceState = referenceContext.getState(State::Forces | State::Energy);
    for (int i = 0; i < numParticles; i++)
-        ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
-    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
+    ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
    // Now modify parameters and see if they still agree.
@@ -559,13 +559,13 @@ void testChangingParameters() {
        nonbonded->getParticleParameters(i, charge, sigma, epsilon);
        nonbonded->setParticleParameters(i, 1.5*charge, 1.1*sigma, 1.7*epsilon);
    }
-    nonbonded->updateParametersInContext(cuContext);
+    nonbonded->updateParametersInContext(cpuContext);
    nonbonded->updateParametersInContext(referenceContext);
-    cuState = cuContext.getState(State::Forces | State::Energy);
+    cpuState = cpuContext.getState(State::Forces | State::Energy);
    referenceState = referenceContext.getState(State::Forces | State::Energy);
    for (int i = 0; i < numParticles; i++)
-        ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
-    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
+    ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
 }
 void testSwitchingFunction(NonbondedForce::NonbondedMethod method) {

--- a/platforms/reference/include/ReferencePME.h
+++ b/platforms/reference/include/ReferencePME.h
@@ -75,9 +75,9 @@ pme_init(pme_t *       ppme,
 */
 int
 pme_exec(pme_t       pme,
-         std::vector<OpenMM::RealVec>& atomCoordinates,
+         const std::vector<OpenMM::RealVec>& atomCoordinates,
         std::vector<OpenMM::RealVec>& forces,
-         std::vector<RealOpenMM>& charges,
+         const std::vector<RealOpenMM>& charges,
         const RealOpenMM  periodicBoxSize[3],
         RealOpenMM *    energy,
         RealOpenMM      pme_virial[3][3]);

--- a/platforms/reference/src/SimTKReference/ReferencePME.cpp
+++ b/platforms/reference/src/SimTKReference/ReferencePME.cpp
@@ -195,7 +195,7 @@ pme_calculate_bsplines_moduli(pme_t pme)
 static void
 pme_update_grid_index_and_fraction(pme_t    pme,
-                                   vector<RealVec>& atomCoordinates,
+                                   const vector<RealVec>& atomCoordinates,
                                   const RealOpenMM   periodicBoxSize[3])
 {
    int    i;
@@ -317,7 +317,7 @@ pme_update_bsplines(pme_t    pme)
 static void
-pme_grid_spread_charge(pme_t      pme, vector<RealOpenMM>& charges)
+pme_grid_spread_charge(pme_t pme, const vector<RealOpenMM>& charges)
 {
    int       order;
    int       i;
@@ -521,7 +521,7 @@ pme_reciprocal_convolution(pme_t     pme,
 static void
 pme_grid_interpolate_force(pme_t pme,
                           const RealOpenMM periodicBoxSize[3],
-                           vector<RealOpenMM>& charges,
+                           const vector<RealOpenMM>& charges,
                           vector<RealVec>& forces)
 {
    int       i;
@@ -666,11 +666,11 @@ pme_init(pme_t *       ppme,
 int pme_exec(pme_t       pme,
-             vector<RealVec>&   atomCoordinates,
+             const vector<RealVec>& atomCoordinates,
             vector<RealVec>& forces,
-             vector<RealOpenMM>& charges,
+             const vector<RealOpenMM>& charges,
             const RealOpenMM periodicBoxSize[3],
-             RealOpenMM *    energy,
+             RealOpenMM* energy,
             RealOpenMM pme_virial[3][3])
 {
    /* Routine is called with coordinates in x, a box, and charges in q */

--- a/wrappers/python/simtk/openmm/app/gromacstopfile.py
+++ b/wrappers/python/simtk/openmm/app/gromacstopfile.py
@@ -40,7 +40,7 @@ import simtk.unit as unit
 import simtk.openmm as mm
 import math
 import os
-import distutils
+import distutils.spawn
 HBonds = ff.HBonds
 AllBonds = ff.AllBonds
@@ -358,7 +358,7 @@ class GromacsTopFile(object):
            raise ValueError('Unsupported function type in [ cmaptypes ] line: '+line);
        self._cmapTypes[tuple(fields[:5])] = fields
-    def __init__(self, file, unitCellDimensions=None, includeDir=None, defines={}):
+    def __init__(self, file, unitCellDimensions=None, includeDir=None, defines=None):
        """Load a top file.
        Parameters:
@@ -368,12 +368,18 @@ class GromacsTopFile(object):
           included from the top file. If not specified, we will attempt to locate a gromacs
           installation on your system. When gromacs is installed in /usr/local, this will resolve
           to  /usr/local/gromacs/share/gromacs/top
-         - defines (map={}) preprocessor definitions that should be predefined when parsing the file
+         - defines (dict={}) preprocessor definitions that should be predefined when parsing the file
         """
        if includeDir is None:
            includeDir = _defaultGromacsIncludeDir()
        self._includeDirs = (os.path.dirname(file), includeDir)
-        self._defines = defines
+        # Most of the gromacs water itp files for different forcefields,
+        # unless the preprocessor #define FLEXIBLE is given, don't define
+        # bonds between the water hydrogen and oxygens, but only give the
+        # constraint distances and exclusions.
+        self._defines = {'FLEXIBLE': True}
+        if defines is not None:
+            self._defines.update(defines)
        # Parse the file.

--- a/wrappers/python/src/swig_doxygen/doxygen/Doxyfile.in
+++ b/wrappers/python/src/swig_doxygen/doxygen/Doxyfile.in
@@ -616,6 +616,7 @@ EXCLUDE_SYMLINKS       = NO
 EXCLUDE_PATTERNS       = */tests/* \
                         */openmmapi/src/* \
+                         */internal/* \
                         */.svn/* \
                         *OpenMMFortranModule.f90 \
                         *OpenMMCWrapper.h

--- a/wrappers/python/src/swig_doxygen/swigInputBuilder.py
+++ b/wrappers/python/src/swig_doxygen/swigInputBuilder.py
-#!/bin/env python
+#!/usr/bin/env python
 #
 #

--- a/wrappers/python/src/swig_doxygen/swigInputConfig.py
+++ b/wrappers/python/src/swig_doxygen/swigInputConfig.py
@@ -14,20 +14,7 @@ DOC_STRINGS = {("Context", "setPositions") :
 # Do not generate wrappers for the following methods.
 # Indexed by (className, [methodName [, numParams]])
 SKIP_METHODS = [('State',),
-                ('Stream',),
                ('Vec3',),
-                ('AmoebaGeneralizedKirkwoodForceImpl',),
-                ('AmoebaAngleForceImpl',),
-                ('AmoebaBondForceImpl',),
-                ('AmoebaInPlaneAngleForceImpl',),
-                ('AmoebaMultipoleForceImpl',),
-                ('AmoebaOutOfPlaneBendForceImpl',),
-                ('AmoebaPiTorsionForceImpl',),
-                ('AmoebaStretchBendForceImpl',),
-                ('AmoebaTorsionTorsionForceImpl',),
-                ('AmoebaVdwForceImpl',),
-                ('AmoebaWcaDispersionForceImpl',),
-                ('AndersenThermostatImpl',),
                ('AngleInfo',),
                ('ApplyAndersenThermostatKernel',),
                ('ApplyConstraintsKernel',),
@@ -63,30 +50,14 @@ SKIP_METHODS = [('State',),
                ('CalcNonbondedForceKernel',),
                ('CalcPeriodicTorsionForceKernel',),
                ('CalcRBTorsionForceKernel',),
-                ('CMAPTorsionForceImpl',),
-                ('CMMotionRemoverImpl',),
                ('ComputationInfo',),
                ('ConstraintInfo',),
-                ('ContextImpl',),
                ('CudaKernelFactory',),
                ('CudaStreamFactory',),
-                ('CustomAngleForceImpl',),
-                ('CustomBondForceImpl',),
-                ('CustomCompoundBondForceImpl',),
-                ('CustomExternalForceImpl',),
-                ('CustomGBForceImpl',),
-                ('CustomHbondForceImpl',),
-                ('CustomNonbondedForceImpl',),
-                ('CustomTorsionForceImpl',),
                ('ExceptionInfo',),
                ('ExclusionInfo',),
-                ('ForceImpl',),
                ('FunctionInfo',),
-                ('GBSAOBCForceImpl',),
-                ('GBVIForceImpl',),
                ('GlobalParameterInfo',),
-                ('HarmonicAngleForceImpl',),
-                ('HarmonicBondForceImpl',),
                ('InitializeForcesKernel',),
                ('IntegrateBrownianStepKernel',),
                ('IntegrateLangevinStepKernel',),
@@ -97,24 +68,18 @@ SKIP_METHODS = [('State',),
                ('Kernel',),
                ('KernelFactory',),
                ('KernelImpl',),
-                ('MonteCarloBarostatImpl',),
-                ('MonteCarloAnisotropicBarostatImpl',),
                ('MultipoleInfo',),
-                ('NonbondedForceImpl',),
                ('OutOfPlaneBendInfo',),
                ('ParameterInfo',),
                ('ParticleInfo',),
-                ('PeriodicTorsionForceImpl',),
                ('PeriodicTorsionInfo',),
                ('PerParticleParameterInfo',),
                ('PiTorsionInfo',),
                ('PlatformData',),
-                ('RBTorsionForceImpl',),
                ('RBTorsionInfo',),
                ('RemoveCMMotionKernel',),
                ('SplineFitter',),
                ('StreamFactory',),
-                ('StreamImpl',),
                ('StretchBendInfo',),
                ('TorsionInfo',),
                ('TorsionTorsionGridInfo',),
@@ -139,14 +104,11 @@ SKIP_METHODS = [('State',),
                ('Platform', 'registerKernelFactory'),
                ('IntegrateRPMDStepKernel',),
                ('RPMDIntegrator',  'getState'),
-                ('DrudeForceImpl',),
                ('CalcDrudeForceKernel',),
                ('IntegrateDrudeLangevinStepKernel',),
                ('IntegrateDrudeSCFStepKernel',),
                ('XmlSerializer',  'serialize'),
                ('XmlSerializer',  'deserialize'),
-                ('fvec4',),
-                ('ivec4',),
 ]
 # The build script assumes method args that are non-const references are
@@ -161,6 +123,8 @@ NO_OUTPUT_ARGS = [('LocalEnergyMinimizer', 'minimize', 'context'),
                  ('AmoebaMultipoleForce', 'addParticle', 'molecularDipole'),
                  ('AmoebaMultipoleForce', 'addParticle', 'molecularQuadrupole'),
                  ('AmoebaMultipoleForce', 'setCovalentMap', 'covalentAtoms'),
+                  ('AmoebaMultipoleForce', 'getElectrostaticPotential', 'context'),
+                  ('AmoebaMultipoleForce', 'getInducedDipoles', 'context'),
 ]
 # SWIG assumes the target language shadow class owns the C++ class
@@ -285,6 +249,7 @@ UNITS = {
 #("AmoebaMultipoleForce",                 "getElectrostaticPotential")                     :  ( None, ('unit.kilojoule_per_mole')),
 #("AmoebaMultipoleForce",                 "getElectrostaticPotential")                     :  ( ('unit.kilojoule_per_mole'), ()),
 ("AmoebaMultipoleForce",                 "getElectrostaticPotential")                     :  ( None, ()),
+("AmoebaMultipoleForce",                 "getInducedDipoles")                             :  ( None, ()),
 ("AmoebaMultipoleForce",                 "getSystemMultipoleMoments")                     :  ( None, ()),
 ("AmoebaOutOfPlaneBendForce",            "getNumOutOfPlaneBends")                         :  ( None, ()),

--- a/wrappers/python/src/swig_doxygen/swig_lib/python/typemaps.i
+++ b/wrappers/python/src/swig_doxygen/swig_lib/python/typemaps.i
 /* Convert python list of tuples to C++ std::vector of Vec3 objects */
-%typemap(in) std::vector<Vec3>& (std::vector<OpenMM::Vec3> vVec) {
+%typemap(in) const std::vector<Vec3>& (std::vector<OpenMM::Vec3> vVec) {
  // typemap -- %typemap(in) std::vector<Vec3>& (std::vector<OpenMM::Vec3> vVec)
  int i, pLength, itemLength;
  double x, y, z;
@@ -34,6 +34,32 @@
  $1 = &vVec;
 }
+/* The following two typemaps cause a non-const vector<Vec3>& to become a return value. */
+%typemap(in, numinputs=0) std::vector<Vec3>& (std::vector<Vec3> temp) {
+    $1 = &temp;
+}
+%typemap(argout) std::vector<Vec3>& {
+    int i, n;
+    PyObject *pyList;
+    n=(*$1).size(); 
+    pyList=PyList_New(n);
+    PyObject* mm = PyImport_AddModule("simtk.openmm");
+    PyObject* vec3 = PyObject_GetAttrString(mm, "Vec3");
+    for (i=0; i<n; i++) {
+        OpenMM::Vec3& v = (*$1).at(i);
+        PyObject* args = Py_BuildValue("(d,d,d)", v[0], v[1], v[2]);
+        PyObject* pyVec = PyObject_CallObject(vec3, args);
+        Py_DECREF(args);
+        PyList_SET_ITEM(pyList, i, pyVec);
+    }
+    $result = pyList;
+}
+/* const vector<Vec3> should NOT become an output. */
+%typemap(argout) const std::vector<Vec3>& {
+}
 /* Convert python tuple to C++ Vec3 object*/
 %typemap(typecheck) Vec3 {