Merge branch 'master' of github.com:SimTk/openmm

59854c5e · leeping · 8167c79b · 8e11ed9c · 59854c5e · 59854c5e
Commit 59854c5e authored Oct 30, 2013 by leeping
19 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -296,9 +296,9 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
 # On Linux need to link to libdl
 FIND_LIBRARY(DL_LIBRARY dl)
 IF(DL_LIBRARY)
-  TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${DL_LIBRARY})
+  TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${DL_LIBRARY} ${PTHREADS_LIB})
  IF(OPENMM_BUILD_STATIC_LIB)
-    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${DL_LIBRARY})
+    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${DL_LIBRARY} ${PTHREADS_LIB})
  ENDIF(OPENMM_BUILD_STATIC_LIB)
 ENDIF(DL_LIBRARY)
 IF(WIN32)

--- a/openmmapi/include/openmm/internal/ThreadPool.h
+++ b/openmmapi/include/openmm/internal/ThreadPool.h
+#ifndef OPENMM_THREAD_POOL_H_
+#define OPENMM_THREAD_POOL_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "windowsExport.h"
+#include <pthread.h>
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * A ThreadPool creates a set of worker threads that can be used to execute tasks in parallel.
+ * After creating a ThreadPool, call execute() to start a task running then waitForThreads()
+ * to block until all threads have finished.  You also can synchronize the threads in the middle
+ * of the task by having them call syncThreads().  In this case, the parent thread should call
+ * waitForThreads() an additional time; each call waits until all worker threads have reached the
+ * next syncThreads(), and the final call waits until they exit from the Task's execute() method.
+ * After calling waitForThreads() to block at a synchronization point, the parent thread should
+ * call resumeThreads() to instruct the worker threads to resume.
+ */
+class OPENMM_EXPORT ThreadPool {
+public:
+    class Task;
+    class ThreadData;
+    /**
+     * Create a pool with one worker thread per processor reported by getNumProcessors().  The constructor
+     * returns once every worker has started and is waiting for a task.
+     */
+    ThreadPool();
+    /**
+     * Tell the worker threads to exit, join them, and release the pool's mutex and condition variables.
+     */
+    ~ThreadPool();
+    /**
+     * Get the number of worker threads in the pool.
+     */
+    int getNumThreads() const;
+    /**
+     * Execute a Task in parallel on the worker threads.
+     */
+    void execute(Task& task);
+    /**
+     * This is called by the worker threads to block until all threads have reached the same point
+     * and the master thread instructs them to continue by calling resumeThreads().
+     */
+    void syncThreads();
+    /**
+     * This is called by the master thread to wait until all threads have completed the Task.  Alternatively,
+     * if the threads call syncThreads(), this blocks until all threads have reached the synchronization point.
+     */
+    void waitForThreads();
+    /**
+     * Instruct the threads to resume running after blocking at a synchronization point.
+     */
+    void resumeThreads();
+private:
+    // A ThreadPool owns raw pthread handles, a mutex and two condition variables, and every ThreadData
+    // holds a reference back to its owning pool.  A copy would join the same threads and destroy the
+    // same synchronization objects a second time, so copying is disabled: these two members are
+    // declared but intentionally never defined.
+    ThreadPool(const ThreadPool&);
+    ThreadPool& operator=(const ThreadPool&);
+    // NOTE(review): appears unused by ThreadPool.cpp (the per-thread flag lives in ThreadData) -- confirm before removing.
+    bool isDeleted;
+    // numThreads is the number of workers; waitCount is how many of them have called syncThreads()
+    // since the last resumeThreads().
+    int numThreads, waitCount;
+    std::vector<pthread_t> thread;
+    std::vector<ThreadData*> threadData;
+    // startCondition releases waiting workers; endCondition is signaled by workers as they arrive in syncThreads().
+    pthread_cond_t startCondition, endCondition;
+    // Guards waitCount and is the mutex used with both condition variables.
+    pthread_mutex_t lock;
+};
+
+/**
+ * This defines a task that can be executed in parallel by the worker threads.
+ */
+class OPENMM_EXPORT ThreadPool::Task {
+public:
+    /**
+     * Tasks are handled through references and pointers to this base class, so it needs a virtual
+     * destructor to keep destroying a subclass through a Task pointer well defined.
+     */
+    virtual ~Task() {
+    }
+    /**
+     * Execute the task on each thread.
+     * 
+     * @param pool         the ThreadPool being used to execute the task
+     * @param threadIndex  the index of the thread invoking this method
+     */
+    virtual void execute(ThreadPool& pool, int threadIndex) = 0;
+};
+
+} // namespace OpenMM
+
+#endif // OPENMM_THREAD_POOL_H_
--- a/openmmapi/include/openmm/internal/vectorize.h
+++ b/openmmapi/include/openmm/internal/vectorize.h
@@ -212,6 +212,10 @@ static inline float dot4(fvec4 v1, fvec4 v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
 }

+static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) { // Transposes, in place, the 4x4 matrix whose rows are v1..v4.
+    _MM_TRANSPOSE4_PS(v1, v2, v3, v4); // SSE macro; the results are written back into the four arguments.
+}
+
 // Functions that operate on ivec4s.

 static inline ivec4 min(ivec4 v1, ivec4 v2) {

--- a/openmmapi/src/ThreadPool.cpp
+++ b/openmmapi/src/ThreadPool.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/internal/ThreadPool.h"
+#include "openmm/internal/hardware.h"
+
+using namespace std;
+
+namespace OpenMM {
+
+/**
+ * Per-thread state handed to each worker thread when it is created.
+ */
+class ThreadPool::ThreadData {
+public:
+    // currentTask starts out NULL rather than indeterminate; ThreadPool::execute() assigns it
+    // before the workers are released to run a task.
+    ThreadData(ThreadPool& owner, int index) : owner(owner), index(index), isDeleted(false), currentTask(NULL) {
+    }
+    ThreadPool& owner;   // the pool this worker belongs to
+    int index;           // passed to Task::execute() as threadIndex
+    bool isDeleted;      // set by ~ThreadPool() to make the worker leave its loop
+    Task* currentTask;   // the task to run on the next wakeup; not owned
+};
+
+static void* threadBody(void* args) { // Entry point given to pthread_create(); args is this worker's ThreadData.
+    ThreadPool::ThreadData& data = *reinterpret_cast<ThreadPool::ThreadData*>(args);
+    while (true) {
+        // Wait for the signal to start running.  This call also counts the thread as arrived for the master thread.
+        
+        data.owner.syncThreads();
+        if (data.isDeleted) // Set by ~ThreadPool() before it wakes the workers.
+            break;
+        data.currentTask->execute(data.owner, data.index); // currentTask is assigned by ThreadPool::execute().
+    }
+    delete &data; // The worker frees its own ThreadData, which the ThreadPool constructor allocated.
+    return 0;
+}
+
+/**
+ * Start the worker threads and wait until each of them has reached its first syncThreads() call.
+ */
+ThreadPool::ThreadPool() {
+    // This member is declared in the class but was never given a value; initialize it so it is
+    // never left indeterminate.
+    isDeleted = false;
+    numThreads = getNumProcessors();
+    pthread_cond_init(&startCondition, NULL);
+    pthread_cond_init(&endCondition, NULL);
+    pthread_mutex_init(&lock, NULL);
+    thread.resize(numThreads);
+
+    // The lock is held while the threads are created, so no worker can get past the
+    // pthread_mutex_lock() in syncThreads() until this thread starts waiting below.
+    pthread_mutex_lock(&lock);
+    waitCount = 0;
+    for (int i = 0; i < numThreads; i++) {
+        // The ThreadData constructor already clears its isDeleted flag.  The worker thread deletes
+        // this object itself when it exits.
+        ThreadData* data = new ThreadData(*this, i);
+        threadData.push_back(data);
+        // NOTE(review): the result of pthread_create() is not checked; if a thread failed to start,
+        // the wait below would presumably never finish -- confirm and handle.
+        pthread_create(&thread[i], NULL, threadBody, data);
+    }
+
+    // Each worker increments waitCount and signals endCondition from syncThreads().
+    while (waitCount < numThreads)
+        pthread_cond_wait(&endCondition, &lock);
+    pthread_mutex_unlock(&lock);
+}
+
+ThreadPool::~ThreadPool() { // Stops and joins every worker, then destroys the mutex and both condition variables.
+    for (int i = 0; i < (int) threadData.size(); i++)
+        threadData[i]->isDeleted = true; // Checked by threadBody() right after it returns from syncThreads().
+    pthread_mutex_lock(&lock);
+    pthread_cond_broadcast(&startCondition); // Wakes the workers blocked in syncThreads() so they see the flag.
+    pthread_mutex_unlock(&lock);
+    for (int i = 0; i < (int) thread.size(); i++)
+        pthread_join(thread[i], NULL); // NOTE(review): presumably assumes every worker is already waiting; one still running a task would miss the broadcast -- confirm callers finish with waitForThreads().
+    pthread_mutex_destroy(&lock);
+    pthread_cond_destroy(&startCondition);
+    pthread_cond_destroy(&endCondition);
+}
+
+int ThreadPool::getNumThreads() const { // The count is taken from getNumProcessors() in the constructor.
+    return numThreads;
+}
+
+void ThreadPool::execute(Task& task) { // Hands the same task to every worker and releases them; it does not wait for them to finish.
+    for (int i = 0; i < (int) threadData.size(); i++)
+        threadData[i]->currentTask = &task; // Only the address is stored, so the task must outlive the workers' use of it.
+    resumeThreads();
+}
+
+void ThreadPool::syncThreads() { // Worker side of the rendezvous: report arrival, then sleep until released.
+    pthread_mutex_lock(&lock);
+    waitCount++; // Compared against numThreads by waitForThreads() and by the constructor.
+    pthread_cond_signal(&endCondition); // Lets the master thread re-check waitCount.
+    pthread_cond_wait(&startCondition, &lock); // NOTE(review): no predicate loop, so a spurious wakeup (which POSIX permits) would release this thread early -- consider a generation counter.
+    pthread_mutex_unlock(&lock);
+}
+
+void ThreadPool::waitForThreads() { // Master side of the rendezvous: block until every worker has called syncThreads().
+    pthread_mutex_lock(&lock);
+    while (waitCount < numThreads) // Re-checked after every wakeup, so early or spurious wakeups are harmless here.
+        pthread_cond_wait(&endCondition, &lock);
+    pthread_mutex_unlock(&lock);
+}
+
+void ThreadPool::resumeThreads() { // Releases every worker currently blocked in syncThreads().
+    pthread_mutex_lock(&lock);
+    waitCount = 0; // Start a fresh arrival count for the next waitForThreads().
+    pthread_cond_broadcast(&startCondition);
+    pthread_mutex_unlock(&lock);
+}
+
+} // namespace OpenMM
--- a/platforms/cpu/include/CpuKernels.h
+++ b/platforms/cpu/include/CpuKernels.h
@@ -37,6 +37,7 @@
 #include "CpuNonbondedForce.h"
 #include "openmm/kernels.h"
 #include "openmm/System.h"
+#include "openmm/internal/ThreadPool.h"

 namespace OpenMM {

@@ -90,6 +91,7 @@ private:
    NonbondedMethod nonbondedMethod;
    CpuNeighborList neighborList;
    CpuNonbondedForce nonbonded;
+    ThreadPool threads;
    Kernel optimizedPme;
 };


--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
 #ifndef OPENMM_CPU_NEIGHBORLIST_H_
 #define OPENMM_CPU_NEIGHBORLIST_H_

+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
 #include "windowsExportCpu.h"
-#include <pthread.h>
+#include "openmm/internal/ThreadPool.h"
 #include <set>
 #include <utility>
 #include <vector>
@@ -11,13 +42,12 @@ namespace OpenMM {
    
 class OPENMM_EXPORT_CPU CpuNeighborList {
 public:
-    class ThreadData;
+    class ThreadTask;
    class Voxels;
    static const int BlockSize;
    CpuNeighborList();
-    ~CpuNeighborList();
    void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
-            const float* periodicBoxSize, bool usePeriodic, float maxDistance);
+            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
    int getNumBlocks() const;
    const std::vector<int>& getSortedAtoms() const;
    const std::vector<int>& getBlockNeighbors(int blockIndex) const;
@@ -25,25 +55,12 @@ public:
    /**
     * This routine contains the code executed by each thread.
     */
+    void threadComputeNeighborList(ThreadPool& threads, int threadIndex);
    void runThread(int index);
 private:
-    /**
-     * This is called by the worker threads to wait until the master thread instructs them to advance.
-     */
-    void threadWait();
-    /**
-     * This is called by the master thread to instruct all the worker threads to advance.
-     */
-    void advanceThreads();
-    bool isDeleted;
-    int numThreads, waitCount;
    std::vector<int> sortedAtoms;
    std::vector<std::vector<int> > blockNeighbors;
    std::vector<std::vector<char> > blockExclusions;
-    std::vector<pthread_t> thread;
-    std::vector<ThreadData*> threadData;
-    pthread_cond_t startCondition, endCondition;
-    pthread_mutex_t lock;
    // The following variables are used to make information accessible to the individual threads.
    float minx, maxx, miny, maxy, minz, maxz;
    std::vector<std::pair<int, int> > atomBins;
@@ -58,4 +75,4 @@ private:

 } // namespace OpenMM

-#endif // OPENMM_REFERENCE_NEIGHBORLIST_H_
+#endif // OPENMM_CPU_NEIGHBORLIST_H_
--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -27,8 +27,8 @@

 #include "CpuNeighborList.h"
 #include "ReferencePairIxn.h"
+#include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
-#include <pthread.h>
 #include <set>
 #include <utility>
 #include <vector>
@@ -38,7 +38,7 @@ namespace OpenMM {

 class CpuNonbondedForce {
    public:
-        class ThreadData;
+        class ComputeDirectTask;

      /**---------------------------------------------------------------------------------------
      
@@ -48,14 +48,6 @@ class CpuNonbondedForce {

       CpuNonbondedForce();

-      /**---------------------------------------------------------------------------------------
-      
-         Destructor
-      
-         --------------------------------------------------------------------------------------- */
-
-       ~CpuNonbondedForce();
-
      /**---------------------------------------------------------------------------------------
      
         Set the force to use a cutoff.
@@ -130,7 +122,7 @@ class CpuNonbondedForce {
            
         --------------------------------------------------------------------------------------- */
          
-      void calculateReciprocalIxn(int numberOfAtoms, float* posq, std::vector<RealVec>& atomCoordinates,
+      void calculateReciprocalIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates,
                            const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
                            std::vector<RealVec>& forces, float* totalEnergy) const;
      
@@ -140,21 +132,23 @@ class CpuNonbondedForce {
      
         @param numberOfAtoms    number of atoms
         @param posq             atom coordinates and charges
+         @param atomCoordinates  atom coordinates (periodic boundary conditions not applied)
         @param atomParameters   atom parameters (sigma/2, 2*sqrt(epsilon))
         @param exclusions       atom exclusion indices
                                 exclusions[atomIndex] contains the list of exclusions for that atom
         @param forces           force array (forces added)
         @param totalEnergy      total energy
+         @param threads          the thread pool to use
      
         --------------------------------------------------------------------------------------- */
          
-      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<std::pair<float, float> >& atomParameters,
-            const std::vector<std::set<int> >& exclusions, float* forces, float* totalEnergy);
+      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
+            const std::vector<std::set<int> >& exclusions, float* forces, float* totalEnergy, ThreadPool& threads);

    /**
     * This routine contains the code executed by each thread.
     */
-    void runThread(int index, std::vector<float>& threadForce, double& threadEnergy);
+    void threadComputeDirect(ThreadPool& threads, int threadIndex);

 private:
        bool cutoff;
@@ -171,15 +165,12 @@ private:
        int meshDim[3];
        std::vector<float> ewaldScaleTable;
        float ewaldDX, ewaldDXInv;
-        bool isDeleted;
-        int numThreads, waitCount;
-        std::vector<pthread_t> thread;
-        std::vector<ThreadData*> threadData;
-        pthread_cond_t startCondition, endCondition;
-        pthread_mutex_t lock;
+        std::vector<std::vector<float> > threadForce;
+        std::vector<double> threadEnergy;
        // The following variables are used to make information accessible to the individual threads.
        int numberOfAtoms;
        float* posq;
+        RealVec const* atomCoordinates;
        std::pair<float, float> const* atomParameters;        
        std::set<int> const* exclusions;
        bool includeEnergy;
@@ -230,6 +221,12 @@ private:
       */
      void getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;

+      /**
+       * Compute the displacement and squared distance between a collection of points, optionally using
+       * periodic boundary conditions.
+       */
+      void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+
      /**
       * Compute a fast approximation to erfc(x).
       */

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -203,11 +203,11 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
    
    // Convert the positions to single precision.
    
-    if (periodic)
+    if (periodic || ewald || pme)
        for (int i = 0; i < numParticles; i++)
            for (int j = 0; j < 3; j++) {
                RealOpenMM x = posData[i][j];
-                double base = floor(x/boxSize[j]+0.5)*boxSize[j];
+                double base = floor(x/boxSize[j])*boxSize[j];
                posq[4*i+j] = (float) (x-base);
            }
    else
@@ -244,6 +244,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo

            int numMoved = moved.size();
            double cutoff2 = nonbondedCutoff*nonbondedCutoff;
+            double paddedCutoff2 = (nonbondedCutoff+padding)*(nonbondedCutoff+padding);
            for (int i = 1; i < numMoved && !needRecompute; i++)
                for (int j = 0; j < i; j++) {
                    RealVec delta = posData[moved[i]]-posData[moved[j]];
@@ -251,7 +252,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
                        // These particles should interact.  See if they are in the neighbor list.
                        
                        RealVec oldDelta = lastPositions[moved[i]]-lastPositions[moved[j]];
-                        if (oldDelta.dot(oldDelta) > cutoff2) {
+                        if (oldDelta.dot(oldDelta) > paddedCutoff2) {
                            needRecompute = true;
                            break;
                        }
@@ -259,7 +260,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
                }
        }
        if (needRecompute) {
-            neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding);
+            neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding, threads);
            lastPositions = posData;
        }
        nonbonded.setUseCutoff(nonbondedCutoff, neighborList, rfDielectric);
@@ -278,7 +279,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
        nonbonded.setUseSwitchingFunction(switchingDistance);
    float nonbondedEnergy = 0;
    if (includeDirect)
-        nonbonded.calculateDirectIxn(numParticles, &posq[0], particleParams, exclusions, &forces[0], includeEnergy ? &nonbondedEnergy : NULL);
+        nonbonded.calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, &forces[0], includeEnergy ? &nonbondedEnergy : NULL, threads);
    if (includeReciprocal) {
        if (useOptimizedPme) {
            PmeIO io(&posq[0], &forces[0], numParticles);

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
 #include "CpuNeighborList.h"
 #include "openmm/internal/hardware.h"
 #include "openmm/internal/vectorize.h"
@@ -23,8 +54,6 @@ public:
    int y;
 };

-typedef pair<const float*, int> VoxelItem;
-
 /**
 * This data structure organizes the particles spatially.  It divides them into bins along the x and y axes,
 * then sorts each bin along the z axis so ranges can be identified quickly with a binary search.
@@ -60,7 +89,7 @@ public:
     */
    void insert(const int& atom, const float* location) {
        VoxelIndex voxelIndex = getVoxelIndex(location);
-        bins[voxelIndex.x][voxelIndex.y].push_back(make_pair(location[2], VoxelItem(location, atom)));
+        bins[voxelIndex.x][voxelIndex.y].push_back(make_pair(location[2], atom));
    }
    
    /**
@@ -76,7 +105,7 @@ public:
     * Find the index of the first particle in voxel (x,y) whose z coordinate in >= the specified value.
     */
    int findLowerBound(int x, int y, double z) const {
-        const vector<pair<float, VoxelItem> >& bin = bins[x][y];
+        const vector<pair<float, int> >& bin = bins[x][y];
        int lower = 0;
        int upper = bin.size();
        while (lower < upper) {
@@ -93,7 +122,7 @@ public:
     * Find the index of the first particle in voxel (x,y) whose z coordinate in greater than the specified value.
     */
    int findUpperBound(int x, int y, double z) const {
-        const vector<pair<float, VoxelItem> >& bin = bins[x][y];
+        const vector<pair<float, int> >& bin = bins[x][y];
        int lower = 0;
        int upper = bin.size();
        while (lower < upper) {
@@ -148,14 +177,12 @@ public:
        if (usePeriodic) {
            endx = min(endx, centerVoxelIndex.x-dIndexX+nx-1);
            endy = min(endy, centerVoxelIndex.y-dIndexY+ny-1);
-            numRanges = 2;
        }
        else {
            startx = max(startx, 0);
            starty = max(starty, 0);
            endx = min(endx, nx-1);
            endy = min(endy, ny-1);
-            numRanges = 1;
        }
        int lastSortedIndex = BlockSize*(blockIndex+1);
        VoxelIndex voxelIndex(0, 0);
@@ -175,10 +202,12 @@ public:
                
                float dz = maxDistance+blockWidth[2];
                dz = sqrtf(max(0.0f, dz*dz-dx*dx-dy*dy));
+                bool needPeriodic = (voxelIndex.x != x || voxelIndex.y != y || centerPos[2]-dz < 0.0f || centerPos[2]+dz > periodicBoxSize[2]);
                int rangeStart[2];
                int rangeEnd[2];
                rangeStart[0] = findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz);
-                if (usePeriodic) {
+                if (needPeriodic) {
+                    numRanges = 2;
                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
                    if (rangeStart[0] > 0) {
                        rangeStart[1] = 0;
@@ -189,22 +218,24 @@ public:
                        rangeEnd[1] = bins[voxelIndex.x][voxelIndex.y].size();
                    }
                }
-                else
+                else {
+                    numRanges = 1;
                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
+                }
                
                // Loop over atoms and check to see if they are neighbors of this block.
                
                for (int range = 0; range < numRanges; range++) {
                    for (int item = rangeStart[range]; item < rangeEnd[range]; item++) {
-                        const int sortedIndex = bins[voxelIndex.x][voxelIndex.y][item].second.second;
+                        const int sortedIndex = bins[voxelIndex.x][voxelIndex.y][item].second;

                        // Avoid duplicate entries.
                        if (sortedIndex >= lastSortedIndex)
                            continue;
                        
-                        fvec4 atomPos(bins[voxelIndex.x][voxelIndex.y][item].second.first);
+                        fvec4 atomPos(atomLocations+4*sortedAtoms[sortedIndex]);
                        fvec4 delta = atomPos-centerPos;
-                        if (usePeriodic) {
+                        if (needPeriodic) {
                            fvec4 base = round(delta*invBoxSize)*boxSize;
                            delta = delta-base;
                        }
@@ -221,7 +252,7 @@ public:
                            for (int k = 0; k < (int) blockAtoms.size(); k++) {
                                fvec4 pos1(&atomLocations[4*blockAtoms[k]]);
                                delta = atomPos-pos1;
-                                if (usePeriodic) {
+                                if (needPeriodic) {
                                    fvec4 base = round(delta*invBoxSize)*boxSize;
                                    delta = delta-base;
                                }
@@ -254,57 +285,24 @@ private:
    int nx, ny;
    const float* periodicBoxSize;
    const bool usePeriodic;
-    vector<vector<vector<pair<float, VoxelItem> > > > bins;
+    vector<vector<vector<pair<float, int> > > > bins;
 };

-class CpuNeighborList::ThreadData {
+class CpuNeighborList::ThreadTask : public ThreadPool::Task { // Task object passed to ThreadPool::execute(); it routes the work back into the owning CpuNeighborList.
 public:
-    ThreadData(int index, CpuNeighborList& owner) : index(index), owner(owner) {
+    ThreadTask(CpuNeighborList& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) { // Forwards the pool and the given thread index to the neighbor list's per-thread routine.
+        owner.threadComputeNeighborList(threads, threadIndex);
     }
-    int index;
     CpuNeighborList& owner; // Held by reference (not owned), so the task must not outlive the CpuNeighborList.
 };

-static void* threadBody(void* args) {
-    CpuNeighborList::ThreadData& data = *reinterpret_cast<CpuNeighborList::ThreadData*>(args);
-    data.owner.runThread(data.index);
-    delete &data;
-    return 0;
-}
-
 CpuNeighborList::CpuNeighborList() {
-    isDeleted = false;
-    numThreads = getNumProcessors();
-    pthread_cond_init(&startCondition, NULL);
-    pthread_cond_init(&endCondition, NULL);
-    pthread_mutex_init(&lock, NULL);
-    thread.resize(numThreads);
-    pthread_mutex_lock(&lock);
-    waitCount = 0;
-    for (int i = 0; i < numThreads; i++) {
-        ThreadData* data = new ThreadData(i, *this);
-        threadData.push_back(data);
-        pthread_create(&thread[i], NULL, threadBody, data);
-    }
-    while (waitCount < numThreads)
-        pthread_cond_wait(&endCondition, &lock);
-    pthread_mutex_unlock(&lock);
-}
-
-CpuNeighborList::~CpuNeighborList() {
-    isDeleted = true;
-    pthread_mutex_lock(&lock);
-    pthread_cond_broadcast(&startCondition);
-    pthread_mutex_unlock(&lock);
-    for (int i = 0; i < (int) thread.size(); i++)
-        pthread_join(thread[i], NULL);
-    pthread_mutex_destroy(&lock);
-    pthread_cond_destroy(&startCondition);
-    pthread_cond_destroy(&endCondition);
 }

 void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions,
-            const float* periodicBoxSize, bool usePeriodic, float maxDistance) {
+            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
    int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
    blockNeighbors.resize(numBlocks);
    blockExclusions.resize(numBlocks);
@@ -338,8 +336,9 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
    // Sort the atoms based on a Hilbert curve.
    
    atomBins.resize(numAtoms);
-    pthread_mutex_lock(&lock);
-    advanceThreads();
+    ThreadTask task(*this);
+    threads.execute(task);
+    threads.waitForThreads();
    sort(atomBins.begin(), atomBins.end());

    // Build the voxel hash.
@@ -362,8 +361,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
    
    // Signal the threads to start running and wait for them to finish.
    
-    advanceThreads();
-    pthread_mutex_unlock(&lock);
+    threads.resumeThreads();
+    threads.waitForThreads();
    
    // Add padding atoms to fill up the last block.
    
@@ -395,82 +394,55 @@ const std::vector<char>& CpuNeighborList::getBlockExclusions(int blockIndex) con
    
 }

-void CpuNeighborList::runThread(int index) {
-    while (true) {
-        // Wait for the signal to start running.
-        
-        threadWait();
-        if (isDeleted)
-            break;
-        
-        // Compute the positions of atoms along the Hilbert curve.
+void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadIndex) { // Per-thread part of the neighbor list build, invoked through ThreadTask::execute().
+    // Phase 1: compute the position of each atom along the Hilbert curve. Atoms are split across threads with a stride of numThreads, starting at threadIndex.

-        float binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0f;
-        float invBinWidth = 1.0f/binWidth;
-        bitmask_t coords[3];
-        for (int i = index; i < numAtoms; i += numThreads) {
-            const float* pos = &atomLocations[4*i];
-            coords[0] = (bitmask_t) ((pos[0]-minx)*invBinWidth);
-            coords[1] = (bitmask_t) ((pos[1]-miny)*invBinWidth);
-            coords[2] = (bitmask_t) ((pos[2]-minz)*invBinWidth);
-            int bin = (int) hilbert_c2i(3, 8, coords);
-            atomBins[i] = pair<int, int>(bin, i);
-        }
-        threadWait();
-        
-        // Compute this thread's subset of neighbors.
-        
-        int numBlocks = blockNeighbors.size();
-        vector<int> blockAtoms;
-        for (int i = index; i < numBlocks; i += numThreads) {
-            {
-            int firstIndex = BlockSize*i;
-            int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
-            blockAtoms.resize(atomsInBlock);
-            for (int j = 0; j < atomsInBlock; j++)
-                blockAtoms[j] = sortedAtoms[firstIndex+j];
-            }
+    float binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0f; // Scale so the longest box edge maps to coordinates 0..255 (presumably matching the 8 passed to hilbert_c2i below).
+    float invBinWidth = 1.0f/binWidth;
+    bitmask_t coords[3];
+    int numThreads = threads.getNumThreads();
+    for (int i = threadIndex; i < numAtoms; i += numThreads) {
+        const float* pos = &atomLocations[4*i]; // Positions are stored with a stride of 4 floats per atom.
+        coords[0] = (bitmask_t) ((pos[0]-minx)*invBinWidth);
+        coords[1] = (bitmask_t) ((pos[1]-miny)*invBinWidth);
+        coords[2] = (bitmask_t) ((pos[2]-minz)*invBinWidth);
+        int bin = (int) hilbert_c2i(3, 8, coords);
+        atomBins[i] = pair<int, int>(bin, i); // (curve position, atom index), so sorting atomBins orders atoms along the curve.
+    }
+    threads.syncThreads(); // NOTE(review): presumably blocks until computeNeighborList() has sorted atomBins, built the voxel hash and called threads.resumeThreads() - confirm against ThreadPool.

-                        
-            int firstIndex = BlockSize*i;
-            fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
-            fvec4 maxPos = minPos;
-            int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
-            for (int j = 1; j < atomsInBlock; j++) {
-                fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]);
-                minPos = min(minPos, pos);
-                maxPos = max(maxPos, pos);
-            }
-            voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations);
-            
-            // Record the exclusions for this block.
-            
-            for (int j = 0; j < atomsInBlock; j++) {
-                const set<int>& atomExclusions = (*exclusions)[sortedAtoms[firstIndex+j]];
-                char mask = 1<<j;
-                for (int k = 0; k < (int) blockNeighbors[i].size(); k++) {
-                    int atomIndex = blockNeighbors[i][k];
-                    if (atomExclusions.find(atomIndex) != atomExclusions.end())
-                        blockExclusions[i][k] |= mask;
-                }
-            }
+    // Phase 2: compute this thread's subset of neighbors. Blocks are split across threads with the same stride as the atoms above.
+
+    int numBlocks = blockNeighbors.size();
+    vector<int> blockAtoms; // Declared outside the loop so the same vector is reused for every block.
+    for (int i = threadIndex; i < numBlocks; i += numThreads) {
+        // Find the atoms in this block and compute their bounding box (handed to getNeighbors() below as its center and half-widths).
+        
+        int firstIndex = BlockSize*i;
+        int atomsInBlock = min(BlockSize, numAtoms-firstIndex); // The last block may contain fewer than BlockSize atoms.
+        blockAtoms.resize(atomsInBlock);
+        for (int j = 0; j < atomsInBlock; j++)
+            blockAtoms[j] = sortedAtoms[firstIndex+j];
+        fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
+        fvec4 maxPos = minPos;
+        for (int j = 1; j < atomsInBlock; j++) {
+            fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]);
+            minPos = min(minPos, pos);
+            maxPos = max(maxPos, pos);
         }
-    }
-}
+        voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations);

-void CpuNeighborList::threadWait() {
-    pthread_mutex_lock(&lock);
-    waitCount++;
-    pthread_cond_signal(&endCondition);
-    pthread_cond_wait(&startCondition, &lock);
-    pthread_mutex_unlock(&lock);
-}
+        // Record the exclusions for this block: one flag byte per neighbor, one bit per atom of the block.

-void CpuNeighborList::advanceThreads() {
-    waitCount = 0;
-    pthread_cond_broadcast(&startCondition);
-    while (waitCount < numThreads) {
-        pthread_cond_wait(&endCondition, &lock);
+        for (int j = 0; j < atomsInBlock; j++) {
+            const set<int>& atomExclusions = (*exclusions)[sortedAtoms[firstIndex+j]];
+            char mask = 1<<j; // Bit j marks an exclusion with the j-th atom of this block.
+            for (int k = 0; k < (int) blockNeighbors[i].size(); k++) {
+                int atomIndex = blockNeighbors[i][k];
+                if (atomExclusions.find(atomIndex) != atomExclusions.end())
+                    blockExclusions[i][k] |= mask;
+            }
+        }
     }
 }


--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -34,6 +34,7 @@
 */

 #include "openmm/internal/AssertionUtilities.h"
+#include "openmm/internal/ThreadPool.h"
 #include "CpuNeighborList.h"
 #include "CpuPlatform.h"
 #include "sfmt/SFMT.h"
@@ -63,8 +64,9 @@ void testNeighborList(bool periodic) {
            exclusions[i-j].insert(i);
        }
    }
+    ThreadPool threads;
    CpuNeighborList neighborList;
-    neighborList.computeNeighborList(numParticles, positions, exclusions, boxSize, periodic, cutoff);
+    neighborList.computeNeighborList(numParticles, positions, exclusions, boxSize, periodic, cutoff, threads);
    
    // Convert the neighbor list to a set for faster lookup.
    

--- a/platforms/cpu/tests/TestCpuNonbondedForce.cpp
+++ b/platforms/cpu/tests/TestCpuNonbondedForce.cpp
@@ -397,44 +397,44 @@ void testLargeSystem() {
    system.addForce(bonds);
    VerletIntegrator integrator1(0.01);
    VerletIntegrator integrator2(0.01);
-    Context cuContext(system, integrator1, platform);
+    Context cpuContext(system, integrator1, platform);
    Context referenceContext(system, integrator2, reference);
-    cuContext.setPositions(positions);
-    cuContext.setVelocities(velocities);
+    cpuContext.setPositions(positions);
+    cpuContext.setVelocities(velocities);
    referenceContext.setPositions(positions);
    referenceContext.setVelocities(velocities);
-    State cuState = cuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
+    State cpuState = cpuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
    State referenceState = referenceContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
    for (int i = 0; i < numParticles; i++) {
-        ASSERT_EQUAL_VEC(cuState.getPositions()[i], referenceState.getPositions()[i], tol);
-        ASSERT_EQUAL_VEC(cuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
-        ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getPositions()[i], referenceState.getPositions()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
    }
-    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
+    ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);

    // Now do the same thing with periodic boundary conditions.

    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
-    cuContext.reinitialize();
+    cpuContext.reinitialize();
    referenceContext.reinitialize();
-    cuContext.setPositions(positions);
-    cuContext.setVelocities(velocities);
+    cpuContext.setPositions(positions);
+    cpuContext.setVelocities(velocities);
    referenceContext.setPositions(positions);
    referenceContext.setVelocities(velocities);
-    cuState = cuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
+    cpuState = cpuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
    referenceState = referenceContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
    for (int i = 0; i < numParticles; i++) {
-        double dx = cuState.getPositions()[i][0]-referenceState.getPositions()[i][0];
-        double dy = cuState.getPositions()[i][1]-referenceState.getPositions()[i][1];
-        double dz = cuState.getPositions()[i][2]-referenceState.getPositions()[i][2];
-        ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][0]-referenceState.getPositions()[i][0], boxSize), 0, tol);
-        ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][1]-referenceState.getPositions()[i][1], boxSize), 0, tol);
-        ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][2]-referenceState.getPositions()[i][2], boxSize), 0, tol);
-        ASSERT_EQUAL_VEC(cuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
-        ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
+        double dx = cpuState.getPositions()[i][0]-referenceState.getPositions()[i][0];
+        double dy = cpuState.getPositions()[i][1]-referenceState.getPositions()[i][1];
+        double dz = cpuState.getPositions()[i][2]-referenceState.getPositions()[i][2];
+        ASSERT_EQUAL_TOL(fmod(cpuState.getPositions()[i][0]-referenceState.getPositions()[i][0], boxSize), 0, tol);
+        ASSERT_EQUAL_TOL(fmod(cpuState.getPositions()[i][1]-referenceState.getPositions()[i][1], boxSize), 0, tol);
+        ASSERT_EQUAL_TOL(fmod(cpuState.getPositions()[i][2]-referenceState.getPositions()[i][2], boxSize), 0, tol);
+        ASSERT_EQUAL_VEC(cpuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
    }
-    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
+    ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
 }

 void testDispersionCorrection() {
@@ -542,15 +542,15 @@ void testChangingParameters() {
    
    VerletIntegrator integrator1(0.01);
    VerletIntegrator integrator2(0.01);
-    Context cuContext(system, integrator1, platform);
+    Context cpuContext(system, integrator1, platform);
    Context referenceContext(system, integrator2, reference);
-    cuContext.setPositions(positions);
+    cpuContext.setPositions(positions);
    referenceContext.setPositions(positions);
-    State cuState = cuContext.getState(State::Forces | State::Energy);
+    State cpuState = cpuContext.getState(State::Forces | State::Energy);
    State referenceState = referenceContext.getState(State::Forces | State::Energy);
    for (int i = 0; i < numParticles; i++)
-        ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
-    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
+        ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
+    ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
    
    // Now modify parameters and see if they still agree.

@@ -559,13 +559,13 @@ void testChangingParameters() {
        nonbonded->getParticleParameters(i, charge, sigma, epsilon);
        nonbonded->setParticleParameters(i, 1.5*charge, 1.1*sigma, 1.7*epsilon);
    }
-    nonbonded->updateParametersInContext(cuContext);
+    nonbonded->updateParametersInContext(cpuContext);
    nonbonded->updateParametersInContext(referenceContext);
-    cuState = cuContext.getState(State::Forces | State::Energy);
+    cpuState = cpuContext.getState(State::Forces | State::Energy);
    referenceState = referenceContext.getState(State::Forces | State::Energy);
    for (int i = 0; i < numParticles; i++)
-        ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
-    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
+        ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
+    ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
 }

 void testSwitchingFunction(NonbondedForce::NonbondedMethod method) {

--- a/platforms/reference/include/ReferencePME.h
+++ b/platforms/reference/include/ReferencePME.h
@@ -75,9 +75,9 @@ pme_init(pme_t *       ppme,
 */
 int
 pme_exec(pme_t       pme,
-         std::vector<OpenMM::RealVec>& atomCoordinates,
+         const std::vector<OpenMM::RealVec>& atomCoordinates,
         std::vector<OpenMM::RealVec>& forces,
-         std::vector<RealOpenMM>& charges,
+         const std::vector<RealOpenMM>& charges,
         const RealOpenMM  periodicBoxSize[3],
         RealOpenMM *    energy,
         RealOpenMM      pme_virial[3][3]);

--- a/platforms/reference/src/SimTKReference/ReferencePME.cpp
+++ b/platforms/reference/src/SimTKReference/ReferencePME.cpp
@@ -195,7 +195,7 @@ pme_calculate_bsplines_moduli(pme_t pme)

 static void
 pme_update_grid_index_and_fraction(pme_t    pme,
-                                   vector<RealVec>& atomCoordinates,
+                                   const vector<RealVec>& atomCoordinates,
                                   const RealOpenMM   periodicBoxSize[3])
 {
    int    i;
@@ -317,7 +317,7 @@ pme_update_bsplines(pme_t    pme)


 static void
-pme_grid_spread_charge(pme_t      pme, vector<RealOpenMM>& charges)
+pme_grid_spread_charge(pme_t pme, const vector<RealOpenMM>& charges)
 {
    int       order;
    int       i;
@@ -519,10 +519,10 @@ pme_reciprocal_convolution(pme_t     pme,


 static void
-pme_grid_interpolate_force(pme_t      pme,
-                           const RealOpenMM     periodicBoxSize[3],
-                           vector<RealOpenMM>& charges,
-                           vector<RealVec>&   forces)
+pme_grid_interpolate_force(pme_t pme,
+                           const RealOpenMM periodicBoxSize[3],
+                           const vector<RealOpenMM>& charges,
+                           vector<RealVec>& forces)
 {
    int       i;
    int       ix,iy,iz;
@@ -666,12 +666,12 @@ pme_init(pme_t *       ppme,


 int pme_exec(pme_t       pme,
-             vector<RealVec>&   atomCoordinates,
-             vector<RealVec>&   forces,
-             vector<RealOpenMM>& charges,
-             const RealOpenMM      periodicBoxSize[3],
-             RealOpenMM *    energy,
-             RealOpenMM      pme_virial[3][3])
+             const vector<RealVec>& atomCoordinates,
+             vector<RealVec>& forces,
+             const vector<RealOpenMM>& charges,
+             const RealOpenMM periodicBoxSize[3],
+             RealOpenMM* energy,
+             RealOpenMM pme_virial[3][3])
 {
    /* Routine is called with coordinates in x, a box, and charges in q */


--- a/wrappers/python/simtk/openmm/app/gromacstopfile.py
+++ b/wrappers/python/simtk/openmm/app/gromacstopfile.py
@@ -40,7 +40,7 @@ import simtk.unit as unit
 import simtk.openmm as mm
 import math
 import os
-import distutils
+import distutils.spawn

 HBonds = ff.HBonds
 AllBonds = ff.AllBonds
@@ -358,7 +358,7 @@ class GromacsTopFile(object):
            raise ValueError('Unsupported function type in [ cmaptypes ] line: '+line);
        self._cmapTypes[tuple(fields[:5])] = fields

-    def __init__(self, file, unitCellDimensions=None, includeDir=None, defines={}):
+    def __init__(self, file, unitCellDimensions=None, includeDir=None, defines=None):
        """Load a top file.

        Parameters:
@@ -368,12 +368,18 @@ class GromacsTopFile(object):
           included from the top file. If not specified, we will attempt to locate a gromacs
           installation on your system. When gromacs is installed in /usr/local, this will resolve
           to  /usr/local/gromacs/share/gromacs/top
-         - defines (map={}) preprocessor definitions that should be predefined when parsing the file
+         - defines (dict={}) preprocessor definitions that should be predefined when parsing the file
         """
        if includeDir is None:
            includeDir = _defaultGromacsIncludeDir()
        self._includeDirs = (os.path.dirname(file), includeDir)
-        self._defines = defines
+        # Most of the gromacs water itp files for different forcefields,
+        # unless the preprocessor #define FLEXIBLE is given, don't define
+        # bonds between the water hydrogen and oxygens, but only give the
+        # constraint distances and exclusions.
+        self._defines = {'FLEXIBLE': True}
+        if defines is not None:
+            self._defines.update(defines)

        # Parse the file.


--- a/wrappers/python/src/swig_doxygen/doxygen/Doxyfile.in
+++ b/wrappers/python/src/swig_doxygen/doxygen/Doxyfile.in
@@ -616,6 +616,7 @@ EXCLUDE_SYMLINKS       = NO

 EXCLUDE_PATTERNS       = */tests/* \
                         */openmmapi/src/* \
+                         */internal/* \
                         */.svn/* \
                         *OpenMMFortranModule.f90 \
                         *OpenMMCWrapper.h

--- a/wrappers/python/src/swig_doxygen/swigInputBuilder.py
+++ b/wrappers/python/src/swig_doxygen/swigInputBuilder.py
-#!/bin/env python
+#!/usr/bin/env python
 #
 #
 

--- a/wrappers/python/src/swig_doxygen/swigInputConfig.py
+++ b/wrappers/python/src/swig_doxygen/swigInputConfig.py
@@ -14,20 +14,7 @@ DOC_STRINGS = {("Context", "setPositions") :
 # Do not generate wrappers for the following methods.
 # Indexed by (className, [methodName [, numParams]])
 SKIP_METHODS = [('State',),
-                ('Stream',),
                ('Vec3',),
-                ('AmoebaGeneralizedKirkwoodForceImpl',),
-                ('AmoebaAngleForceImpl',),
-                ('AmoebaBondForceImpl',),
-                ('AmoebaInPlaneAngleForceImpl',),
-                ('AmoebaMultipoleForceImpl',),
-                ('AmoebaOutOfPlaneBendForceImpl',),
-                ('AmoebaPiTorsionForceImpl',),
-                ('AmoebaStretchBendForceImpl',),
-                ('AmoebaTorsionTorsionForceImpl',),
-                ('AmoebaVdwForceImpl',),
-                ('AmoebaWcaDispersionForceImpl',),
-                ('AndersenThermostatImpl',),
                ('AngleInfo',),
                ('ApplyAndersenThermostatKernel',),
                ('ApplyConstraintsKernel',),
@@ -63,30 +50,14 @@ SKIP_METHODS = [('State',),
                ('CalcNonbondedForceKernel',),
                ('CalcPeriodicTorsionForceKernel',),
                ('CalcRBTorsionForceKernel',),
-                ('CMAPTorsionForceImpl',),
-                ('CMMotionRemoverImpl',),
                ('ComputationInfo',),
                ('ConstraintInfo',),
-                ('ContextImpl',),
                ('CudaKernelFactory',),
                ('CudaStreamFactory',),
-                ('CustomAngleForceImpl',),
-                ('CustomBondForceImpl',),
-                ('CustomCompoundBondForceImpl',),
-                ('CustomExternalForceImpl',),
-                ('CustomGBForceImpl',),
-                ('CustomHbondForceImpl',),
-                ('CustomNonbondedForceImpl',),
-                ('CustomTorsionForceImpl',),
                ('ExceptionInfo',),
                ('ExclusionInfo',),
-                ('ForceImpl',),
                ('FunctionInfo',),
-                ('GBSAOBCForceImpl',),
-                ('GBVIForceImpl',),
                ('GlobalParameterInfo',),
-                ('HarmonicAngleForceImpl',),
-                ('HarmonicBondForceImpl',),
                ('InitializeForcesKernel',),
                ('IntegrateBrownianStepKernel',),
                ('IntegrateLangevinStepKernel',),
@@ -97,24 +68,18 @@ SKIP_METHODS = [('State',),
                ('Kernel',),
                ('KernelFactory',),
                ('KernelImpl',),
-                ('MonteCarloBarostatImpl',),
-                ('MonteCarloAnisotropicBarostatImpl',),
                ('MultipoleInfo',),
-                ('NonbondedForceImpl',),
                ('OutOfPlaneBendInfo',),
                ('ParameterInfo',),
                ('ParticleInfo',),
-                ('PeriodicTorsionForceImpl',),
                ('PeriodicTorsionInfo',),
                ('PerParticleParameterInfo',),
                ('PiTorsionInfo',),
                ('PlatformData',),
-                ('RBTorsionForceImpl',),
                ('RBTorsionInfo',),
                ('RemoveCMMotionKernel',),
                ('SplineFitter',),
                ('StreamFactory',),
-                ('StreamImpl',),
                ('StretchBendInfo',),
                ('TorsionInfo',),
                ('TorsionTorsionGridInfo',),
@@ -139,14 +104,11 @@ SKIP_METHODS = [('State',),
                ('Platform', 'registerKernelFactory'),
                ('IntegrateRPMDStepKernel',),
                ('RPMDIntegrator',  'getState'),
-                ('DrudeForceImpl',),
                ('CalcDrudeForceKernel',),
                ('IntegrateDrudeLangevinStepKernel',),
                ('IntegrateDrudeSCFStepKernel',),
                ('XmlSerializer',  'serialize'),
                ('XmlSerializer',  'deserialize'),
-                ('fvec4',),
-                ('ivec4',),
 ]

 # The build script assumes method args that are non-const references are
@@ -161,6 +123,8 @@ NO_OUTPUT_ARGS = [('LocalEnergyMinimizer', 'minimize', 'context'),
                  ('AmoebaMultipoleForce', 'addParticle', 'molecularDipole'),
                  ('AmoebaMultipoleForce', 'addParticle', 'molecularQuadrupole'),
                  ('AmoebaMultipoleForce', 'setCovalentMap', 'covalentAtoms'),
+                  ('AmoebaMultipoleForce', 'getElectrostaticPotential', 'context'),
+                  ('AmoebaMultipoleForce', 'getInducedDipoles', 'context'),
 ]

 # SWIG assumes the target language shadow class owns the C++ class
@@ -285,6 +249,7 @@ UNITS = {
 #("AmoebaMultipoleForce",                 "getElectrostaticPotential")                     :  ( None, ('unit.kilojoule_per_mole')),
 #("AmoebaMultipoleForce",                 "getElectrostaticPotential")                     :  ( ('unit.kilojoule_per_mole'), ()),
 ("AmoebaMultipoleForce",                 "getElectrostaticPotential")                     :  ( None, ()),
+("AmoebaMultipoleForce",                 "getInducedDipoles")                             :  ( None, ()),
 ("AmoebaMultipoleForce",                 "getSystemMultipoleMoments")                     :  ( None, ()),

 ("AmoebaOutOfPlaneBendForce",            "getNumOutOfPlaneBends")                         :  ( None, ()),

--- a/wrappers/python/src/swig_doxygen/swig_lib/python/typemaps.i
+++ b/wrappers/python/src/swig_doxygen/swig_lib/python/typemaps.i

 /* Convert python list of tuples to C++ std::vector of Vec3 objects */
-%typemap(in) std::vector<Vec3>& (std::vector<OpenMM::Vec3> vVec) {
+%typemap(in) const std::vector<Vec3>& (std::vector<OpenMM::Vec3> vVec) {
  // typemap -- %typemap(in) std::vector<Vec3>& (std::vector<OpenMM::Vec3> vVec)
  int i, pLength, itemLength;
  double x, y, z;
@@ -34,6 +34,32 @@
  $1 = &vVec;
 }

+/* The following two typemaps cause a non-const vector<Vec3>& to become a return value. */
+%typemap(in, numinputs=0) std::vector<Vec3>& (std::vector<Vec3> temp) {
+    $1 = &temp;
+}
+
+%typemap(argout) std::vector<Vec3>& {
+    // Convert the C++ vector filled in by the wrapped call into a Python list
+    // of simtk.openmm.Vec3 objects and make that list the return value.
+    int i, n;
+    PyObject *pyList;
+
+    n=(*$1).size(); 
+    pyList=PyList_New(n);
+    // PyImport_AddModule returns a borrowed reference, so mm must not be released.
+    PyObject* mm = PyImport_AddModule("simtk.openmm");
+    // PyObject_GetAttrString returns a NEW reference; it is released after the loop.
+    PyObject* vec3 = PyObject_GetAttrString(mm, "Vec3");
+    for (i=0; i<n; i++) {
+        OpenMM::Vec3& v = (*$1).at(i);
+        PyObject* args = Py_BuildValue("(d,d,d)", v[0], v[1], v[2]);
+        PyObject* pyVec = PyObject_CallObject(vec3, args);
+        Py_DECREF(args);
+        // PyList_SET_ITEM steals the reference to pyVec, so no DECREF is needed here.
+        PyList_SET_ITEM(pyList, i, pyVec);
+    }
+    // Release the Vec3 class reference; previously it leaked once per call.
+    Py_XDECREF(vec3);
+    $result = pyList;
+}
+
+/* const vector<Vec3> should NOT become an output. */
+%typemap(argout) const std::vector<Vec3>& {
+}

 /* Convert python tuple to C++ Vec3 object*/
 %typemap(typecheck) Vec3 {