Merged changes from main branch

588a4c9d · peastman · bbf3de23 · ff6af025 · 588a4c9d · 588a4c9d
Commit 588a4c9d authored Dec 10, 2013 by peastman
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -105,16 +105,20 @@ ELSE( CMAKE_SIZEOF_VOID_P EQUAL 8 )
  SET( LIB64  )
 ENDIF( CMAKE_SIZEOF_VOID_P EQUAL 8 )

-# Build universal binaries compatible with OS X 10.7
-IF (APPLE AND NOT CMAKE_OSX_DEPLOYMENT_TARGET)
-    SET (CMAKE_OSX_ARCHITECTURES "i386;x86_64" CACHE STRING "The processor architectures to build for" FORCE)
-    SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.7" CACHE STRING "The minimum version of OS X to support" FORCE)
-ENDIF (APPLE AND NOT CMAKE_OSX_DEPLOYMENT_TARGET)
-
-# Improve the linking behavior of Mac libraries
 IF (APPLE)
+    # Build universal binaries compatible with OS X 10.7
+    IF (NOT CMAKE_OSX_DEPLOYMENT_TARGET)
+        SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.7" CACHE STRING "The minimum version of OS X to support" FORCE)
+    ENDIF (NOT CMAKE_OSX_DEPLOYMENT_TARGET)
+    IF (NOT CMAKE_OSX_ARCHITECTURES)
+        SET (CMAKE_OSX_ARCHITECTURES "i386;x86_64" CACHE STRING "The processor architectures to build for" FORCE)
+    ENDIF (NOT CMAKE_OSX_ARCHITECTURES)
+
+    # Improve the linking behavior of Mac libraries
    SET (CMAKE_INSTALL_NAME_DIR "@rpath")
-    SET_PROPERTY(GLOBAL PROPERTY COMPILE_FLAGS "-stdlib=libc++ -mmacosx-version-min=10.7")
+    SET(EXTRA_COMPILE_FLAGS "-stdlib=libc++")
+ELSE (APPLE)
+    SET(EXTRA_COMPILE_FLAGS)
 ENDIF (APPLE)

 IF(UNIX AND NOT CMAKE_BUILD_TYPE)
@@ -266,7 +270,7 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
 INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)

 ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY -DOPENMM_VALIDATE_BUILDING_SHARED_LIBRARY")
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY -DOPENMM_VALIDATE_BUILDING_SHARED_LIBRARY")
 IF(WIN32)
    ADD_DEPENDENCIES(${SHARED_TARGET} PthreadsLibraries)
 ENDIF(WIN32)
@@ -274,7 +278,7 @@ ENDIF(WIN32)
 SET(OPENMM_BUILD_STATIC_LIB OFF CACHE BOOL "Whether to build static OpenMM libraries")
 IF(OPENMM_BUILD_STATIC_LIB)
    ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
-    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY -DOPENMMM_VALIDATE_BUILDING_STATIC_LIBRARY -DOPENMM_VALIDATE_BUILDING_STATIC_LIBRARY")
+    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY -DOPENMMM_VALIDATE_BUILDING_STATIC_LIBRARY -DOPENMM_VALIDATE_BUILDING_STATIC_LIBRARY")
 ENDIF(OPENMM_BUILD_STATIC_LIB)

 IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -26,14 +26,16 @@ SET(BUILD_TESTING_STATIC OFF)
 IF(OPENMM_BUILD_STATIC_LIB)
    SET(BUILD_TESTING_STATIC ON)
 ENDIF(OPENMM_BUILD_STATIC_LIB)
-		
+
 FOREACH(EX_ROOT ${CPP_EXAMPLES})
    IF (BUILD_TESTING_SHARED)
        # Link with shared library
        ADD_EXECUTABLE(${EX_ROOT} ${EX_ROOT}.cpp)
        SET_TARGET_PROPERTIES(${EX_ROOT}
-		PROPERTIES
-	      PROJECT_LABEL "Example - ${EX_ROOT}")
+            PROPERTIES
+            PROJECT_LABEL "Example - ${EX_ROOT}"
+            LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
+            COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
        TARGET_LINK_LIBRARIES(${EX_ROOT} ${SHARED_TARGET})
    ENDIF (BUILD_TESTING_SHARED)

@@ -42,9 +44,10 @@ FOREACH(EX_ROOT ${CPP_EXAMPLES})
        SET(EX_STATIC ${EX_ROOT}Static)
        ADD_EXECUTABLE(${EX_STATIC} ${EX_ROOT}.cpp)
        SET_TARGET_PROPERTIES(${EX_STATIC}
-		PROPERTIES
-		COMPILE_FLAGS "-DOPENMM_USE_STATIC_LIBRARIES"
-		PROJECT_LABEL "Example - ${EX_STATIC}")
+            PROPERTIES
+            PROJECT_LABEL "Example - ${EX_STATIC}"
+            LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
+            COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_USE_STATIC_LIBRARIES")
        TARGET_LINK_LIBRARIES(${EX_STATIC} ${STATIC_TARGET})
    ENDIF (BUILD_TESTING_STATIC)

@@ -63,8 +66,10 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
            # C++ libraries on the link line.
            ADD_EXECUTABLE(${EX_ROOT} ${EX_ROOT}.c Empty.cpp)
            SET_TARGET_PROPERTIES(${EX_ROOT}
-            PROPERTIES
-              PROJECT_LABEL "Example C - ${EX_ROOT}")
+                PROPERTIES
+                PROJECT_LABEL "Example C - ${EX_ROOT}"
+                LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
+                COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
            TARGET_LINK_LIBRARIES(${EX_ROOT} ${SHARED_TARGET})
            ADD_DEPENDENCIES(${EX_ROOT} ApiWrappers)
        ENDIF (BUILD_TESTING_SHARED)
@@ -76,9 +81,10 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
            # C++ libraries on the static link line.
            ADD_EXECUTABLE(${EX_STATIC} ${EX_ROOT}.c Empty.cpp)
            SET_TARGET_PROPERTIES(${EX_STATIC}
-            PROPERTIES
-            COMPILE_FLAGS "-DOPENMM_USE_STATIC_LIBRARIES"
-            PROJECT_LABEL "Example C - ${EX_STATIC}")
+                PROPERTIES
+                PROJECT_LABEL "Example C - ${EX_STATIC}"
+                LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
+                COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_USE_STATIC_LIBRARIES")
            TARGET_LINK_LIBRARIES(${EX_STATIC} ${STATIC_TARGET})
            ADD_DEPENDENCIES(${EX_STATIC} ApiWrappers)
        ENDIF (BUILD_TESTING_STATIC)

--- a/openmmapi/include/openmm/internal/vectorize.h
+++ b/openmmapi/include/openmm/internal/vectorize.h
@@ -91,6 +91,9 @@ public:
    fvec4 operator&(fvec4 other) const {
        return _mm_and_ps(val, other);
    }
+    fvec4 operator|(fvec4 other) const {
+        return _mm_or_ps(val, other);
+    }
    fvec4 operator==(fvec4 other) const {
        return _mm_cmpeq_ps(val, other);
    }
@@ -157,6 +160,9 @@ public:
    ivec4 operator&(ivec4 other) const {
        return _mm_and_si128(val, other);
    }
+    ivec4 operator|(ivec4 other) const {
+        return _mm_or_si128(val, other);
+    }
    ivec4 operator==(ivec4 other) const {
        return _mm_cmpeq_epi32(val, other);
    }
@@ -267,5 +273,11 @@ static inline fvec4 operator/(float v1, fvec4 v2) {
    return fvec4(v1)/v2;
 }

+// Operations for blending fvec4s based on an ivec4.
+
+static inline fvec4 blend(fvec4 v1, fvec4 v2, ivec4 mask) {
+    return fvec4(_mm_blendv_ps(v1.val, v2.val, _mm_castsi128_ps(mask.val)));
+}
+
 #endif /*OPENMM_VECTORIZE_H_*/

--- a/platforms/cpu/CMakeLists.txt
+++ b/platforms/cpu/CMakeLists.txt
@@ -14,10 +14,6 @@
 #   libOpenMMCPU_static[_d].a
 #----------------------------------------------------

-IF (APPLE)
-    SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.6")
-ENDIF (APPLE)
-
 SUBDIRS (tests)

 # The source is organized into subdirectories, but we handle them all from

--- a/platforms/cpu/include/AlignedArray.h
+++ b/platforms/cpu/include/AlignedArray.h
+#ifndef OPENMM_ALIGNEDARRAY_H_
+#define OPENMM_ALIGNEDARRAY_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+namespace OpenMM {
+
+/**
+ * This class represents an array in memory whose starting point is guaranteed to
+ * be aligned with a 16 byte boundary.  This can improve the performance of vectorized
+ * code, since loads and stores are more efficient.
+ */
+template <class T>
+class AlignedArray {
+public:
+    /**
+     * Default constructor, to allow AlignedArrays to be used inside collections.
+     */
+    AlignedArray() : dataSize(0), baseData(0), data(0) {
+    }
+    /**
+     * Create an Aligned array that contains a specified number of elements.
+     */
+    AlignedArray(int size) {
+        allocate(size);
+    }
+    ~AlignedArray() {
+        if (baseData != 0)
+            delete[] baseData;
+    }
+    /**
+     * Get the number of elements in the array.
+     */
+    int size() const {
+        return dataSize;
+    }
+    /**
+     * Change the size of the array.  This may cause all contents to be lost.
+     */
+    void resize(int size) {
+        if (dataSize == size)
+            return;
+        if (baseData != 0)
+            delete[] baseData;
+        allocate(size);
+    }
+    /**
+     * Get a reference to an element of the array.
+     */
+    T& operator[](int i) {
+        return data[i];
+    }
+    /**
+     * Get a const reference to an element of the array.
+     */
+    const T& operator[](int i) const {
+        return data[i];
+    }
+private:
+    void allocate(int size) {
+        dataSize = size;
+        baseData = new char[size*sizeof(T)+16];
+        char* offsetData = baseData+15;
+        offsetData -= (long long)offsetData&0xF;
+        data = (T*) offsetData;
+    }
+    int dataSize;
+    char* baseData;
+    T* data;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_ALIGNEDARRAY_H_*/
+
--- a/platforms/cpu/include/CpuGBSAOBCForce.h
+++ b/platforms/cpu/include/CpuGBSAOBCForce.h
@@ -25,6 +25,7 @@
 #ifndef OPENMM_CPU_GBSAOBC_FORCE_H__
 #define OPENMM_CPU_GBSAOBC_FORCE_H__

+#include "AlignedArray.h"
 #include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
 #include <set>
@@ -84,7 +85,7 @@ public:
     * @param totalEnergy      total energy
     * @param threads          the thread pool to use
     */
-    void computeForce(const std::vector<float>& posq, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
+    void computeForce(const AlignedArray<float>& posq, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);

    /**
     * This routine contains the code executed by each thread.
@@ -105,10 +106,13 @@ private:
    float logDX, logDXInv;
    // The following variables are used to make information accessible to the individual threads.
    float const* posq;
-    std::vector<std::vector<float> >* threadForce;
+    std::vector<AlignedArray<float> >* threadForce;
    bool includeEnergy;
+    void* atomicCounter;
  
    static const int NUM_TABLE_POINTS;
+    static const float TABLE_MIN;
+    static const float TABLE_MAX;

    /**
     * Compute the displacement and squared distance between a collection of points, optionally using

--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
@@ -32,6 +32,7 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

+#include "AlignedArray.h"
 #include "windowsExportCpu.h"
 #include "openmm/internal/ThreadPool.h"
 #include <set>
@@ -46,7 +47,7 @@ public:
    class Voxels;
    static const int BlockSize;
    CpuNeighborList();
-    void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
+    void computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
    int getNumBlocks() const;
    const std::vector<int>& getSortedAtoms() const;

--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -25,6 +25,7 @@
 #ifndef OPENMM_CPU_NONBONDED_FORCE_H__
 #define OPENMM_CPU_NONBONDED_FORCE_H__

+#include "AlignedArray.h"
 #include "CpuNeighborList.h"
 #include "ReferencePairIxn.h"
 #include "openmm/internal/ThreadPool.h"
@@ -143,7 +144,7 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */
          
      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
-            const std::vector<std::set<int> >& exclusions, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
+            const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);

    /**
     * This routine contains the code executed by each thread.
@@ -156,6 +157,7 @@ private:
        bool periodic;
        bool ewald;
        bool pme;
+        bool tableIsValid;
        const CpuNeighborList* neighborList;
        float periodicBoxSize[3];
        float cutoffDistance, switchingDistance;
@@ -172,8 +174,9 @@ private:
        RealVec const* atomCoordinates;
        std::pair<float, float> const* atomParameters;        
        std::set<int> const* exclusions;
-        std::vector<std::vector<float> >* threadForce;
+        std::vector<AlignedArray<float> >* threadForce;
        bool includeEnergy;
+        void* atomicCounter;

        static const float TWO_OVER_SQRT_PI;
        static const int NUM_TABLE_POINTS;

--- a/platforms/cpu/include/CpuPlatform.h
+++ b/platforms/cpu/include/CpuPlatform.h
@@ -32,6 +32,7 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

+#include "AlignedArray.h"
 #include "ReferencePlatform.h"
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/internal/ThreadPool.h"
@@ -69,8 +70,8 @@ private:
 class CpuPlatform::PlatformData {
 public:
    PlatformData(int numParticles);
-    std::vector<float> posq;
-    std::vector<std::vector<float> > threadForce;
+    AlignedArray<float> posq;
+    std::vector<AlignedArray<float> > threadForce;
    ThreadPool threads;
    bool isPeriodic;
 };

--- a/platforms/cpu/include/CpuSETTLE.h
+++ b/platforms/cpu/include/CpuSETTLE.h
+#ifndef OPENMM_CPUSETTLE_H_
+#define OPENMM_CPUSETTLE_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "ReferenceSETTLEAlgorithm.h"
+#include "openmm/System.h"
+#include "openmm/internal/ThreadPool.h"
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * This class uses multiple ReferenceSETTLEAlgorithm objects to execute the algorithm in parallel.
+ */
+class OPENMM_EXPORT CpuSETTLE : public ReferenceConstraintAlgorithm {
+public:
+    class ApplyToPositionsTask;
+    class ApplyToVelocitiesTask;
+    CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads);
+    ~CpuSETTLE();
+
+    /**
+     * Apply the constraint algorithm.
+     * 
+     * @param atomCoordinates  the original atom coordinates
+     * @param atomCoordinatesP the new atom coordinates
+     * @param inverseMasses    1/mass
+     * @param tolerance        the constraint tolerance
+     */
+    void apply(std::vector<OpenMM::RealVec>& atomCoordinates, std::vector<OpenMM::RealVec>& atomCoordinatesP, std::vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance);
+
+    /**
+     * Apply the constraint algorithm to velocities.
+     * 
+     * @param atomCoordinates  the atom coordinates
+     * @param atomCoordinatesP the velocities to modify
+     * @param inverseMasses    1/mass
+     * @param tolerance        the constraint tolerance
+     */
+    void applyToVelocities(std::vector<OpenMM::RealVec>& atomCoordinates, std::vector<OpenMM::RealVec>& velocities, std::vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance);
+private:
+    std::vector<ReferenceSETTLEAlgorithm*> threadSettle;
+    ThreadPool& threads;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CPUSETTLE_H_*/
--- a/platforms/cpu/sharedTarget/CMakeLists.txt
+++ b/platforms/cpu/sharedTarget/CMakeLists.txt
-GET_PROPERTY(COMPILE_FLAGS GLOBAL PROPERTY COMPILE_FLAGS)
-SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} -msse4.1")
+SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
 ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})

 IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
@@ -8,6 +7,6 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
 ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB})
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_CPU_BUILDING_SHARED_LIBRARY" LINK_FLAGS "${COMPILE_FLAGS}")
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CPU_BUILDING_SHARED_LIBRARY")

 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
@@ -24,14 +24,16 @@

 #include "CpuGBSAOBCForce.h"
 #include "SimTKOpenMMRealType.h"
-#include "openmm/internal/SplineFitter.h"
 #include "openmm/internal/vectorize.h"
+#include "gmx_atomic.h"
 #include <cmath>

 using namespace std;
 using namespace OpenMM;

-const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 1025;
+const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 4096;
+const float CpuGBSAOBCForce::TABLE_MIN = 0.25f;
+const float CpuGBSAOBCForce::TABLE_MAX = 1.5f;

 class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
 public:
@@ -44,22 +46,12 @@ public:
 };

 CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
-    logDX = 0.5/NUM_TABLE_POINTS;
+    logDX = (TABLE_MAX-TABLE_MIN)/NUM_TABLE_POINTS;
    logDXInv = 1.0f/logDX;
-    vector<double> x(NUM_TABLE_POINTS+1);
-    vector<double> y(NUM_TABLE_POINTS+1);
-    vector<double> deriv;
-    for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
-        x[i] = 0.5+i*0.5/NUM_TABLE_POINTS;
-        y[i] = log(x[i]);
-    }
-    SplineFitter::createNaturalSpline(x, y, deriv);
-    logTable.resize(4*NUM_TABLE_POINTS);
-    for (int i = 0; i < NUM_TABLE_POINTS; i++) {
-        logTable[4*i] = (float) y[i];
-        logTable[4*i+1] = (float) y[i+1];
-        logTable[4*i+2] = (float) (deriv[i]*logDX*logDX/6);
-        logTable[4*i+3] = (float) (deriv[i+1]*logDX*logDX/6);
+    logTable.resize(NUM_TABLE_POINTS+4);
+    for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
+        double x = TABLE_MIN+i*logDX;
+        logTable[i] = log(x);
    }
 }

@@ -93,7 +85,7 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f
    obcChain.resize(params.size()+3);
 }

-void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    
    this->posq = &posq[0];
@@ -104,16 +96,22 @@ void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector
    threadBornForces.resize(numThreads);
    for (int i = 0; i < numThreads; i++)
        threadBornForces[i].resize(particleParams.size()+3);
+    gmx_atomic_t counter;
+    this->atomicCounter = &counter;
    
    // Signal the threads to start running and wait for them to finish.
    
    ComputeTask task(*this);
+    gmx_atomic_set(&counter, 0);
    threads.execute(task);
    threads.waitForThreads(); // Compute Born radii
+    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();
    threads.waitForThreads(); // Compute surface area term
+    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();
    threads.waitForThreads(); // First loop
+    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();
    threads.waitForThreads(); // Second loop
    
@@ -141,8 +139,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {

    // Calculate Born radii

-    for (int blockStart = start; blockStart < end; blockStart += 4) {
-        int numInBlock = min(4, end-blockStart);
+    while (true) {
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
+        int numInBlock = min(4, numParticles-blockStart);
        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
        float atomRadius[4], atomx[4], atomy[4], atomz[4];
        int blockMask[4] = {0, 0, 0, 0};
@@ -213,7 +214,10 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
    vector<float>& bornForces = threadBornForces[threadIndex];
    for (int i = 0; i < numParticles; i++)
        bornForces[i] = 0.0f;
-    for (int atomI = start; atomI < end; atomI++) {
+    while (true) {
+        int atomI = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+        if (atomI >= numParticles)
+            break;
        if (bornRadii[atomI] > 0) {
            float radiusI = particleParams[atomI].first + dielectricOffset;
            float r = radiusI + probeRadius;
@@ -235,8 +239,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
        preFactor = ONE_4PI_EPS0*((1.0f/solventDielectric) - (1.0f/soluteDielectric));
    else
        preFactor = 0.0f;
-    for (int blockStart = start; blockStart < end; blockStart += 4) {
-        int numInBlock = min(4, end-blockStart);
+    while (true) {
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
+        int numInBlock = min(4, numParticles-blockStart);
        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
        float atomCharge[4], atomx[4], atomy[4], atomz[4];
        int blockMask[4] = {0, 0, 0, 0};
@@ -303,13 +310,16 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {

    // Second loop of Born energy computation.

-    for (int blockStart = start; blockStart < end; blockStart += 4) {
+    while (true) {
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
        fvec4 bornForce(0.0f);
        for (int i = 0; i < numThreads; i++)
            bornForce += fvec4(&threadBornForces[i][blockStart]);
        fvec4 radii(&bornRadii[blockStart]);
        bornForce *= radii*radii*fvec4(&obcChain[blockStart]);
-        int numInBlock = min(4, end-blockStart);
+        int numInBlock = min(4, numParticles-blockStart);
        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
        float atomRadius[4], atomx[4], atomy[4], atomz[4];
        int blockMask[4] = {0, 0, 0, 0};
@@ -351,14 +361,13 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
            fvec4 logRatio = fastLog(u_ij/l_ij);
            fvec4 t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse;
            fvec4 de = bornForce*t3*rInverse;
+            de = blend(0.0f, de, include);
            fvec4 result[4] = {dx*de, dy*de, dz*de, 0.0f};
            transpose(result[0], result[1], result[2], result[3]);
            fvec4 atomForce(forces+4*atomJ);
            for (int j = 0; j < 4; j++) {
-                if (include[j]) {
-                    blockAtomForce[j] += result[j];
-                    atomForce -= result[j];
-                }
+                blockAtomForce[j] += result[j];
+                atomForce -= result[j];
            }
            atomForce.store(forces+4*atomJ);
        }
@@ -385,21 +394,16 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
 fvec4 CpuGBSAOBCForce::fastLog(fvec4 x) {
    // Evaluate log(x) using a lookup table for speed.

-    float y[4];
-    fvec4 x1 = (x-0.5f)*logDXInv;
+    if (any((x < TABLE_MIN) | (x >= TABLE_MAX)))
+        return fvec4(logf(x[0]), logf(x[1]), logf(x[2]), logf(x[3]));
+    fvec4 x1 = (x-TABLE_MIN)*logDXInv;
    ivec4 index = floor(x1);
-    fvec4 coeff[4];
-    coeff[1] = x1-index;
-    coeff[0] = 1.0f-coeff[1];
-    coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
-    coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
-    transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
-    static float maxdiff = 0.0f;
-    for (int i = 0; i < 4; i++) {
-        if (index[i] >= 0 && index[i] < NUM_TABLE_POINTS)
-            y[i] = dot4(coeff[i], fvec4(&logTable[4*index[i]]));
-        else
-            y[i] = logf(x[i]);
-    }
-    return fvec4(y);
+    fvec4 coeff2 = x1-index;
+    fvec4 coeff1 = 1.0f-coeff2;
+    fvec4 t1(&logTable[index[0]]);
+    fvec4 t2(&logTable[index[1]]);
+    fvec4 t3(&logTable[index[2]]);
+    fvec4 t4(&logTable[index[3]]);
+    transpose(t1, t2, t3, t4);
+    return coeff1*t1 + coeff2*t2;
 }
--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -81,16 +81,16 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
    
    // Convert the positions to single precision and apply periodic boundary conditions
    
-    vector<float>& posq = data.posq;
+    AlignedArray<float>& posq = data.posq;
    vector<RealVec>& posData = extractPositions(context);
    RealVec boxSize = extractBoxSize(context);
-    float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
+    double invBoxSize[3] = {1/boxSize[0], 1/boxSize[1], 1/boxSize[2]};
    int numParticles = context.getSystem().getNumParticles();
    if (data.isPeriodic)
        for (int i = 0; i < numParticles; i++)
            for (int j = 0; j < 3; j++) {
                RealOpenMM x = posData[i][j];
-                double base = floor(x/boxSize[j])*boxSize[j];
+                double base = floor(x*invBoxSize[j])*boxSize[j];
                posq[4*i+j] = (float) (x-base);
            }
    else
@@ -255,12 +255,12 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            }
        }
    }
-    vector<float>& posq = data.posq;
+    AlignedArray<float>& posq = data.posq;
    vector<RealVec>& posData = extractPositions(context);
    vector<RealVec>& forceData = extractForces(context);
    RealVec boxSize = extractBoxSize(context);
    float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
-    double energy = ewaldSelfEnergy;
+    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
    bool ewald  = (nonbondedMethod == Ewald);
    bool pme  = (nonbondedMethod == PME);
    if (nonbondedMethod != NoCutoff) {
@@ -330,7 +330,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
            Vec3 periodicBoxSize(boxSize[0], boxSize[1], boxSize[2]);
            optimizedPme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, periodicBoxSize, includeEnergy);
-            optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
+            nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
        }
        else
            nonbonded.calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -48,6 +48,8 @@ const int CpuNeighborList::BlockSize = 4;
 class VoxelIndex 
 {
 public:
+    VoxelIndex() : x(0), y(0) {
+    }
    VoxelIndex(int x, int y) : x(x), y(y) {
    }
    int x;
@@ -173,6 +175,9 @@ public:
        float centerPos[4];
        blockCenter.store(centerPos);
        VoxelIndex centerVoxelIndex = getVoxelIndex(centerPos);
+        VoxelIndex atomVoxelIndex[BlockSize];
+        for (int i = 0; i < (int) blockAtoms.size(); i++)
+            atomVoxelIndex[i] = getVoxelIndex(&atomLocations[4*blockAtoms[i]]);
        int startx = centerVoxelIndex.x-dIndexX;
        int starty = centerVoxelIndex.y-dIndexY;
        int endx = centerVoxelIndex.x+dIndexX;
@@ -194,39 +199,59 @@ public:
            voxelIndex.x = x;
            if (usePeriodic)
                voxelIndex.x = (x < 0 ? x+nx : (x >= nx ? x-nx : x));
-            float dx = max(0.0f, voxelSizeX*max(0, abs(centerVoxelIndex.x-x)-1)-blockWidth[0]);
            for (int y = starty; y <= endy; ++y) {
                voxelIndex.y = y;
                if (usePeriodic)
                    voxelIndex.y = (y < 0 ? y+ny : (y >= ny ? y-ny : y));
-                float dy = max(0.0f, voxelSizeY*max(0, abs(centerVoxelIndex.y-y)-1)-blockWidth[1]);
                
                // Identify the range of atoms within this bin we need to search.  When using periodic boundary
                // conditions, there may be two separate ranges.
                
-                float dz = maxDistance+blockWidth[2];
-                dz = sqrtf(max(0.0f, dz*dz-dx*dx-dy*dy));
+                float minz = centerPos[2];
+                float maxz = centerPos[2];
+                fvec4 offset(voxelSizeX*x+(usePeriodic ? 0.0f : minx), voxelSizeY*y+(usePeriodic ? 0.0f : miny), 0, 0);
+                for (int k = 0; k < (int) blockAtoms.size(); k++) {
+                    const float* atomPos = &atomLocations[4*blockAtoms[k]];
+                    fvec4 posVec(atomPos);
+                    fvec4 delta1 = offset-posVec;
+                    fvec4 delta2 = delta1+fvec4(voxelSizeX, voxelSizeY, 0, 0);
+                    if (usePeriodic) {
+                        delta1 -= round(delta1*invBoxSize)*boxSize;
+                        delta2 -= round(delta2*invBoxSize)*boxSize;
+                    }
+                    fvec4 delta = min(abs(delta1), abs(delta2));
+                    float dx = (x == atomVoxelIndex[k].x ? 0.0f : delta[0]);
+                    float dy = (y == atomVoxelIndex[k].y ? 0.0f : delta[1]);
+                    float dist2 = maxDistanceSquared-dx*dx-dy*dy;
+                    if (dist2 > 0) {
+                        float dist = sqrtf(dist2);
+                        minz = min(minz, atomPos[2]-dist);
+                        maxz = max(maxz, atomPos[2]+dist);
+                    }
+                }
+                if (minz == maxz)
+                    continue;
                bool needPeriodic = (centerPos[0]-blockWidth[0] < maxDistance || centerPos[0]+blockWidth[0] > periodicBoxSize[0]-maxDistance ||
                                     centerPos[1]-blockWidth[1] < maxDistance || centerPos[1]+blockWidth[1] > periodicBoxSize[1]-maxDistance ||
-                                     centerPos[2]-dz < 0.0f || centerPos[2]+dz > periodicBoxSize[2]);
+                                     minz < 0.0f || maxz > periodicBoxSize[2]);
                int rangeStart[2];
                int rangeEnd[2];
-                rangeStart[0] = findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz);
+                rangeStart[0] = findLowerBound(voxelIndex.x, voxelIndex.y, minz);
                if (needPeriodic) {
                    numRanges = 2;
-                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
+                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, maxz);
                    if (rangeStart[0] > 0) {
                        rangeStart[1] = 0;
-                        rangeEnd[1] = min(findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz-periodicBoxSize[2]), rangeStart[0]);
+                        rangeEnd[1] = min(findUpperBound(voxelIndex.x, voxelIndex.y, maxz-periodicBoxSize[2]), rangeStart[0]);
                    }
                    else {
-                        rangeStart[1] = max(findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz+periodicBoxSize[2]), rangeEnd[0]);
+                        rangeStart[1] = max(findLowerBound(voxelIndex.x, voxelIndex.y, minz+periodicBoxSize[2]), rangeEnd[0]);
                        rangeEnd[1] = bins[voxelIndex.x][voxelIndex.y].size();
                    }
                }
                else {
                    numRanges = 1;
-                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
+                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, maxz);
                }
                
                // Loop over atoms and check to see if they are neighbors of this block.
@@ -307,7 +332,7 @@ public:
 CpuNeighborList::CpuNeighborList() {
 }

-void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions,
+void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
    int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
    blockNeighbors.resize(numBlocks);
@@ -353,8 +378,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
    if (!usePeriodic)
        edgeSizeX = edgeSizeY = maxDistance; // TODO - adjust this as needed
    else {
-        edgeSizeX = 0.5f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
-        edgeSizeY = 0.5f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
+        edgeSizeX = 0.6f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
+        edgeSizeY = 0.6f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
    }
    Voxels voxels(edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
    for (int i = 0; i < numAtoms; i++) {

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -29,8 +29,8 @@
 #include "CpuNonbondedForce.h"
 #include "ReferenceForce.h"
 #include "ReferencePME.h"
-#include "openmm/internal/SplineFitter.h"
 #include "openmm/internal/vectorize.h"
+#include "gmx_atomic.h"

 // In case we're using some primitive version of Visual Studio this will
 // make sure that erf() and erfc() are defined.
@@ -40,7 +40,7 @@ using namespace std;
 using namespace OpenMM;

 const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
-const int CpuNonbondedForce::NUM_TABLE_POINTS = 1025;
+const int CpuNonbondedForce::NUM_TABLE_POINTS = 2048;

 class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
 public:
@@ -58,21 +58,22 @@ public:

   --------------------------------------------------------------------------------------- */

-CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false) {
+CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false) {
 }

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

-     Set the force to use a cutoff.
+   Set the force to use a cutoff.

-     @param distance            the cutoff distance
-     @param neighbors           the neighbor list to use
-     @param solventDielectric   the dielectric constant of the bulk solvent
+   @param distance            the cutoff distance
+   @param neighbors           the neighbor list to use
+   @param solventDielectric   the dielectric constant of the bulk solvent

     --------------------------------------------------------------------------------------- */

-  void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
-
+void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
+    if (distance != cutoffDistance)
+        tableIsValid = false;
    cutoff = true;
    cutoffDistance = distance;
    neighborList = &neighbors;
@@ -127,6 +128,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */

  void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
+      if (alpha != alphaEwald)
+          tableIsValid = false;
      alphaEwald = alpha;
      numRx = kmaxx;
      numRy = kmaxy;
@@ -145,6 +148,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */

  void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
+      if (alpha != alphaEwald)
+          tableIsValid = false;
      alphaEwald = alpha;
      meshDim[0] = meshSize[0];
      meshDim[1] = meshSize[1];
@@ -155,24 +160,16 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {

  
 void CpuNonbondedForce::tabulateEwaldScaleFactor() {
-    ewaldDX = cutoffDistance/(NUM_TABLE_POINTS-2);
+    if (tableIsValid)
+        return;
+    tableIsValid = true;
+    ewaldDX = cutoffDistance/NUM_TABLE_POINTS;
    ewaldDXInv = 1.0f/ewaldDX;
-    vector<double> x(NUM_TABLE_POINTS+1);
-    vector<double> y(NUM_TABLE_POINTS+1);
-    vector<double> deriv;
-    for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
-        double r = i*cutoffDistance/(NUM_TABLE_POINTS-2);
+    ewaldScaleTable.resize(NUM_TABLE_POINTS+4);
+    for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
+        double r = i*ewaldDX;
        double alphaR = alphaEwald*r;
-        x[i] = r;
-        y[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
-    }
-    SplineFitter::createNaturalSpline(x, y, deriv);
-    ewaldScaleTable.resize(4*NUM_TABLE_POINTS);
-    for (int i = 0; i < NUM_TABLE_POINTS; i++) {
-        ewaldScaleTable[4*i] = (float) y[i];
-        ewaldScaleTable[4*i+1] = (float) y[i+1];
-        ewaldScaleTable[4*i+2] = (float) (deriv[i]*ewaldDX*ewaldDX/6);
-        ewaldScaleTable[4*i+3] = (float) (deriv[i+1]*ewaldDX*ewaldDX/6);
+        ewaldScaleTable[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
    }
 }
  
@@ -291,7 +288,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c


 void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
-                const vector<set<int> >& exclusions, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+                const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    
    this->numberOfAtoms = numberOfAtoms;
@@ -302,6 +299,9 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    this->threadForce = &threadForce;
    includeEnergy = (totalEnergy != NULL);
    threadEnergy.resize(threads.getNumThreads());
+    gmx_atomic_t counter;
+    gmx_atomic_set(&counter, 0);
+    this->atomicCounter = &counter;
    
    // Signal the threads to start running and wait for them to finish.
    
@@ -332,8 +332,12 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    if (ewald || pme) {
        // Compute the interactions from the neighbor list.

-        for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads)
-            calculateBlockEwaldIxn(i, forces, energyPtr, boxSize, invBoxSize);
+        while (true) {
+            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (nextBlock >= neighborList->getNumBlocks())
+                break;
+            calculateBlockEwaldIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
+        }

        // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.

@@ -367,13 +371,20 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    else if (cutoff) {
        // Compute the interactions from the neighbor list.

-        for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads)
-            calculateBlockIxn(i, forces, energyPtr, boxSize, invBoxSize);
+        while (true) {
+            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (nextBlock >= neighborList->getNumBlocks())
+                break;
+            calculateBlockIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
+        }
    }
    else {
        // Loop over all atom pairs

-        for (int i = threadIndex; i < numberOfAtoms; i += numThreads){
+        while (true) {
+            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (i >= numberOfAtoms)
+                break;
            for (int j = i+1; j < numberOfAtoms; j++)
                if (exclusions[j].find(i) == exclusions[j].end())
                    calculateOneIxn(i, j, forces, energyPtr, boxSize, invBoxSize);
@@ -461,12 +472,12 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
                    break;
                }
    }
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    
    // Loop over neighbors for this block.
    
    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    bool include[4];
    for (int i = 0; i < (int) neighbors.size(); i++) {
        // Load the next neighbor.
        
@@ -475,43 +486,50 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
        
        // Compute the distances to the block atoms.
        
-        bool any = false;
        fvec4 dx, dy, dz, r2;
        getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
-        for (int j = 0; j < 4; j++) {
-            include[j] = (((exclusions[i]>>j)&1) == 0 && (!cutoff || r2[j] < cutoffDistance*cutoffDistance));
-            any |= include[j];
-        }
-        if (!any)
+        ivec4 include;
+        char excl = exclusions[i];
+        if (excl == 0)
+            include = -1;
+        else
+            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
+        include = include & (r2 < cutoffDistance*cutoffDistance);
+        if (!any(include))
            continue; // No interactions to compute.
        
        // Compute the interactions.
        
        fvec4 r = sqrt(r2);
        fvec4 inverseR = fvec4(1.0f)/r;
-        fvec4 switchValue(1.0f), switchDeriv(0.0f);
-        if (useSwitch) {
-            fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
-            switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
-            switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
+        fvec4 energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) {
+            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
+            fvec4 sig2 = inverseR*sig;
+            sig2 *= sig2;
+            fvec4 sig6 = sig2*sig2*sig2;
+            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
        }
-        fvec4 sig = blockAtomSigma+atomParameters[atom].first;
-        fvec4 sig2 = inverseR*sig;
-        sig2 *= sig2;
-        fvec4 sig6 = sig2*sig2*sig2;
-        fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
-        fvec4 dEdR = switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
        if (cutoff)
            dEdR += chargeProd*(inverseR-2.0f*krf*r2);
        else
            dEdR += chargeProd*inverseR;
        dEdR *= inverseR*inverseR;
-        fvec4 energy = eps*(sig6-1.0f)*sig6;
-        if (useSwitch) {
-            dEdR -= energy*switchDeriv*inverseR;
-            energy *= switchValue;
-        }

        // Accumulate energies.

@@ -520,27 +538,25 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
                energy += chargeProd*(inverseR+krf*r2-crf);
            else
                energy += chargeProd*inverseR;
-            for (int j = 0; j < 4; j++)
-                if (include[j])
-                    *totalEnergy += energy[j];
+            energy = blend(0.0f, energy, include);
+            *totalEnergy += dot4(energy, 1.0f);
        }

        // Accumulate forces.

+        dEdR = blend(0.0f, dEdR, include);
        fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
        transpose(result[0], result[1], result[2], result[3]);
        fvec4 atomForce(forces+4*atom);
        for (int j = 0; j < 4; j++) {
-            if (include[j]) {
-                blockAtomForce[j] += result[j];
-                atomForce -= result[j];
-            }
+            blockAtomForce[j] += result[j];
+            atomForce -= result[j];
        }
        atomForce.store(forces+4*atom);
    }
    
    // Record the forces on the block atoms.
-    
+
    for (int j = 0; j < 4; j++)
        (fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
  }
@@ -569,12 +585,12 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
                needPeriodic = true;
                break;
            }
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    
    // Loop over neighbors for this block.
    
    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    bool include[4];
    for (int i = 0; i < (int) neighbors.size(); i++) {
        // Load the next neighbor.
        
@@ -583,60 +599,65 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
        
        // Compute the distances to the block atoms.
        
-        bool any = false;
        fvec4 dx, dy, dz, r2;
        getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
-        for (int j = 0; j < 4; j++) {
-            include[j] = (((exclusions[i]>>j)&1) == 0 && r2[j] < cutoffDistance*cutoffDistance);
-            any |= include[j];
-        }
-        if (!any)
+        ivec4 include;
+        char excl = exclusions[i];
+        if (excl == 0)
+            include = -1;
+        else
+            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
+        include = include & (r2 < cutoffDistance*cutoffDistance);
+        if (!any(include))
            continue; // No interactions to compute.
        
        // Compute the interactions.
        
        fvec4 r = sqrt(r2);
        fvec4 inverseR = fvec4(1.0f)/r;
-        fvec4 switchValue(1.0f), switchDeriv(0.0f);
-        if (useSwitch) {
-            fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
-            switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
-            switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
+        fvec4 energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) {
+            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
+            fvec4 sig2 = inverseR*sig;
+            sig2 *= sig2;
+            fvec4 sig6 = sig2*sig2*sig2;
+            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
        }
-        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
-        fvec4 dEdR = chargeProd*inverseR*ewaldScaleFunction(r);
-        fvec4 sig = blockAtomSigma+atomParameters[atom].first;
-        fvec4 sig2 = inverseR*sig;
-        sig2 *= sig2;
-        fvec4 sig6 = sig2*sig2*sig2;
-        fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
-        dEdR += switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
-        dEdR *= inverseR*inverseR;
-        fvec4 energy = eps*(sig6-1.0f)*sig6;
-        if (useSwitch) {
-            dEdR -= energy*switchDeriv*inverseR;
-            energy *= switchValue;
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
        }
+        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
+        dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
+        dEdR *= inverseR*inverseR;        

        // Accumulate energies.

        if (totalEnergy) {
            energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
-            for (int j = 0; j < 4; j++)
-                if (include[j])
-                    *totalEnergy += energy[j];
+            energy = blend(0.0f, energy, include);
+            *totalEnergy += dot4(energy, 1.0f);
        }

        // Accumulate forces.

+        dEdR = blend(0.0f, dEdR, include);
        fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
        transpose(result[0], result[1], result[2], result[3]);
        fvec4 atomForce(forces+4*atom);
        for (int j = 0; j < 4; j++) {
-            if (include[j]) {
-                blockAtomForce[j] += result[j];
-                atomForce -= result[j];
-            }
+            blockAtomForce[j] += result[j];
+            atomForce -= result[j];
        }
        atomForce.store(forces+4*atom);
    }
@@ -683,18 +704,14 @@ fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
 fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)

-    float y[4];
    fvec4 x1 = x*ewaldDXInv;
-    ivec4 index = floor(x1);
-    fvec4 coeff[4];
-    coeff[1] = x1-index;
-    coeff[0] = 1.0f-coeff[1];
-    coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
-    coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
-    transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
-    for (int i = 0; i < 4; i++) {
-        if (index[i] < NUM_TABLE_POINTS)
-            y[i] = dot4(coeff[i], fvec4(&ewaldScaleTable[4*index[i]]));
-    }
-    return fvec4(y);
+    ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
+    fvec4 coeff2 = x1-index;
+    fvec4 coeff1 = 1.0f-coeff2;
+    fvec4 t1(&ewaldScaleTable[index[0]]);
+    fvec4 t2(&ewaldScaleTable[index[1]]);
+    fvec4 t3(&ewaldScaleTable[index[2]]);
+    fvec4 t4(&ewaldScaleTable[index[3]]);
+    transpose(t1, t2, t3, t4);
+    return coeff1*t1 + coeff2*t2;
 }
--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -32,6 +32,8 @@
 #include "CpuPlatform.h"
 #include "CpuKernelFactory.h"
 #include "CpuKernels.h"
+#include "CpuSETTLE.h"
+#include "ReferenceConstraints.h"
 #include "openmm/internal/hardware.h"

 using namespace OpenMM;
@@ -77,6 +79,12 @@ void CpuPlatform::contextCreated(ContextImpl& context, const map<string, string>
    ReferencePlatform::contextCreated(context, properties);
    PlatformData* data = new PlatformData(context.getSystem().getNumParticles());
    contextData[&context] = data;
+    ReferenceConstraints& constraints = *(ReferenceConstraints*) reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData())->constraints;
+    if (constraints.settle != NULL) {
+        CpuSETTLE* parallelSettle = new CpuSETTLE(context.getSystem(), *(ReferenceSETTLEAlgorithm*) constraints.settle, data->threads);
+        delete constraints.settle;
+        constraints.settle = parallelSettle;
+    }
 }

 void CpuPlatform::contextDestroyed(ContextImpl& context) const {
@@ -89,8 +97,7 @@ CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {
    return *contextData[&context];
 }

-CpuPlatform::PlatformData::PlatformData(int numParticles) {
-    posq.resize(4*numParticles);
+CpuPlatform::PlatformData::PlatformData(int numParticles) : posq(4*numParticles) {
    int numThreads = threads.getNumThreads();
    threadForce.resize(numThreads);
    for (int i = 0; i < numThreads; i++)

--- a/platforms/cpu/src/CpuSETTLE.cpp
+++ b/platforms/cpu/src/CpuSETTLE.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "CpuSETTLE.h"
+
+using namespace OpenMM;
+using namespace std;
+
+class CpuSETTLE::ApplyToPositionsTask : public ThreadPool::Task {
+public:
+    ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses,
+            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP),
+            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        threadSettle[threadIndex]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
+    }
+    vector<OpenMM::RealVec>& atomCoordinates;
+    vector<OpenMM::RealVec>& atomCoordinatesP;
+    vector<RealOpenMM>& inverseMasses;
+    RealOpenMM tolerance;
+    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
+};
+
+class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task {
+public:
+    ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses,
+            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities),
+            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        threadSettle[threadIndex]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
+    }
+    vector<OpenMM::RealVec>& atomCoordinates;
+    vector<OpenMM::RealVec>& velocities;
+    vector<RealOpenMM>& inverseMasses;
+    RealOpenMM tolerance;
+    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
+};
+
+CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) {
+    int numThreads = threads.getNumThreads();
+    int numClusters = settle.getNumClusters();
+    vector<RealOpenMM> mass(system.getNumParticles());
+    for (int i = 0; i < system.getNumParticles(); i++)
+        mass[i] = system.getParticleMass(i);
+    for (int i = 0; i < numThreads; i++) {
+        int start = i*numClusters/numThreads;
+        int end = (i+1)*numClusters/numThreads;
+        if (start != end) {
+            int numThreadClusters = end-start;
+            vector<int> atom1(numThreadClusters), atom2(numThreadClusters), atom3(numThreadClusters);
+            vector<RealOpenMM> distance1(numThreadClusters), distance2(numThreadClusters);
+            for (int j = 0; j < numThreadClusters; j++)
+                settle.getClusterParameters(start+j, atom1[j], atom2[j], atom3[j], distance1[j], distance2[j]);
+            threadSettle.push_back(new ReferenceSETTLEAlgorithm(atom1, atom2, atom3, distance1, distance2, mass));
+        }
+    }
+}
+
+CpuSETTLE::~CpuSETTLE() {
+    for (int i = 0; i < (int) threadSettle.size(); i++)
+        delete threadSettle[i];
+}
+
+void CpuSETTLE::apply(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
+    ApplyToPositionsTask task(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance, threadSettle);
+    threads.execute(task);
+    threads.waitForThreads();
+}
+
+void CpuSETTLE::applyToVelocities(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
+    ApplyToVelocitiesTask task(atomCoordinates, velocities, inverseMasses, tolerance, threadSettle);
+    threads.execute(task);
+    threads.waitForThreads();
+}
--- a/platforms/cpu/src/gmx_atomic.h
+++ b/platforms/cpu/src/gmx_atomic.h
--- a/platforms/cpu/tests/CMakeLists.txt
+++ b/platforms/cpu/tests/CMakeLists.txt
@@ -11,7 +11,6 @@ IF( INCLUDE_SERIALIZATION )
    INCLUDE_DIRECTORIES(${OPENMM_DIR}/serialization/include)
    SET( SHARED_OPENMM_SERIALIZATION "OpenMMSerialization" )
 ENDIF( INCLUDE_SERIALIZATION )
-GET_PROPERTY(COMPILE_FLAGS GLOBAL PROPERTY COMPILE_FLAGS)

 # Automatically create tests using files named "Test*.cpp"
 FILE(GLOB TEST_PROGS "*Test*.cpp")
@@ -21,7 +20,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    # Link with shared library
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET})
-    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS}" LINK_FLAGS "${COMPILE_FLAGS}")
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)

 ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -35,6 +35,7 @@

 #include "openmm/internal/AssertionUtilities.h"
 #include "openmm/internal/ThreadPool.h"
+#include "AlignedArray.h"
 #include "CpuNeighborList.h"
 #include "CpuPlatform.h"
 #include "sfmt/SFMT.h"
@@ -52,7 +53,7 @@ void testNeighborList(bool periodic) {
    const float boxSize[3] = {20.0f, 15.0f, 22.0f};
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
-    vector<float> positions(4*numParticles);
+    AlignedArray<float> positions(4*numParticles);
    for (int i = 0; i < 4*numParticles; i++)
        if (i%4 < 3)
            positions[i] = boxSize[i%4]*genrand_real2(sfmt);