Merged changes from main branch

588a4c9d · peastman · bbf3de23 · ff6af025 · 588a4c9d · 588a4c9d
Commit 588a4c9d authored Dec 10, 2013 by peastman
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -105,16 +105,20 @@ ELSE( CMAKE_SIZEOF_VOID_P EQUAL 8 )
  SET( LIB64  )
 ENDIF( CMAKE_SIZEOF_VOID_P EQUAL 8 )
-# Build universal binaries compatible with OS X 10.7
+IF (APPLE)
-IF (APPLE AND NOT CMAKE_OSX_DEPLOYMENT_TARGET)
+    # Build universal binaries compatible with OS X 10.7
-    SET (CMAKE_OSX_ARCHITECTURES "i386;x86_64" CACHE STRING "The processor architectures to build for" FORCE)
+    IF (NOT CMAKE_OSX_DEPLOYMENT_TARGET)
        SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.7" CACHE STRING "The minimum version of OS X to support" FORCE)
-ENDIF (APPLE AND NOT CMAKE_OSX_DEPLOYMENT_TARGET)
+    ENDIF (NOT CMAKE_OSX_DEPLOYMENT_TARGET)
+    IF (NOT CMAKE_OSX_ARCHITECTURES)
+        SET (CMAKE_OSX_ARCHITECTURES "i386;x86_64" CACHE STRING "The processor architectures to build for" FORCE)
+    ENDIF (NOT CMAKE_OSX_ARCHITECTURES)
-# Improve the linking behavior of Mac libraries
+    # Improve the linking behavior of Mac libraries
-IF (APPLE)
    SET (CMAKE_INSTALL_NAME_DIR "@rpath")
-    SET_PROPERTY(GLOBAL PROPERTY COMPILE_FLAGS "-stdlib=libc++ -mmacosx-version-min=10.7")
+    SET(EXTRA_COMPILE_FLAGS "-stdlib=libc++")
+ELSE (APPLE)
+    SET(EXTRA_COMPILE_FLAGS)
 ENDIF (APPLE)
 IF(UNIX AND NOT CMAKE_BUILD_TYPE)
@@ -266,7 +270,7 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
 INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
 ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY -DOPENMM_VALIDATE_BUILDING_SHARED_LIBRARY")
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY -DOPENMM_VALIDATE_BUILDING_SHARED_LIBRARY")
 IF(WIN32)
    ADD_DEPENDENCIES(${SHARED_TARGET} PthreadsLibraries)
 ENDIF(WIN32)
@@ -274,7 +278,7 @@ ENDIF(WIN32)
 SET(OPENMM_BUILD_STATIC_LIB OFF CACHE BOOL "Whether to build static OpenMM libraries")
 IF(OPENMM_BUILD_STATIC_LIB)
    ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
-    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY -DOPENMMM_VALIDATE_BUILDING_STATIC_LIBRARY -DOPENMM_VALIDATE_BUILDING_STATIC_LIBRARY")
+    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY -DOPENMMM_VALIDATE_BUILDING_STATIC_LIBRARY -DOPENMM_VALIDATE_BUILDING_STATIC_LIBRARY")
 ENDIF(OPENMM_BUILD_STATIC_LIB)
 IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -33,7 +33,9 @@ FOREACH(EX_ROOT ${CPP_EXAMPLES})
        ADD_EXECUTABLE(${EX_ROOT} ${EX_ROOT}.cpp)
        SET_TARGET_PROPERTIES(${EX_ROOT}
            PROPERTIES
-	      PROJECT_LABEL "Example - ${EX_ROOT}")
+            PROJECT_LABEL "Example - ${EX_ROOT}"
+            LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
+            COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
        TARGET_LINK_LIBRARIES(${EX_ROOT} ${SHARED_TARGET})
    ENDIF (BUILD_TESTING_SHARED)
@@ -43,8 +45,9 @@ FOREACH(EX_ROOT ${CPP_EXAMPLES})
        ADD_EXECUTABLE(${EX_STATIC} ${EX_ROOT}.cpp)
        SET_TARGET_PROPERTIES(${EX_STATIC}
            PROPERTIES
-		COMPILE_FLAGS "-DOPENMM_USE_STATIC_LIBRARIES"
+            PROJECT_LABEL "Example - ${EX_STATIC}"
-		PROJECT_LABEL "Example - ${EX_STATIC}")
+            LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
+            COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_USE_STATIC_LIBRARIES")
        TARGET_LINK_LIBRARIES(${EX_STATIC} ${STATIC_TARGET})
    ENDIF (BUILD_TESTING_STATIC)
@@ -64,7 +67,9 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
            ADD_EXECUTABLE(${EX_ROOT} ${EX_ROOT}.c Empty.cpp)
            SET_TARGET_PROPERTIES(${EX_ROOT}
                PROPERTIES
-              PROJECT_LABEL "Example C - ${EX_ROOT}")
+                PROJECT_LABEL "Example C - ${EX_ROOT}"
+                LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
+                COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
            TARGET_LINK_LIBRARIES(${EX_ROOT} ${SHARED_TARGET})
            ADD_DEPENDENCIES(${EX_ROOT} ApiWrappers)
        ENDIF (BUILD_TESTING_SHARED)
@@ -77,8 +82,9 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
            ADD_EXECUTABLE(${EX_STATIC} ${EX_ROOT}.c Empty.cpp)
            SET_TARGET_PROPERTIES(${EX_STATIC}
                PROPERTIES
-            COMPILE_FLAGS "-DOPENMM_USE_STATIC_LIBRARIES"
+                PROJECT_LABEL "Example C - ${EX_STATIC}"
-            PROJECT_LABEL "Example C - ${EX_STATIC}")
+                LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
+                COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_USE_STATIC_LIBRARIES")
            TARGET_LINK_LIBRARIES(${EX_STATIC} ${STATIC_TARGET})
            ADD_DEPENDENCIES(${EX_STATIC} ApiWrappers)
        ENDIF (BUILD_TESTING_STATIC)

--- a/openmmapi/include/openmm/internal/vectorize.h
+++ b/openmmapi/include/openmm/internal/vectorize.h
@@ -91,6 +91,9 @@ public:
    fvec4 operator&(fvec4 other) const {
        return _mm_and_ps(val, other);
    }
+    // Bitwise OR of the raw bits of the two vectors (via _mm_or_ps), e.g. for combining comparison results.
+    fvec4 operator|(fvec4 other) const {
+        return _mm_or_ps(val, other);
+    }
    fvec4 operator==(fvec4 other) const {
        return _mm_cmpeq_ps(val, other);
    }
@@ -157,6 +160,9 @@ public:
    ivec4 operator&(ivec4 other) const {
        return _mm_and_si128(val, other);
    }
+    // Bitwise OR of all 128 bits of the two vectors (via _mm_or_si128).
+    ivec4 operator|(ivec4 other) const {
+        return _mm_or_si128(val, other);
+    }
    ivec4 operator==(ivec4 other) const {
        return _mm_cmpeq_epi32(val, other);
    }
@@ -267,5 +273,11 @@ static inline fvec4 operator/(float v1, fvec4 v2) {
    return fvec4(v1)/v2;
 }
+// Operations for blending fvec4s based on an ivec4.
+//
+// blend() builds a new vector by choosing, independently for each of the four
+// elements, between v1 and v2.  Following the semantics of _mm_blendv_ps, an
+// element is taken from v2 when the most significant bit of the corresponding
+// mask element is set, and from v1 otherwise; the remaining mask bits are
+// ignored.  _mm_blendv_ps is an SSE4.1 instruction, so code calling this must be
+// compiled with SSE4.1 enabled.
+static inline fvec4 blend(fvec4 v1, fvec4 v2, ivec4 mask) {
+    // The cast only reinterprets the integer mask as a float vector; no bits are changed.
+    return fvec4(_mm_blendv_ps(v1.val, v2.val, _mm_castsi128_ps(mask.val)));
+}
 #endif /*OPENMM_VECTORIZE_H_*/
--- a/platforms/cpu/CMakeLists.txt
+++ b/platforms/cpu/CMakeLists.txt
@@ -14,10 +14,6 @@
 #   libOpenMMCPU_static[_d].a
 #----------------------------------------------------
-IF (APPLE)
-    SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.6")
-ENDIF (APPLE)
 SUBDIRS (tests)
 # The source is organized into subdirectories, but we handle them all from

--- a/platforms/cpu/include/AlignedArray.h
+++ b/platforms/cpu/include/AlignedArray.h
+#ifndef OPENMM_ALIGNEDARRAY_H_
+#define OPENMM_ALIGNEDARRAY_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+namespace OpenMM {
+/**
+ * This class represents an array in memory whose starting point is guaranteed to
+ * be aligned with a 16 byte boundary.  This can improve the performance of vectorized
+ * code, since loads and stores are more efficient.
+ *
+ * The storage is a raw byte buffer: no constructor or destructor of T is ever run, and
+ * copies are made byte by byte, so T should be a plain data type such as float or int.
+ * Newly allocated elements are uninitialized.
+ *
+ * Each AlignedArray owns its buffer.  Copying an AlignedArray creates an independent
+ * buffer with the same contents, which makes it safe to store inside collections.
+ */
+template <class T>
+class AlignedArray {
+public:
+    /**
+     * Default constructor, to allow AlignedArrays to be used inside collections.
+     * The resulting array has size 0 and owns no memory.
+     */
+    AlignedArray() : dataSize(0), baseData(0), data(0) {
+    }
+    /**
+     * Create an AlignedArray that contains a specified number of elements.
+     */
+    AlignedArray(int size) : dataSize(0), baseData(0), data(0) {
+        allocate(size);
+    }
+    /**
+     * Create an AlignedArray that is an independent copy of another one.
+     */
+    AlignedArray(const AlignedArray& other) : dataSize(0), baseData(0), data(0) {
+        copyFrom(other);
+    }
+    /**
+     * Replace the contents of this array with an independent copy of another one.
+     */
+    AlignedArray& operator=(const AlignedArray& other) {
+        if (this != &other) {
+            release();
+            copyFrom(other);
+        }
+        return *this;
+    }
+    ~AlignedArray() {
+        release();
+    }
+    /**
+     * Get the number of elements in the array.
+     */
+    int size() const {
+        return dataSize;
+    }
+    /**
+     * Change the size of the array.  This may cause all contents to be lost.
+     * If the array already has the requested size, nothing happens.
+     */
+    void resize(int size) {
+        if (dataSize == size)
+            return;
+        // Release first so that, if the new allocation throws, the array is left
+        // empty instead of holding a pointer to memory that was already freed.
+        release();
+        allocate(size);
+    }
+    /**
+     * Get a reference to an element of the array.  The index is not range checked.
+     */
+    T& operator[](int i) {
+        return data[i];
+    }
+    /**
+     * Get a const reference to an element of the array.  The index is not range checked.
+     */
+    const T& operator[](int i) const {
+        return data[i];
+    }
+private:
+    /**
+     * Allocate an uninitialized buffer for the requested number of elements.  Any buffer
+     * currently owned must have been released first.  The members are only updated once
+     * the allocation has succeeded.
+     */
+    void allocate(int size) {
+        // Over-allocate by 16 bytes so a 16 byte aligned address always exists inside the buffer.
+        char* newBase = new char[size*sizeof(T)+16];
+        // Round up to a multiple of 16: advance by 15, then clear the low four address bits.
+        char* offsetData = newBase+15;
+        offsetData -= (long long)offsetData&0xF;
+        baseData = newBase;
+        data = (T*) offsetData;
+        dataSize = size;
+    }
+    /**
+     * Free the buffer (if any) and return to the empty state.
+     */
+    void release() {
+        delete[] baseData; // deleting a null pointer is a no-op
+        baseData = 0;
+        data = 0;
+        dataSize = 0;
+    }
+    /**
+     * Make this (currently empty) array a byte-for-byte copy of another array.
+     */
+    void copyFrom(const AlignedArray& other) {
+        if (other.baseData == 0)
+            return; // the source owns no buffer, so stay empty
+        allocate(other.dataSize);
+        const char* source = (const char*) other.data;
+        char* dest = (char*) data;
+        const unsigned long long numBytes = (unsigned long long) other.dataSize*sizeof(T);
+        for (unsigned long long i = 0; i < numBytes; i++)
+            dest[i] = source[i];
+    }
+    int dataSize;   // number of elements of type T
+    char* baseData; // pointer returned by new[]; this is what must be deleted
+    T* data;        // 16 byte aligned start of the elements, inside baseData
+};
+} // namespace OpenMM
+#endif /*OPENMM_ALIGNEDARRAY_H_*/
--- a/platforms/cpu/include/CpuGBSAOBCForce.h
+++ b/platforms/cpu/include/CpuGBSAOBCForce.h
@@ -25,6 +25,7 @@
 #ifndef OPENMM_CPU_GBSAOBC_FORCE_H__
 #define OPENMM_CPU_GBSAOBC_FORCE_H__
+#include "AlignedArray.h"
 #include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
 #include <set>
@@ -84,7 +85,7 @@ public:
     * @param totalEnergy      total energy
     * @param threads          the thread pool to use
     */
-    void computeForce(const std::vector<float>& posq, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
+    void computeForce(const AlignedArray<float>& posq, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
    /**
     * This routine contains the code executed by each thread.
@@ -105,10 +106,13 @@ private:
    float logDX, logDXInv;
    // The following variables are used to make information accessible to the individual threads.
    float const* posq;
-    std::vector<std::vector<float> >* threadForce;
+    std::vector<AlignedArray<float> >* threadForce;
    bool includeEnergy;
+    void* atomicCounter;
    static const int NUM_TABLE_POINTS;
+    static const float TABLE_MIN;
+    static const float TABLE_MAX;
    /**
     * Compute the displacement and squared distance between a collection of points, optionally using

--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
@@ -32,6 +32,7 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */
+#include "AlignedArray.h"
 #include "windowsExportCpu.h"
 #include "openmm/internal/ThreadPool.h"
 #include <set>
@@ -46,7 +47,7 @@ public:
    class Voxels;
    static const int BlockSize;
    CpuNeighborList();
-    void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
+    void computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
    int getNumBlocks() const;
    const std::vector<int>& getSortedAtoms() const;

--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -25,6 +25,7 @@
 #ifndef OPENMM_CPU_NONBONDED_FORCE_H__
 #define OPENMM_CPU_NONBONDED_FORCE_H__
+#include "AlignedArray.h"
 #include "CpuNeighborList.h"
 #include "ReferencePairIxn.h"
 #include "openmm/internal/ThreadPool.h"
@@ -143,7 +144,7 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */
      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
-            const std::vector<std::set<int> >& exclusions, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
+            const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
    /**
     * This routine contains the code executed by each thread.
@@ -156,6 +157,7 @@ private:
        bool periodic;
        bool ewald;
        bool pme;
+        bool tableIsValid;
        const CpuNeighborList* neighborList;
        float periodicBoxSize[3];
        float cutoffDistance, switchingDistance;
@@ -172,8 +174,9 @@ private:
        RealVec const* atomCoordinates;
        std::pair<float, float> const* atomParameters;        
        std::set<int> const* exclusions;
-        std::vector<std::vector<float> >* threadForce;
+        std::vector<AlignedArray<float> >* threadForce;
        bool includeEnergy;
+        void* atomicCounter;
        static const float TWO_OVER_SQRT_PI;
        static const int NUM_TABLE_POINTS;

--- a/platforms/cpu/include/CpuPlatform.h
+++ b/platforms/cpu/include/CpuPlatform.h
@@ -32,6 +32,7 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */
+#include "AlignedArray.h"
 #include "ReferencePlatform.h"
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/internal/ThreadPool.h"
@@ -69,8 +70,8 @@ private:
 class CpuPlatform::PlatformData {
 public:
    PlatformData(int numParticles);
-    std::vector<float> posq;
+    AlignedArray<float> posq;
-    std::vector<std::vector<float> > threadForce;
+    std::vector<AlignedArray<float> > threadForce;
    ThreadPool threads;
    bool isPeriodic;
 };

--- a/platforms/cpu/include/CpuSETTLE.h
+++ b/platforms/cpu/include/CpuSETTLE.h
+#ifndef OPENMM_CPUSETTLE_H_
+#define OPENMM_CPUSETTLE_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "ReferenceSETTLEAlgorithm.h"
+#include "openmm/System.h"
+#include "openmm/internal/ThreadPool.h"
+#include <vector>
+namespace OpenMM {
+/**
+ * This class uses multiple ReferenceSETTLEAlgorithm objects to execute the algorithm in parallel.
+ * It implements the ReferenceConstraintAlgorithm interface, so it can be used wherever a
+ * ReferenceConstraintAlgorithm is expected.
+ */
+class OPENMM_EXPORT CpuSETTLE : public ReferenceConstraintAlgorithm {
+public:
+    // Task classes declared here and defined elsewhere.
+    // NOTE(review): presumably these are the ThreadPool tasks run by apply() and applyToVelocities() - confirm in the .cpp.
+    class ApplyToPositionsTask;
+    class ApplyToVelocitiesTask;
+    /**
+     * Create a CpuSETTLE.
+     *
+     * @param system   the System being simulated
+     * @param settle   an existing ReferenceSETTLEAlgorithm
+     *                 (NOTE(review): presumably its constraints are divided among the per-thread objects - confirm in the .cpp)
+     * @param threads  the thread pool to use.  Only a reference is stored, so it must outlive this object.
+     */
+    CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads);
+    ~CpuSETTLE();
+    /**
+     * Apply the constraint algorithm.
+     *
+     * @param atomCoordinates  the original atom coordinates
+     * @param atomCoordinatesP the new atom coordinates
+     * @param inverseMasses    1/mass
+     * @param tolerance        the constraint tolerance
+     */
+    void apply(std::vector<OpenMM::RealVec>& atomCoordinates, std::vector<OpenMM::RealVec>& atomCoordinatesP, std::vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance);
+    /**
+     * Apply the constraint algorithm to velocities.
+     *
+     * @param atomCoordinates  the atom coordinates
+     * @param velocities       the velocities to modify
+     * @param inverseMasses    1/mass
+     * @param tolerance        the constraint tolerance
+     */
+    void applyToVelocities(std::vector<OpenMM::RealVec>& atomCoordinates, std::vector<OpenMM::RealVec>& velocities, std::vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance);
+private:
+    // The ReferenceSETTLEAlgorithm objects used to do the work in parallel.
+    // NOTE(review): presumably one per thread, created in the constructor and deleted in the destructor - confirm in the .cpp.
+    std::vector<ReferenceSETTLEAlgorithm*> threadSettle;
+    // The thread pool passed to the constructor (not owned).
+    ThreadPool& threads;
+};
+} // namespace OpenMM
+#endif /*OPENMM_CPUSETTLE_H_*/
--- a/platforms/cpu/sharedTarget/CMakeLists.txt
+++ b/platforms/cpu/sharedTarget/CMakeLists.txt
-GET_PROPERTY(COMPILE_FLAGS GLOBAL PROPERTY COMPILE_FLAGS)
+SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
-SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} -msse4.1")
 ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
 IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
@@ -8,6 +7,6 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
 ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB})
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_CPU_BUILDING_SHARED_LIBRARY" LINK_FLAGS "${COMPILE_FLAGS}")
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CPU_BUILDING_SHARED_LIBRARY")
 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
@@ -24,14 +24,16 @@
 #include "CpuGBSAOBCForce.h"
 #include "SimTKOpenMMRealType.h"
-#include "openmm/internal/SplineFitter.h"
 #include "openmm/internal/vectorize.h"
+#include "gmx_atomic.h"
 #include <cmath>
 using namespace std;
 using namespace OpenMM;
-const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 1025;
+const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 4096;
+const float CpuGBSAOBCForce::TABLE_MIN = 0.25f;
+const float CpuGBSAOBCForce::TABLE_MAX = 1.5f;
 class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
 public:
@@ -44,22 +46,12 @@ public:
 };
 CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
-    logDX = 0.5/NUM_TABLE_POINTS;
+    logDX = (TABLE_MAX-TABLE_MIN)/NUM_TABLE_POINTS;
    logDXInv = 1.0f/logDX;
-    vector<double> x(NUM_TABLE_POINTS+1);
+    logTable.resize(NUM_TABLE_POINTS+4);
-    vector<double> y(NUM_TABLE_POINTS+1);
+    for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
-    vector<double> deriv;
+        double x = TABLE_MIN+i*logDX;
-    for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
+        logTable[i] = log(x);
-        x[i] = 0.5+i*0.5/NUM_TABLE_POINTS;
-        y[i] = log(x[i]);
-    }
-    SplineFitter::createNaturalSpline(x, y, deriv);
-    logTable.resize(4*NUM_TABLE_POINTS);
-    for (int i = 0; i < NUM_TABLE_POINTS; i++) {
-        logTable[4*i] = (float) y[i];
-        logTable[4*i+1] = (float) y[i+1];
-        logTable[4*i+2] = (float) (deriv[i]*logDX*logDX/6);
-        logTable[4*i+3] = (float) (deriv[i+1]*logDX*logDX/6);
    }
 }
@@ -93,7 +85,7 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f
    obcChain.resize(params.size()+3);
 }
-void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    this->posq = &posq[0];
@@ -104,16 +96,22 @@ void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector
    threadBornForces.resize(numThreads);
    for (int i = 0; i < numThreads; i++)
        threadBornForces[i].resize(particleParams.size()+3);
+    gmx_atomic_t counter;
+    this->atomicCounter = &counter;
    // Signal the threads to start running and wait for them to finish.
    ComputeTask task(*this);
+    gmx_atomic_set(&counter, 0);
    threads.execute(task);
    threads.waitForThreads(); // Compute Born radii
+    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();
    threads.waitForThreads(); // Compute surface area term
+    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();
    threads.waitForThreads(); // First loop
+    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();
    threads.waitForThreads(); // Second loop
@@ -141,8 +139,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
    // Calculate Born radii
-    for (int blockStart = start; blockStart < end; blockStart += 4) {
+    while (true) {
-        int numInBlock = min(4, end-blockStart);
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
+        int numInBlock = min(4, numParticles-blockStart);
        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
        float atomRadius[4], atomx[4], atomy[4], atomz[4];
        int blockMask[4] = {0, 0, 0, 0};
@@ -213,7 +214,10 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
    vector<float>& bornForces = threadBornForces[threadIndex];
    for (int i = 0; i < numParticles; i++)
        bornForces[i] = 0.0f;
-    for (int atomI = start; atomI < end; atomI++) {
+    while (true) {
+        int atomI = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+        if (atomI >= numParticles)
+            break;
        if (bornRadii[atomI] > 0) {
            float radiusI = particleParams[atomI].first + dielectricOffset;
            float r = radiusI + probeRadius;
@@ -235,8 +239,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
        preFactor = ONE_4PI_EPS0*((1.0f/solventDielectric) - (1.0f/soluteDielectric));
    else
        preFactor = 0.0f;
-    for (int blockStart = start; blockStart < end; blockStart += 4) {
+    while (true) {
-        int numInBlock = min(4, end-blockStart);
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
+        int numInBlock = min(4, numParticles-blockStart);
        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
        float atomCharge[4], atomx[4], atomy[4], atomz[4];
        int blockMask[4] = {0, 0, 0, 0};
@@ -303,13 +310,16 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
    // Second loop of Born energy computation.
-    for (int blockStart = start; blockStart < end; blockStart += 4) {
+    while (true) {
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
        fvec4 bornForce(0.0f);
        for (int i = 0; i < numThreads; i++)
            bornForce += fvec4(&threadBornForces[i][blockStart]);
        fvec4 radii(&bornRadii[blockStart]);
        bornForce *= radii*radii*fvec4(&obcChain[blockStart]);
-        int numInBlock = min(4, end-blockStart);
+        int numInBlock = min(4, numParticles-blockStart);
        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
        float atomRadius[4], atomx[4], atomy[4], atomz[4];
        int blockMask[4] = {0, 0, 0, 0};
@@ -351,15 +361,14 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
            fvec4 logRatio = fastLog(u_ij/l_ij);
            fvec4 t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse;
            fvec4 de = bornForce*t3*rInverse;
+            de = blend(0.0f, de, include);
            fvec4 result[4] = {dx*de, dy*de, dz*de, 0.0f};
            transpose(result[0], result[1], result[2], result[3]);
            fvec4 atomForce(forces+4*atomJ);
            for (int j = 0; j < 4; j++) {
-                if (include[j]) {
                blockAtomForce[j] += result[j];
                atomForce -= result[j];
            }
-            }
            atomForce.store(forces+4*atomJ);
        }
        for (int i = 0; i < numInBlock; i++) {
@@ -385,21 +394,16 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
 fvec4 CpuGBSAOBCForce::fastLog(fvec4 x) {
    // Evaluate log(x) using a lookup table for speed.
-    float y[4];
+    if (any((x < TABLE_MIN) | (x >= TABLE_MAX)))
-    fvec4 x1 = (x-0.5f)*logDXInv;
+        return fvec4(logf(x[0]), logf(x[1]), logf(x[2]), logf(x[3]));
+    fvec4 x1 = (x-TABLE_MIN)*logDXInv;
    ivec4 index = floor(x1);
-    fvec4 coeff[4];
+    fvec4 coeff2 = x1-index;
-    coeff[1] = x1-index;
+    fvec4 coeff1 = 1.0f-coeff2;
-    coeff[0] = 1.0f-coeff[1];
+    fvec4 t1(&logTable[index[0]]);
-    coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
+    fvec4 t2(&logTable[index[1]]);
-    coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
+    fvec4 t3(&logTable[index[2]]);
-    transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
+    fvec4 t4(&logTable[index[3]]);
-    static float maxdiff = 0.0f;
+    transpose(t1, t2, t3, t4);
-    for (int i = 0; i < 4; i++) {
+    return coeff1*t1 + coeff2*t2;
-        if (index[i] >= 0 && index[i] < NUM_TABLE_POINTS)
-            y[i] = dot4(coeff[i], fvec4(&logTable[4*index[i]]));
-        else
-            y[i] = logf(x[i]);
-    }
-    return fvec4(y);
 }
--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -81,16 +81,16 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
    // Convert the positions to single precision and apply periodic boundary conditions
-    vector<float>& posq = data.posq;
+    AlignedArray<float>& posq = data.posq;
    vector<RealVec>& posData = extractPositions(context);
    RealVec boxSize = extractBoxSize(context);
-    float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
+    double invBoxSize[3] = {1/boxSize[0], 1/boxSize[1], 1/boxSize[2]};
    int numParticles = context.getSystem().getNumParticles();
    if (data.isPeriodic)
        for (int i = 0; i < numParticles; i++)
            for (int j = 0; j < 3; j++) {
                RealOpenMM x = posData[i][j];
-                double base = floor(x/boxSize[j])*boxSize[j];
+                double base = floor(x*invBoxSize[j])*boxSize[j];
                posq[4*i+j] = (float) (x-base);
            }
    else
@@ -255,12 +255,12 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            }
        }
    }
-    vector<float>& posq = data.posq;
+    AlignedArray<float>& posq = data.posq;
    vector<RealVec>& posData = extractPositions(context);
    vector<RealVec>& forceData = extractForces(context);
    RealVec boxSize = extractBoxSize(context);
    float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
-    double energy = ewaldSelfEnergy;
+    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
    bool ewald  = (nonbondedMethod == Ewald);
    bool pme  = (nonbondedMethod == PME);
    if (nonbondedMethod != NoCutoff) {
@@ -330,7 +330,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
            Vec3 periodicBoxSize(boxSize[0], boxSize[1], boxSize[2]);
            optimizedPme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, periodicBoxSize, includeEnergy);
-            optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
+            nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
        }
        else
            nonbonded.calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -48,6 +48,8 @@ const int CpuNeighborList::BlockSize = 4;
 class VoxelIndex 
 {
 public:
+    VoxelIndex() : x(0), y(0) {
+    }
    VoxelIndex(int x, int y) : x(x), y(y) {
    }
    int x;
@@ -173,6 +175,9 @@ public:
        float centerPos[4];
        blockCenter.store(centerPos);
        VoxelIndex centerVoxelIndex = getVoxelIndex(centerPos);
+        VoxelIndex atomVoxelIndex[BlockSize];
+        for (int i = 0; i < (int) blockAtoms.size(); i++)
+            atomVoxelIndex[i] = getVoxelIndex(&atomLocations[4*blockAtoms[i]]);
        int startx = centerVoxelIndex.x-dIndexX;
        int starty = centerVoxelIndex.y-dIndexY;
        int endx = centerVoxelIndex.x+dIndexX;
@@ -194,39 +199,59 @@ public:
            voxelIndex.x = x;
            if (usePeriodic)
                voxelIndex.x = (x < 0 ? x+nx : (x >= nx ? x-nx : x));
-            float dx = max(0.0f, voxelSizeX*max(0, abs(centerVoxelIndex.x-x)-1)-blockWidth[0]);
            for (int y = starty; y <= endy; ++y) {
                voxelIndex.y = y;
                if (usePeriodic)
                    voxelIndex.y = (y < 0 ? y+ny : (y >= ny ? y-ny : y));
-                float dy = max(0.0f, voxelSizeY*max(0, abs(centerVoxelIndex.y-y)-1)-blockWidth[1]);
                // Identify the range of atoms within this bin we need to search.  When using periodic boundary
                // conditions, there may be two separate ranges.
-                float dz = maxDistance+blockWidth[2];
+                float minz = centerPos[2];
-                dz = sqrtf(max(0.0f, dz*dz-dx*dx-dy*dy));
+                float maxz = centerPos[2];
+                fvec4 offset(voxelSizeX*x+(usePeriodic ? 0.0f : minx), voxelSizeY*y+(usePeriodic ? 0.0f : miny), 0, 0);
+                for (int k = 0; k < (int) blockAtoms.size(); k++) {
+                    const float* atomPos = &atomLocations[4*blockAtoms[k]];
+                    fvec4 posVec(atomPos);
+                    fvec4 delta1 = offset-posVec;
+                    fvec4 delta2 = delta1+fvec4(voxelSizeX, voxelSizeY, 0, 0);
+                    if (usePeriodic) {
+                        delta1 -= round(delta1*invBoxSize)*boxSize;
+                        delta2 -= round(delta2*invBoxSize)*boxSize;
+                    }
+                    fvec4 delta = min(abs(delta1), abs(delta2));
+                    float dx = (x == atomVoxelIndex[k].x ? 0.0f : delta[0]);
+                    float dy = (y == atomVoxelIndex[k].y ? 0.0f : delta[1]);
+                    float dist2 = maxDistanceSquared-dx*dx-dy*dy;
+                    if (dist2 > 0) {
+                        float dist = sqrtf(dist2);
+                        minz = min(minz, atomPos[2]-dist);
+                        maxz = max(maxz, atomPos[2]+dist);
+                    }
+                }
+                if (minz == maxz)
+                    continue;
                bool needPeriodic = (centerPos[0]-blockWidth[0] < maxDistance || centerPos[0]+blockWidth[0] > periodicBoxSize[0]-maxDistance ||
                                     centerPos[1]-blockWidth[1] < maxDistance || centerPos[1]+blockWidth[1] > periodicBoxSize[1]-maxDistance ||
-                                     centerPos[2]-dz < 0.0f || centerPos[2]+dz > periodicBoxSize[2]);
+                                     minz < 0.0f || maxz > periodicBoxSize[2]);
                int rangeStart[2];
                int rangeEnd[2];
-                rangeStart[0] = findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz);
+                rangeStart[0] = findLowerBound(voxelIndex.x, voxelIndex.y, minz);
                if (needPeriodic) {
                    numRanges = 2;
-                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
+                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, maxz);
                    if (rangeStart[0] > 0) {
                        rangeStart[1] = 0;
-                        rangeEnd[1] = min(findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz-periodicBoxSize[2]), rangeStart[0]);
+                        rangeEnd[1] = min(findUpperBound(voxelIndex.x, voxelIndex.y, maxz-periodicBoxSize[2]), rangeStart[0]);
                    }
                    else {
-                        rangeStart[1] = max(findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz+periodicBoxSize[2]), rangeEnd[0]);
+                        rangeStart[1] = max(findLowerBound(voxelIndex.x, voxelIndex.y, minz+periodicBoxSize[2]), rangeEnd[0]);
                        rangeEnd[1] = bins[voxelIndex.x][voxelIndex.y].size();
                    }
                }
                else {
                    numRanges = 1;
-                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
+                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, maxz);
                }
                // Loop over atoms and check to see if they are neighbors of this block.
@@ -307,7 +332,7 @@ public:
 CpuNeighborList::CpuNeighborList() {
 }
-void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions,
+void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
    int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
    blockNeighbors.resize(numBlocks);
@@ -353,8 +378,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
    if (!usePeriodic)
        edgeSizeX = edgeSizeY = maxDistance; // TODO - adjust this as needed
    else {
-        edgeSizeX = 0.5f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
+        edgeSizeX = 0.6f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
-        edgeSizeY = 0.5f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
+        edgeSizeY = 0.6f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
    }
    Voxels voxels(edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
    for (int i = 0; i < numAtoms; i++) {

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -29,8 +29,8 @@
 #include "CpuNonbondedForce.h"
 #include "ReferenceForce.h"
 #include "ReferencePME.h"
-#include "openmm/internal/SplineFitter.h"
 #include "openmm/internal/vectorize.h"
+#include "gmx_atomic.h"
 // In case we're using some primitive version of Visual Studio this will
 // make sure that erf() and erfc() are defined.
@@ -40,7 +40,7 @@ using namespace std;
 using namespace OpenMM;
 const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
-const int CpuNonbondedForce::NUM_TABLE_POINTS = 1025;
+const int CpuNonbondedForce::NUM_TABLE_POINTS = 2048;
 class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
 public:
@@ -58,10 +58,10 @@ public:
   --------------------------------------------------------------------------------------- */
-CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false) {
+CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false) {
 }
-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------
   Set the force to use a cutoff.
@@ -71,8 +71,9 @@ CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), period
     --------------------------------------------------------------------------------------- */
-  void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
+void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
+    if (distance != cutoffDistance)
+        tableIsValid = false;
    cutoff = true;
    cutoffDistance = distance;
    neighborList = &neighbors;
@@ -127,6 +128,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */
  void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
+      if (alpha != alphaEwald)
+          tableIsValid = false;
      alphaEwald = alpha;
      numRx = kmaxx;
      numRy = kmaxy;
@@ -145,6 +148,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */
  void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
+      if (alpha != alphaEwald)
+          tableIsValid = false;
      alphaEwald = alpha;
      meshDim[0] = meshSize[0];
      meshDim[1] = meshSize[1];
@@ -155,24 +160,16 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
 void CpuNonbondedForce::tabulateEwaldScaleFactor() {
-    ewaldDX = cutoffDistance/(NUM_TABLE_POINTS-2);
+    if (tableIsValid)
+        return;
+    tableIsValid = true;
+    ewaldDX = cutoffDistance/NUM_TABLE_POINTS;
    ewaldDXInv = 1.0f/ewaldDX;
-    vector<double> x(NUM_TABLE_POINTS+1);
+    ewaldScaleTable.resize(NUM_TABLE_POINTS+4);
-    vector<double> y(NUM_TABLE_POINTS+1);
+    for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
-    vector<double> deriv;
+        double r = i*ewaldDX;
-    for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
-        double r = i*cutoffDistance/(NUM_TABLE_POINTS-2);
        double alphaR = alphaEwald*r;
-        x[i] = r;
+        ewaldScaleTable[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
-        y[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
-    }
-    SplineFitter::createNaturalSpline(x, y, deriv);
-    ewaldScaleTable.resize(4*NUM_TABLE_POINTS);
-    for (int i = 0; i < NUM_TABLE_POINTS; i++) {
-        ewaldScaleTable[4*i] = (float) y[i];
-        ewaldScaleTable[4*i+1] = (float) y[i+1];
-        ewaldScaleTable[4*i+2] = (float) (deriv[i]*ewaldDX*ewaldDX/6);
-        ewaldScaleTable[4*i+3] = (float) (deriv[i+1]*ewaldDX*ewaldDX/6);
    }
 }
@@ -291,7 +288,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
 void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
-                const vector<set<int> >& exclusions, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+                const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    this->numberOfAtoms = numberOfAtoms;
@@ -302,6 +299,9 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    this->threadForce = &threadForce;
    includeEnergy = (totalEnergy != NULL);
    threadEnergy.resize(threads.getNumThreads());
+    gmx_atomic_t counter;
+    gmx_atomic_set(&counter, 0);
+    this->atomicCounter = &counter;
    // Signal the threads to start running and wait for them to finish.
@@ -332,8 +332,12 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    if (ewald || pme) {
        // Compute the interactions from the neighbor list.
-        for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads)
+        while (true) {
-            calculateBlockEwaldIxn(i, forces, energyPtr, boxSize, invBoxSize);
+            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (nextBlock >= neighborList->getNumBlocks())
+                break;
+            calculateBlockEwaldIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
+        }
        // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
@@ -367,13 +371,20 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    else if (cutoff) {
        // Compute the interactions from the neighbor list.
-        for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads)
+        while (true) {
-            calculateBlockIxn(i, forces, energyPtr, boxSize, invBoxSize);
+            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (nextBlock >= neighborList->getNumBlocks())
+                break;
+            calculateBlockIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
+        }
    }
    else {
        // Loop over all atom pairs
-        for (int i = threadIndex; i < numberOfAtoms; i += numThreads){
+        while (true) {
+            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (i >= numberOfAtoms)
+                break;
            for (int j = i+1; j < numberOfAtoms; j++)
                if (exclusions[j].find(i) == exclusions[j].end())
                    calculateOneIxn(i, j, forces, energyPtr, boxSize, invBoxSize);
@@ -461,12 +472,12 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
                    break;
                }
    }
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    // Loop over neighbors for this block.
    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    bool include[4];
    for (int i = 0; i < (int) neighbors.size(); i++) {
        // Load the next neighbor.
@@ -475,43 +486,50 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
        // Compute the distances to the block atoms.
-        bool any = false;
        fvec4 dx, dy, dz, r2;
        getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
-        for (int j = 0; j < 4; j++) {
+        ivec4 include;
-            include[j] = (((exclusions[i]>>j)&1) == 0 && (!cutoff || r2[j] < cutoffDistance*cutoffDistance));
+        char excl = exclusions[i];
-            any |= include[j];
+        if (excl == 0)
-        }
+            include = -1;
-        if (!any)
+        else
+            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
+        include = include & (r2 < cutoffDistance*cutoffDistance);
+        if (!any(include))
            continue; // No interactions to compute.
        // Compute the interactions.
        fvec4 r = sqrt(r2);
        fvec4 inverseR = fvec4(1.0f)/r;
-        fvec4 switchValue(1.0f), switchDeriv(0.0f);
+        fvec4 energy, dEdR;
-        if (useSwitch) {
+        float atomEpsilon = atomParameters[atom].second;
-            fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
+        if (atomEpsilon != 0.0f) {
-            switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
-            switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
-        }
            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
            fvec4 sig2 = inverseR*sig;
            sig2 *= sig2;
            fvec4 sig6 = sig2*sig2*sig2;
-        fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
+            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
-        fvec4 dEdR = switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
        if (cutoff)
            dEdR += chargeProd*(inverseR-2.0f*krf*r2);
        else
            dEdR += chargeProd*inverseR;
        dEdR *= inverseR*inverseR;
-        fvec4 energy = eps*(sig6-1.0f)*sig6;
-        if (useSwitch) {
-            dEdR -= energy*switchDeriv*inverseR;
-            energy *= switchValue;
-        }
        // Accumulate energies.
@@ -520,22 +538,20 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
                energy += chargeProd*(inverseR+krf*r2-crf);
            else
                energy += chargeProd*inverseR;
-            for (int j = 0; j < 4; j++)
+            energy = blend(0.0f, energy, include);
-                if (include[j])
+            *totalEnergy += dot4(energy, 1.0f);
-                    *totalEnergy += energy[j];
        }
        // Accumulate forces.
+        dEdR = blend(0.0f, dEdR, include);
        fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
        transpose(result[0], result[1], result[2], result[3]);
        fvec4 atomForce(forces+4*atom);
        for (int j = 0; j < 4; j++) {
-            if (include[j]) {
            blockAtomForce[j] += result[j];
            atomForce -= result[j];
        }
-        }
        atomForce.store(forces+4*atom);
    }
@@ -569,12 +585,12 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
                needPeriodic = true;
                break;
            }
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    // Loop over neighbors for this block.
    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    bool include[4];
    for (int i = 0; i < (int) neighbors.size(); i++) {
        // Load the next neighbor.
@@ -583,61 +599,66 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
        // Compute the distances to the block atoms.
-        bool any = false;
        fvec4 dx, dy, dz, r2;
        getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
-        for (int j = 0; j < 4; j++) {
+        ivec4 include;
-            include[j] = (((exclusions[i]>>j)&1) == 0 && r2[j] < cutoffDistance*cutoffDistance);
+        char excl = exclusions[i];
-            any |= include[j];
+        if (excl == 0)
-        }
+            include = -1;
-        if (!any)
+        else
+            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
+        include = include & (r2 < cutoffDistance*cutoffDistance);
+        if (!any(include))
            continue; // No interactions to compute.
        // Compute the interactions.
        fvec4 r = sqrt(r2);
        fvec4 inverseR = fvec4(1.0f)/r;
-        fvec4 switchValue(1.0f), switchDeriv(0.0f);
+        fvec4 energy, dEdR;
-        if (useSwitch) {
+        float atomEpsilon = atomParameters[atom].second;
-            fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
+        if (atomEpsilon != 0.0f) {
-            switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
-            switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
-        }
-        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
-        fvec4 dEdR = chargeProd*inverseR*ewaldScaleFunction(r);
            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
            fvec4 sig2 = inverseR*sig;
            sig2 *= sig2;
            fvec4 sig6 = sig2*sig2*sig2;
-        fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
+            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
-        dEdR += switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
-        dEdR *= inverseR*inverseR;
+            energy = epsSig6*(sig6-1.0f);
-        fvec4 energy = eps*(sig6-1.0f)*sig6;
            if (useSwitch) {
-            dEdR -= energy*switchDeriv*inverseR;
+                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
                energy *= switchValue;
            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
+        dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
+        dEdR *= inverseR*inverseR;        
        // Accumulate energies.
        if (totalEnergy) {
            energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
-            for (int j = 0; j < 4; j++)
+            energy = blend(0.0f, energy, include);
-                if (include[j])
+            *totalEnergy += dot4(energy, 1.0f);
-                    *totalEnergy += energy[j];
        }
        // Accumulate forces.
+        dEdR = blend(0.0f, dEdR, include);
        fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
        transpose(result[0], result[1], result[2], result[3]);
        fvec4 atomForce(forces+4*atom);
        for (int j = 0; j < 4; j++) {
-            if (include[j]) {
            blockAtomForce[j] += result[j];
            atomForce -= result[j];
        }
-        }
        atomForce.store(forces+4*atom);
    }
@@ -683,18 +704,14 @@ fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
 fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
-    float y[4];
    fvec4 x1 = x*ewaldDXInv;
-    ivec4 index = floor(x1);
+    ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
-    fvec4 coeff[4];
+    fvec4 coeff2 = x1-index;
-    coeff[1] = x1-index;
+    fvec4 coeff1 = 1.0f-coeff2;
-    coeff[0] = 1.0f-coeff[1];
+    fvec4 t1(&ewaldScaleTable[index[0]]);
-    coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
+    fvec4 t2(&ewaldScaleTable[index[1]]);
-    coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
+    fvec4 t3(&ewaldScaleTable[index[2]]);
-    transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
+    fvec4 t4(&ewaldScaleTable[index[3]]);
-    for (int i = 0; i < 4; i++) {
+    transpose(t1, t2, t3, t4);
-        if (index[i] < NUM_TABLE_POINTS)
+    return coeff1*t1 + coeff2*t2;
-            y[i] = dot4(coeff[i], fvec4(&ewaldScaleTable[4*index[i]]));
-    }
-    return fvec4(y);
 }
--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -32,6 +32,8 @@
 #include "CpuPlatform.h"
 #include "CpuKernelFactory.h"
 #include "CpuKernels.h"
+#include "CpuSETTLE.h"
+#include "ReferenceConstraints.h"
 #include "openmm/internal/hardware.h"
 using namespace OpenMM;
@@ -77,6 +79,12 @@ void CpuPlatform::contextCreated(ContextImpl& context, const map<string, string>
    ReferencePlatform::contextCreated(context, properties);
    PlatformData* data = new PlatformData(context.getSystem().getNumParticles());
    contextData[&context] = data;
+    ReferenceConstraints& constraints = *(ReferenceConstraints*) reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData())->constraints;
+    if (constraints.settle != NULL) {
+        CpuSETTLE* parallelSettle = new CpuSETTLE(context.getSystem(), *(ReferenceSETTLEAlgorithm*) constraints.settle, data->threads);
+        delete constraints.settle;
+        constraints.settle = parallelSettle;
+    }
 }
 void CpuPlatform::contextDestroyed(ContextImpl& context) const {
@@ -89,8 +97,7 @@ CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {
    return *contextData[&context];
 }
-CpuPlatform::PlatformData::PlatformData(int numParticles) {
+CpuPlatform::PlatformData::PlatformData(int numParticles) : posq(4*numParticles) {
-    posq.resize(4*numParticles);
    int numThreads = threads.getNumThreads();
    threadForce.resize(numThreads);
    for (int i = 0; i < numThreads; i++)

--- a/platforms/cpu/src/CpuSETTLE.cpp
+++ b/platforms/cpu/src/CpuSETTLE.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "CpuSETTLE.h"
+using namespace OpenMM;
+using namespace std;
+/**
+ * Thread pool task that applies SETTLE to positions. Each worker thread runs the
+ * ReferenceSETTLEAlgorithm stored at its own thread index in threadSettle.
+ */
+class CpuSETTLE::ApplyToPositionsTask : public ThreadPool::Task {
+public:
+    ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses,
+            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP),
+            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        // The CpuSETTLE constructor only creates an entry for a thread that was assigned
+        // at least one cluster, so threadSettle may hold fewer entries than there are
+        // threads. A thread whose index is past the end has no work to do.
+        if (threadIndex < (int) threadSettle.size())
+            threadSettle[threadIndex]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
+    }
+    vector<OpenMM::RealVec>& atomCoordinates;
+    vector<OpenMM::RealVec>& atomCoordinatesP;
+    vector<RealOpenMM>& inverseMasses;
+    RealOpenMM tolerance;
+    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
+};
+/**
+ * Thread pool task that applies SETTLE to velocities. Each worker thread runs the
+ * ReferenceSETTLEAlgorithm stored at its own thread index in threadSettle.
+ */
+class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task {
+public:
+    ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses,
+            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities),
+            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        // The CpuSETTLE constructor only creates an entry for a thread that was assigned
+        // at least one cluster, so threadSettle may hold fewer entries than there are
+        // threads. A thread whose index is past the end has no work to do.
+        if (threadIndex < (int) threadSettle.size())
+            threadSettle[threadIndex]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
+    }
+    vector<OpenMM::RealVec>& atomCoordinates;
+    vector<OpenMM::RealVec>& velocities;
+    vector<RealOpenMM>& inverseMasses;
+    RealOpenMM tolerance;
+    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
+};
+/**
+ * Split the clusters of an existing ReferenceSETTLEAlgorithm into contiguous ranges,
+ * one per thread, and build a separate ReferenceSETTLEAlgorithm for every non-empty range.
+ * Threads whose range is empty get no entry, so threadSettle can end up shorter than
+ * the number of threads.
+ */
+CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) {
+    const int threadCount = threads.getNumThreads();
+    const int clusterCount = settle.getNumClusters();
+
+    // Every per-thread algorithm is given the masses of all particles in the system.
+    vector<RealOpenMM> mass(system.getNumParticles());
+    for (int particle = 0; particle < system.getNumParticles(); particle++)
+        mass[particle] = system.getParticleMass(particle);
+
+    for (int thread = 0; thread < threadCount; thread++) {
+        // Half-open range [first, last) of cluster indices owned by this thread.
+        const int first = thread*clusterCount/threadCount;
+        const int last = (thread+1)*clusterCount/threadCount;
+        if (first == last)
+            continue;
+        const int count = last-first;
+        vector<int> atom1(count), atom2(count), atom3(count);
+        vector<RealOpenMM> distance1(count), distance2(count);
+        for (int k = 0; k < count; k++)
+            settle.getClusterParameters(first+k, atom1[k], atom2[k], atom3[k], distance1[k], distance2[k]);
+        threadSettle.push_back(new ReferenceSETTLEAlgorithm(atom1, atom2, atom3, distance1, distance2, mass));
+    }
+}
+/**
+ * Free the per-thread ReferenceSETTLEAlgorithm objects allocated by the constructor.
+ */
+CpuSETTLE::~CpuSETTLE() {
+    const int count = (int) threadSettle.size();
+    int next = 0;
+    while (next < count) {
+        delete threadSettle[next];
+        next++;
+    }
+}
+// Apply the SETTLE constraints to positions by running an ApplyToPositionsTask on the thread pool.
+void CpuSETTLE::apply(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
+    ApplyToPositionsTask task(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance, threadSettle);
+    threads.execute(task);
+    // task is a local object, so wait for the pool before returning and destroying it.
+    threads.waitForThreads();
+}
+// Apply the SETTLE constraints to velocities by running an ApplyToVelocitiesTask on the thread pool.
+void CpuSETTLE::applyToVelocities(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
+    ApplyToVelocitiesTask task(atomCoordinates, velocities, inverseMasses, tolerance, threadSettle);
+    threads.execute(task);
+    // task is a local object, so wait for the pool before returning and destroying it.
+    threads.waitForThreads();
+}
--- a/platforms/cpu/src/gmx_atomic.h
+++ b/platforms/cpu/src/gmx_atomic.h
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- 
+*
+* Copyright (c) 2004-2008, Erik Lindahl <lindahl@cbr.su.se>
+*
+*  Unfortunately, some of the constructs in this file are _very_ sensitive
+*  to compiler optimizations and architecture changes. If you find any such
+*  errors, please send a message to lindahl@cbr.su.se to help us fix the
+*  upstream version too.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+* 
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+* 
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*
+* And Hey:
+* Gnomes, ROck Monsters And Chili Sauce
+*/
+#ifndef _GMX_ATOMIC_H_
+#define _GMX_ATOMIC_H_
+/*! \file gmx_atomic.h
+ *
+ *  @brief Atomic operations for fast SMP synchronization
+ *
+ *  This file defines atomic integer operations and spinlocks for 
+ *  fast synchronization in performance-critical regions of gromacs.
+ *
+ *  In general, the best option is to use functions without explicit 
+ *  locking, e.g. gmx_atomic_fetch_add() or gmx_atomic_cmpxchg().
+ *
+ *  Not all architectures support atomic operations through inline assembly,
+ *  and even if they do it might not be implemented here. In that case
+ *  we use a fallback mutex implementation, so you can always count on
+ *  the function interfaces working in Gromacs.
+ *
+ *  Don't use spinlocks in non-performance-critical regions like file I/O.
+ *  Since they always spin busy they would waste CPU cycles instead of 
+ *  properly yielding to a computation thread while waiting for the disk.
+ *
+ *  Finally, note that all our spinlock operations are defined to return
+ *  0 if initialization or locking completes successfully.
+ *  This is the opposite of some other implementations, but the same standard
+ *  as used for pthread mutexes. So, if e.g. you are trying to lock a spinlock,
+ *  you will have gotten the lock if the return value is 0.
+ * 
+ *  gmx_spinlock_islocked(x) obviously still returns 1 if the lock is locked,
+ *  and 0 if it is available, though...
+ */
+#include <stdio.h>
+#include <pthread.h>
+#ifdef __cplusplus
+extern "C" 
+{  
+#endif
+#if 0
+} /* Avoids screwing up auto-indentation */
+#endif
+#if ( ( (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__)) && \
+        (defined(i386) || defined(__x86_64__)) )                                      \
+      || defined (DOXYGEN) )
+/* This code is executed for x86 and x86-64, with these compilers:
+ * GNU
+ * Intel 
+ * Pathscale
+ * All these support GCC-style inline assembly. 
+ * We also use this section for the documentation.
+ */
+/*! \brief Memory barrier operation
+ *
+ *  Modern CPUs rely heavily on out-of-order execution, and one common feature
+ *  is that load/stores might be reordered. Also, when using inline assembly
+ *  the compiler might already have loaded the variable we are changing into
+ *  a register, so any update to memory won't be visible.
+ *
+ *  This command creates a memory barrier, i.e. all memory results before
+ *  it in the code should be visible to all memory operations after it - the
+ *  CPU cannot propagate load/stores across it.
+ */
+#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")
+/* Only gcc and Intel support this check, otherwise set it to true (skip doc) */
+#if (!defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined DOXYGEN)
+#define __builtin_constant_p(i) (1)
+#endif
+/*! \brief Gromacs atomic operations datatype
+ *
+ *  Portable synchronization primitives like mutexes are effective for
+ *  many purposes, but usually not very high performance.
+ *  One of the problems is that you have the overhead of a function call,
+ *  and another is that Mutexes often have extra overhead to make the
+ *  scheduling fair. Finally, if performance is important we don't want
+ *  to suspend the thread if we cannot lock a mutex, but spin-lock at 100%
+ *  CPU usage until the resource is available (e.g. increment a counter).
+ *
+ *  These things can often be implemented with inline-assembly or other
+ *  system-dependent functions, and we provide such functionality for the
+ *  most common platforms. For portability we also have a fallback 
+ *  implementation using a mutex for locking.
+ *
+ *  Performance-wise, the fastest solution is always to avoid locking 
+ *  completely (obvious, but remember it!). If you cannot do that, the
+ *  next best thing is to use atomic operations that e.g. increment a
+ *  counter without explicit locking. Spinlocks are useful to lock an
+ *  entire region, but leads to more overhead and can be difficult to
+ *  debug - it is up to you to make sure that only the thread owning the
+ *  lock unlocks it!
+ *
+ *  You should normally NOT use atomic operations for things like 
+ *  I/O threads. These should yield to other threads while waiting for 
+ *  the disk instead of spinning at 100% CPU usage.
+ *
+ *  It is imperative that you use the provided routines for reading
+ *  and writing, since some implementations require memory barriers before
+ *  the CPU or memory sees an updated result. The structure contents is
+ *  only visible here so it can be inlined for performance - it might
+ *  change without further notice.
+ *
+ *  \note No initialization is required for atomic variables.
+ *
+ *  Currently, we have (real) atomic operations for:
+ *
+ *  - x86 or x86_64, using GNU compilers
+ *  - x86 or x86_64, using Intel compilers 
+ *  - x86 or x86_64, using Pathscale compilers
+ *  - Itanium, using GNU compilers 
+ *  - Itanium, using Intel compilers
+ *  - Itanium, using HP compilers
+ *  - PowerPC, using GNU compilers 
+ *  - PowerPC, using IBM AIX compilers 
+ *  - PowerPC, using IBM compilers >=7.0 under Linux or Mac OS X.
+ */
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+/*! \brief Gromacs spinlock
+ *
+ *  Spinlocks provide a faster synchronization than mutexes,
+ *  although they consume CPU-cycles while waiting. They are implemented
+ *  with atomic operations and inline assembly whenever possible, and
+ *  otherwise we use a fallback implementation where a spinlock is identical
+ *  to a mutex (this is one of the reasons why you have to initialize them).
+ *
+ *  There are no guarantees whatsoever about fair scheduling or
+ *  debugging if you make a mistake and unlock a variable somebody
+ *  else has locked - performance is the primary goal of spinlocks.
+ *
+ */
+typedef struct gmx_spinlock
+{
+    volatile unsigned int   lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+/*! \brief Spinlock static initializer
+ *
+ *  This is used for static spinlock initialization, and has the same
+ *  properties as GMX_THREAD_MUTEX_INITIALIZER has for mutexes.
+ *  This is only for inlining in the gmx_thread.h header file. Whether
+ *  it is 0, 1, or something else when unlocked depends on the platform.
+ *  Don't assume anything about it. It might even be a mutex when using the
+ * fallback implementation!
+ */
+#define GMX_SPINLOCK_INITIALIZER   { 1 }
+/*! \brief Return value of an atomic integer 
+ *
+ *  Also implements proper memory barriers when necessary.
+ *  The actual implementation is system-dependent.
+ *
+ *  \param  a   Atomic variable to read
+ *  \return     Integer value of the atomic variable
+ */
+#define gmx_atomic_read(a)  ((a)->value) 
+/*! \brief Write value to an atomic integer 
+ *
+ *  Also implements proper memory barriers when necessary.
+ *  The actual implementation is system-dependent.
+ *
+ *  \param  a   Atomic variable
+ *  \param  i   Integer to set the atomic variable to.
+ */
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+/*! \brief Add integer to atomic variable
+ *
+ *  Also implements proper memory barriers when necessary.
+ *  The actual implementation is system-dependent.
+ *
+ *  \param a   atomic datatype to modify
+ *  \param i   integer to increment with. Use i<0 to subtract atomically.
+ *
+ *  \return The new value (after summation).
+ */
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *     a, 
+                      volatile int       i)
+{
+    /* xaddl replaces its register operand with the previous memory value,
+     * so keep a copy of the increment to compute the new value afterwards.
+     */
+    int __i;
+    __i = i;
+    /* The "memory" clobber (also used by gmx_atomic_cmpxchg) tells the compiler
+     * that memory is modified here, so it does not cache or reorder other
+     * memory accesses across the atomic update.
+     */
+    __asm__ __volatile__("lock ; xaddl %0, %1;"
+                         :"=r"(i) :"m"(a->value), "0"(i) : "memory");
+    return i + __i;
+}
+/*! \brief Add to variable, return the old value.
+ *
+ *  This operation is quite useful for synchronization counters.
+ *  By performing a fetchadd with N, a thread can e.g. reserve a chunk 
+ *  with the next N iterations, and the return value is the index
+ *  of the first element to treat.
+ *
+ *  Also implements proper memory barriers when necessary.
+ *  The actual implementation is system-dependent.
+ *
+ *  \param a   atomic datatype to modify
+ *  \param i   integer to increment with. Use i<0 to subtract atomically.
+ *
+ *  \return    The value of the atomic variable before addition.
+ */
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     volatile int       i)
+{
+    /* xaddl leaves the previous memory value in the register operand, which is
+     * exactly the value to return. The "memory" clobber (also used by
+     * gmx_atomic_cmpxchg) tells the compiler that memory is modified here, so
+     * it does not cache or reorder other memory accesses across the update.
+     */
+    __asm__ __volatile__("lock ; xaddl %0, %1;"
+                         :"=r"(i) :"m"(a->value), "0"(i) : "memory");
+    return i;
+}
+/*! \brief Atomic compare-exchange operation
+ *
+ *   The \a old value is compared with the memory value in the atomic datatype.
+ *   If they are identical, the atomic type is updated to the new value,
+ *   and otherwise left unchanged. 
+ *  
+ *   This is a very useful synchronization primitive: You can start by reading
+ *   a value (without locking anything), perform some calculations, and then
+ *   atomically try to update it in memory unless it has changed. If it has
+ *   changed you will get an error return code - reread the new value
+ *   and repeat the calculations in that case.
+ *
+ *   \param a        Atomic datatype ('memory' value)
+ *   \param oldval   Integer value read from the atomic type at an earlier point
+ *   \param newval   New value to write to the atomic type if it currently is
+ *                   identical to the old value.
+ *
+ *   \return The value of the atomic memory variable in memory when this 
+ *           instruction was executed. Thus, if the operation succeeded the
+ *           return value was identical to the \a old parameter, and if not
+ *           it returns the updated value in memory so you can repeat your
+ *           operations on it. 
+ *
+ *   \note   The exchange occurred if the return value is identical to \a old.
+ */
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *    a, 
+                   int               oldval,
+                   int               newval)
+{
+    volatile unsigned long prev;
+    __asm__ __volatile__("lock ; cmpxchgl %1,%2"
+                         : "=a"(prev)
+                         : "q"(newval), "m"(a->value), "0"(oldval)
+                         : "memory");
+    return prev;
+}
+/*! \brief Initialize spinlock
+ *
+ *  In theory you can call this from multiple threads, but remember
+ *  that we don't check for errors. If the first thread proceeded to
+ *  lock the spinlock after initialization, the second will happily
+ *  overwrite the contents and unlock it without warning you.
+ *
+ *  \param x      Gromacs spinlock pointer.
+ */
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *   x)
+{
+    x->lock = 1;
+}
+/*! \brief Acquire spinlock
+ *
+ *  This routine blocks until the spinlock is available, and
+ *  then locks it again before returning.
+ *
+ *  \param x     Gromacs spinlock pointer
+ */
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *  x)
+{
+	__asm__ __volatile__("\n1:\t" 
+						 "lock ; decb %0\n\t" 
+						 "jns 3f\n" 
+						 "2:\t" 
+						 "rep;nop\n\t" 
+						 "cmpb $0,%0\n\t" 
+						 "jle 2b\n\t" 
+						 "jmp 1b\n" 
+						 "3:\n\t" 
+						 :"=m" (x->lock) : : "memory"); 
+}
+/*! \brief Attempt to acquire spinlock
+ *
+ * This routine acquires the spinlock if possible, but if 
+ * already locked it returns an error code immediately.
+ *
+ *  \param x     Gromacs spinlock pointer
+ *
+ * \return 0 if the mutex was available so we could lock it,
+ *         otherwise a non-zero integer (1) if the lock is busy.
+ */
+static inline int
+gmx_spinlock_trylock(gmx_spinlock_t *  x)
+{
+	char old_value;
+    __asm__ __volatile__("xchgb %b0,%1"
+                         :"=q" (old_value), "=m" (x->lock)
+						 :"0" (0) : "memory");
+    return (old_value <= 0);
+}
+/*! \brief Release spinlock
+ *
+ *  \param x     Gromacs spinlock pointer
+ *
+ *  Unlocks the spinlock, regardless of which thread locked it.
+ */
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *  x)
+{
+	char old_value = 1;
+	__asm__ __volatile__(
+                         "xchgb %b0, %1" 
+                         :"=q" (old_value), "=m" (x->lock) 
+                         :"0" (old_value) : "memory"
+                         );
+}
+/*! \brief Check if spinlock is locked
+ *
+ *  This routine returns immediately with the lock status.
+ *
+ *  \param x  Gromacs spinlock pointer
+ *
+ *  \return 1 if the spinlock is locked, 0 otherwise.
+ */
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *  x)
+{
+    return (*(volatile signed char *)(&(x)->lock) <= 0);
+}
+/*! \brief Wait for a spinlock to become available
+ *
+ *  This routine blocks until the spinlock is unlocked, 
+ *  but in contrast to gmx_spinlock_lock() it returns without 
+ *  trying to lock the spinlock.
+ *
+ *  \param x  Gromacs spinlock pointer
+ */
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    do 
+    {
+        gmx_atomic_memory_barrier(); 
+    } 
+    while(gmx_spinlock_islocked(x));
+}
+#elif ( defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__)))
+/* PowerPC using proper GCC inline assembly. 
+ * Recent versions of xlC (>=7.0) _partially_ support this, but since it is
+ * not 100% compatible we provide a separate implementation for xlC in
+ * the next section.
+ */
+/* Compiler-dependent stuff: GCC memory barrier */
+#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+typedef struct gmx_spinlock
+{
+    volatile unsigned int   lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+#define GMX_SPINLOCK_INITIALIZER   { 0 }
+#define gmx_atomic_read(a)   ((a)->value) 
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+/* Atomically add i to a->value with a lwarx/stwcx. retry loop and return the
+ * new value (after summation).
+ */
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *    a, 
+                      int               i)
+{
+    int t;
+    /* Every instruction must end in a newline: without one after "bne- 1b"
+     * the string literals concatenate into a single malformed line
+     * "bne- 1b<tab>isync".
+     */
+    __asm__ __volatile__("1:     lwarx   %0,0,%2\n"
+                         "\tadd     %0,%1,%0\n"
+                         "\tstwcx.  %0,0,%2 \n"
+                         "\tbne-    1b\n"
+                         "\tisync\n"
+                         : "=&r" (t)
+                         : "r" (i), "r" (&a->value)
+                         : "cc" , "memory");
+    return t;
+}
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     int                i)
+{
+    int t;
+    __asm__ __volatile__("\teieio\n"
+                         "1:     lwarx   %0,0,%2\n"                         
+                         "\tadd     %0,%1,%0\n"
+                         "\tstwcx.  %0,0,%2 \n"
+                         "\tbne-    1b\n"
+                         "\tisync\n"
+                         : "=&r" (t)
+                         : "r" (i), "r" (&a->value)
+                         : "cc", "memory");
+    return (t - i);    
+}
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *       a,
+                   int                  oldval,
+                   int                  newval)
+{
+    int prev;
+    __asm__ __volatile__ ("1:    lwarx   %0,0,%2 \n"
+                          "\tcmpw    0,%0,%3 \n"
+                          "\tbne     2f \n"
+                          "\tstwcx.  %4,0,%2 \n"
+                          "bne-    1b\n"
+                          "\tsync\n"
+                          "2:\n"
+                          : "=&r" (prev), "=m" (a->value)
+                          : "r" (&a->value), "r" (oldval), "r" (newval), "m" (a->value)
+                          : "cc", "memory");
+    return prev;
+}
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *x)
+{
+    x->lock = 0;
+}
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *  x)
+{
+    unsigned int tmp;
+    __asm__ __volatile__("\tb      1f\n"
+                         "2:      lwzx    %0,0,%1\n"
+                         "\tcmpwi   0,%0,0\n"
+                         "\tbne+    2b\n"
+                         "1:      lwarx   %0,0,%1\n"
+                         "\tcmpwi   0,%0,0\n"
+                         "\tbne-    2b\n"
+                         "\tstwcx.  %2,0,%1\n"
+                         "bne-    2b\n"
+                         "\tisync\n"
+                         : "=&r"(tmp)
+                         : "r"(&x->lock), "r"(1)
+                         : "cr0", "memory");
+}
+static inline int
+gmx_spinlock_trylock(gmx_spinlock_t *  x)
+{
+    unsigned int old, t;
+    unsigned int mask = 1;
+    volatile unsigned int *p = &x->lock;
+    __asm__ __volatile__("\teieio\n"
+                         "1:      lwarx   %0,0,%4 \n"
+                         "\tor      %1,%0,%3 \n"
+                         "\tstwcx.  %1,0,%4 \n"
+                         "\tbne     1b\n"
+                         "\tsync\n"
+                         : "=&r" (old), "=&r" (t), "=m" (*p)
+                         : "r" (mask), "r" (p), "m" (*p)
+                         : "cc", "memory");
+    return ((old & mask) != 0);    
+}
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *  x)
+{
+    __asm__ __volatile__("\teieio\n": : :"memory");
+    x->lock = 0;
+}
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return ( x->lock != 0);
+}
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *x)
+{
+    do 
+    {
+        gmx_atomic_memory_barrier(); 
+    }
+    while(gmx_spinlock_islocked(x));
+}
+#elif ( (defined(__IBM_GCC_ASM) || defined(__IBM_STDCPP_ASM))  && \
+        (defined(__powerpc__) || defined(__ppc__)))
+/* PowerPC using xlC inline assembly. 
+ * Recent versions of xlC (>=7.0) _partially_ support GCC inline assembly
+ * if you use the option -qasm=gcc but we have had to hack things a bit, in 
+ * particular when it comes to clobbered variables. Since this implementation
+ * _could_ be buggy, we have separated it from the known-to-be-working gcc
+ * one above.
+ */
+/* memory barrier - no idea how to create one with xlc! */
+#define gmx_atomic_memory_barrier()
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+typedef struct gmx_spinlock
+{
+    volatile unsigned int   lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+#define GMX_SPINLOCK_INITIALIZER   { 0 }
+#define gmx_atomic_read(a)   ((a)->value) 
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *    a, 
+                      int               i)
+{
+    int t;
+	__asm__ __volatile__("1:     lwarx   %0,0,%2 \n"
+                         "\t add     %0,%1,%0 \n"
+                         "\t stwcx.  %0,0,%2 \n"
+                         "\t bne-    1b \n"
+                         "\t isync \n"
+                         : "=&r" (t)
+						 : "r" (i), "r" (&a->value) );
+    return t;
+}
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     int                i)
+{
+    int t;
+    __asm__ __volatile__("\t eieio\n"
+                         "1:     lwarx   %0,0,%2 \n"                         
+                         "\t add     %0,%1,%0 \n"
+                         "\t stwcx.  %0,0,%2 \n"
+                         "\t bne-    1b \n"
+                         "\t isync \n"
+                         : "=&r" (t)
+                         : "r" (i), "r" (&a->value));
+    return (t - i);    
+}
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *       a,
+                   int                  oldval,
+                   int                  newval)
+{
+    int prev;
+    __asm__ __volatile__ ("1:    lwarx   %0,0,%2 \n"
+                          "\t cmpw    0,%0,%3 \n"
+                          "\t bne     2f \n"
+                          "\t stwcx.  %4,0,%2 \n"
+                          "\t bne-    1b \n"
+                          "\t sync \n"
+                          "2: \n"
+                          : "=&r" (prev), "=m" (a->value)
+                          : "r" (&a->value), "r" (oldval), "r" (newval), "m" (a->value));
+    return prev;
+}
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *x)
+{
+    x->lock = 0;
+}
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *  x)
+{
+    unsigned int tmp;
+    __asm__ __volatile__("\t b      1f \n"
+                         "2:      lwzx    %0,0,%1 \n"
+                         "\t cmpwi   0,%0,0 \n"
+                         "\t bne+    2b \n"
+                         "1:      lwarx   %0,0,%1 \n"
+                         "\t cmpwi   0,%0,0 \n"
+                         "\t bne-    2b \n"
+                         "\t stwcx.  %2,0,%1 \n"
+                         "\t bne-    2b \n"
+                         "\t isync\n"
+                         : "=&r"(tmp)
+                         : "r"(&x->lock), "r"(1));
+}
+static inline int
+gmx_spinlock_trylock(gmx_spinlock_t *  x)
+{
+    unsigned int old, t;
+    unsigned int mask = 1;
+    volatile unsigned int *p = &x->lock;
+    __asm__ __volatile__("\t eieio\n"
+                         "1:      lwarx   %0,0,%4 \n"
+                         "\t or      %1,%0,%3 \n"
+                         "\t stwcx.  %1,0,%4 \n"
+                         "\t bne     1b \n"
+                         "\t sync \n"
+                         : "=&r" (old), "=&r" (t), "=m" (*p)
+                         : "r" (mask), "r" (p), "m" (*p));
+    return ((old & mask) != 0);    
+}
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *  x)
+{
+    __asm__ __volatile__("\t eieio \n");
+    x->lock = 0;
+}
+/* Return 1 if the spinlock is locked and 0 otherwise. The return type must be
+ * int (not void), since the value is used as the loop condition in
+ * gmx_spinlock_wait().
+ */
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    /* gmx_spinlock_init() and gmx_spinlock_unlock() store 0, so any non-zero value means locked */
+    return ( x->lock != 0);
+}
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    do 
+    {
+        gmx_atomic_memory_barrier();
+    }
+    while(gmx_spinlock_islocked(x));
+}
+#elif (defined(__ia64__) && (defined(__GNUC__) || defined(__INTEL_COMPILER)))
+/* ia64 with GCC or Intel compilers. Since we need to define everything through
+* cmpxchg and fetchadd on ia64, we merge the different compilers and only provide 
+* different implementations for that single function. 
+* Documentation? Check the gcc/x86 section.
+*/
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+typedef struct gmx_spinlock
+{
+    volatile unsigned int   lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+#define GMX_SPINLOCK_INITIALIZER   { 0 }
+#define gmx_atomic_read(a)   ((a)->value) 
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+/* Compiler thingies */
+#ifdef __INTEL_COMPILER
+void __memory_barrier(void);
+int _InterlockedCompareExchange(volatile int *dest, int xchg, int comp);
+unsigned __int64 __fetchadd4_rel(unsigned int *addend, const int increment);
+/* ia64 memory barrier */
+#  define gmx_atomic_memory_barrier() __memory_barrier()
+/* ia64 cmpxchg. The macro argument is parenthesized so that any pointer
+ * expression can be passed as 'a', not just a plain identifier. */
+#  define gmx_atomic_cmpxchg(a, oldval, newval) _InterlockedCompareExchange(&((a)->value),newval,oldval)
+/* ia64 fetchadd, but it only works with increments +/- 1,4,8,16 */
+#  define gmx_ia64_fetchadd(a, inc)  __fetchadd4_rel(a, inc)
+#elif defined __GNUC__  
+/* ia64 memory barrier */
+#  define gmx_atomic_memory_barrier() asm volatile ("":::"memory")
+/* ia64 cmpxchg */
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *   a,
+                   int              oldval,
+                   int              newval)
+{
+    volatile int res;
+    asm volatile ("mov ar.ccv=%0;;" :: "rO"(oldval));
+    asm volatile ("cmpxchg4.acq %0=[%1],%2,ar.ccv":                    
+                  "=r"(res) : "r"(&a->value), "r"(newval) : "memory"); 
+    return res;
+}
+/* fetchadd, but on ia64 it only works with increments +/- 1,4,8,16.
+ *
+ * Implemented as a GCC statement expression whose value is the register
+ * written by fetchadd4.rel. The increment uses the "i" constraint, so it
+ * has to be an immediate known at compile time. The macro is private to
+ * this section and is #undef'ed at its end.
+ */
+#define gmx_ia64_fetchadd(a, inc)                                             \
+({  unsigned long res;                                                        \
+    asm volatile ("fetchadd4.rel %0=[%1],%2"                                  \
+                  : "=r"(res) : "r"(a), "i" (inc) : "memory");                \
+                  res;                                                        \
+})
+#else /* Unknown compiler */
+#  error Unknown ia64 compiler (not GCC or ICC) - modify gmx_thread.h!
+#endif
+/* Add i to the atomic variable a and return the resulting (new) value.
+ *
+ * Two strategies are used: the ia64 fetchadd instruction for the small set
+ * of increments it supports, and a compare-exchange retry loop otherwise.
+ * NOTE(review): the fetchadd macro is handed the volatile local __i; the
+ * GCC version of that macro requires an immediate operand - confirm this
+ * branch compiles as intended.
+ */
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *       a, 
+                      volatile int         i)
+{
+    volatile int oldval,newval;    
+    volatile int __i = i;
+    /* Use fetchadd if, and only if, the increment value can be determined
+     * at compile time (otherwise this check is optimized away) and it is
+     * a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
+     */
+    if (__builtin_constant_p(i) &&
+        ( (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||         
+          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) ) )
+    {
+        /* fetchadd hands back the value from before the addition */
+        oldval = gmx_ia64_fetchadd(a,__i);
+        newval = oldval + i;
+    }
+    else
+    {
+        /* Use compare-exchange addition that works with any value:
+         * retry until no other thread changed the value between our
+         * read and our exchange.
+         */
+        do
+        {
+            oldval = gmx_atomic_read(a);
+            newval = oldval + i;
+        }
+        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
+    }
+    return newval;
+}
+/* Add i to the atomic variable a and return the value it held BEFORE the
+ * addition.
+ *
+ * Same two strategies as gmx_atomic_add_return(): fetchadd for the
+ * increments it supports, a compare-exchange retry loop otherwise.
+ * NOTE(review): the fetchadd macro is handed the volatile local __i; the
+ * GCC version of that macro requires an immediate operand - confirm this
+ * branch compiles as intended.
+ */
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     volatile int       i)
+{
+    volatile int oldval,newval;    
+    volatile int __i = i;
+    /* Use ia64 fetchadd if, and only if, the increment value can be determined
+     * at compile time (otherwise this check is optimized away) and it is
+     * a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
+     */
+    if (__builtin_constant_p(i) &&
+        ( (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||         
+          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) ) )
+    {
+        oldval = gmx_ia64_fetchadd(a,__i);
+        /* newval is computed but not returned by this function */
+        newval = oldval + i;
+    }
+    else
+    {
+        /* Use compare-exchange addition that works with any value:
+         * retry until no other thread changed the value between our
+         * read and our exchange.
+         */
+        do
+        {
+            oldval = gmx_atomic_read(a);
+            newval = oldval + i;
+        }
+        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
+    }
+    return oldval;
+}
+/* Put a spinlock into the unlocked state by clearing its lock word. */
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *x)
+{
+    (*x).lock = 0;
+}
+/* Acquire the spinlock, busy-waiting for as long as another thread holds it. */
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *   x)
+{
+    gmx_atomic_t *word = (gmx_atomic_t *) x;
+    /* Try to swap 0 -> 1; a nonzero result means the lock was already taken */
+    while (gmx_atomic_cmpxchg(word, 0, 1) != 0)
+    {
+        /* Spin on plain reads until the lock looks free, then try again */
+        while (word->value != 0)
+        {
+            gmx_atomic_memory_barrier();
+        }
+    }
+}
+/* Make a single attempt to take the lock.
+ * Returns 0 when the lock was acquired and 1 when it was already held.
+ */
+static inline int
+gmx_spinlock_trylock(gmx_spinlock_t *   x)
+{
+    gmx_atomic_t *word     = (gmx_atomic_t *)x;
+    int           previous = gmx_atomic_cmpxchg(word, 0, 1);
+    return (previous != 0);
+}
+/* Release the spinlock: issue the memory barrier, then clear the lock word. */
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *   x)
+{
+    gmx_atomic_memory_barrier();
+    x->lock = 0;
+}
+/* Report the lock state without changing it: 1 if held, 0 if free. */
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return x->lock ? 1 : 0;
+}
+/* Busy-wait until the spinlock is observed free; the lock is not acquired. */
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    for (;;)
+    {
+        gmx_atomic_memory_barrier();
+        if (!gmx_spinlock_islocked(x))
+        {
+            break;
+        }
+    }
+}
+#undef gmx_ia64_fetchadd
+#elif (defined(__hpux) || defined(__HP_cc)) && defined(__ia64)
+/* HP compiler on ia64 */
+#include <machine/sys/inline.h>
+/* Memory barrier through the HP compiler intrinsic */
+#define gmx_atomic_memory_barrier() _Asm_mf()
+/* Word-sized fetchadd with release semantics on the integer at address a.
+ * The load-hint constant is spelled _LDHINT_NONE, as everywhere else in this
+ * section, and the pointer cast uses plain unsigned int, matching the other
+ * intrinsic calls below. Private to this section (#undef'ed at its end).
+ */
+#define gmx_hpia64_fetchadd(a, i)                           \
+    _Asm_fetchadd((_Asm_fasz)_FASZ_W,(_Asm_sem)_SEM_REL,    \
+                  (unsigned int *)(a),(unsigned int)(i),    \
+                  (_Asm_ldhint)_LDHINT_NONE)
+/* Atomic integer. The payload is volatile so that the compiler reloads it
+ * from memory on every access instead of keeping a cached copy. */
+typedef struct gmx_atomic {
+    volatile int value;            /*!< The stored integer */
+} gmx_atomic_t;
+/* Spinlock word: zero when free, nonzero while held. */
+typedef struct gmx_spinlock {
+    volatile unsigned int lock;    /*!< Lock state */
+} gmx_spinlock_t;
+/* Compare-and-exchange on a->value using the HP ia64 intrinsics.
+ *
+ * oldval is placed in the ar.ccv register and newval is passed to the
+ * word-sized cmpxchg with acquire semantics. Returns the value delivered by
+ * the intrinsic; callers in this file treat a return equal to oldval as
+ * "exchange done".
+ *
+ * The size selector is _SZ_W and the integer casts use unsigned int, in line
+ * with the other intrinsic calls in this section.
+ */
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *   a,
+                   int              oldval,
+                   int              newval)
+{
+    int ret;
+    /* Stage the expected value in ar.ccv, fenced against reordering */
+    _Asm_mov_to_ar((_Asm_app_reg)_AREG_CCV,(unsigned int)oldval,
+                   (_Asm_fence)(_UP_CALL_FENCE | _UP_SYS_FENCE |
+                                _DOWN_CALL_FENCE | _DOWN_SYS_FENCE));
+    ret = _Asm_cmpxchg((_Asm_sz)_SZ_W,(_Asm_sem)_SEM_ACQ,(unsigned int *)a,
+                       (unsigned int)newval,(_Asm_ldhint)_LDHINT_NONE);
+    return ret;
+}
+/* Static initializer producing an unlocked spinlock. */
+#define GMX_SPINLOCK_INITIALIZER { 0 }
+/* Plain (non-interlocked) load of an atomic variable. */
+#define gmx_atomic_read(a) ((a)->value)
+/* Plain (non-interlocked) store to an atomic variable. */
+#define gmx_atomic_set(a,i) (((a)->value) = (i))
+/* Add i to the atomic variable a and return the resulting (new) value.
+ *
+ * The function returns a value, so it is declared int (as in every other
+ * platform section), and the locals carry the names the body actually uses.
+ */
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *       a,
+                      int                  i)
+{
+    int oldval,newval;
+    int __i = i;
+    /* On HP-UX we don't know any macro to determine whether the increment
+     * is known at compile time, but hopefully the call uses something simple
+     * like a constant, and then the optimizer should be able to do the job.
+     */
+    if (  (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||
+          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) )
+    {
+        /* fetchadd hands back the value from before the addition */
+        oldval = gmx_hpia64_fetchadd(a,__i);
+        newval = oldval + i;
+    }
+    else
+    {
+        /* Use compare-exchange addition that works with any value:
+         * retry until no other thread changed the value between our
+         * read and our exchange.
+         */
+        do
+        {
+            oldval = gmx_atomic_read(a);
+            newval = oldval + i;
+        }
+        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
+    }
+    return newval;
+}
+/* Add i to the atomic variable a and return the value it held BEFORE the
+ * addition.
+ *
+ * HP-UX offers no known way to test whether the increment is a compile-time
+ * constant, so the choice between fetchadd and the compare-exchange loop is
+ * written as ordinary code in the hope that the optimizer resolves it when
+ * the caller passes a constant.
+ */
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     int                i)
+{
+    int before;
+    switch (i)
+    {
+        /* Increments that the fetchadd instruction supports */
+        case   1: case   4: case   8: case  16:
+        case  -1: case  -4: case  -8: case -16:
+            before = gmx_hpia64_fetchadd(a, i);
+            break;
+        /* Anything else: compare-exchange addition, retried until it sticks */
+        default:
+            do
+            {
+                before = gmx_atomic_read(a);
+            }
+            while (gmx_atomic_cmpxchg(a, before, before + i) != before);
+            break;
+    }
+    return before;
+}
+/* Put a spinlock into the unlocked state by clearing its lock word. */
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *x)
+{
+    (*x).lock = 0;
+}
+/* Make a single attempt to take the lock by exchanging 1 into the lock word.
+ * Returns 0 when the previous content was not positive (lock acquired) and
+ * 1 otherwise (lock was already held).
+ *
+ * Declared int because it returns a status that gmx_spinlock_lock() consumes.
+ */
+static inline int
+gmx_spinlock_trylock(gmx_spinlock_t *x)
+{
+    int rc;
+    rc = _Asm_xchg((_Asm_sz)_SZ_W, (unsigned int *)x, 1,
+                    (_Asm_ldhint)_LDHINT_NONE);
+    return ( (rc>0) ? 1 : 0);
+}
+/* Acquire the spinlock, busy-waiting for as long as another thread holds it. */
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *x)
+{
+    int      status = 1;
+    do
+    {
+        /* Only attempt the exchange when a plain read shows the lock free.
+         * The lock word itself is inspected; it must not be cast to a
+         * pointer and dereferenced.
+         */
+        if( x->lock == 0 )
+        {
+            status = gmx_spinlock_trylock(x);
+        }
+    } while( status != 0);
+}
+/* Release the spinlock by adding -1 to the lock word with release semantics.
+ * The size argument uses the fetchadd size constant _FASZ_W, the same one
+ * the gmx_hpia64_fetchadd() macro of this section passes.
+ */
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *   x)
+{
+    _Asm_fetchadd((_Asm_fasz)_FASZ_W,(_Asm_sem)_SEM_REL,
+                  (unsigned int *)x,-1,(_Asm_ldhint)_LDHINT_NONE);
+}
+/* Report the lock state without changing it: nonzero if held, 0 if free.
+ * Declared int since the result is returned and tested by
+ * gmx_spinlock_wait().
+ */
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return ( x->lock != 0 );
+}
+/* Busy-wait until the spinlock is observed free; the lock is not acquired. */
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    for (;;)
+    {
+        gmx_atomic_memory_barrier();
+        if (!gmx_spinlock_islocked(x))
+        {
+            break;
+        }
+    }
+}
+#undef gmx_hpia64_fetchadd
+#elif (defined(_MSC_VER) && (_MSC_VER >= 1200))
+/* Microsoft Visual C on x86, define taken from FFTW who got it from Morten Nissov */
+#include <windows.h>
+/* The memory barrier expands to nothing in this section. */
+#define gmx_atomic_memory_barrier()
+/* Atomic integer, stored as a Win32 LONG so that the Interlocked* functions
+ * can operate on it directly. */
+typedef struct gmx_atomic {
+    LONG volatile value;    /*!< The stored integer */
+} gmx_atomic_t;
+/* Spinlock word: zero when free, nonzero while held. */
+typedef struct gmx_spinlock {
+    LONG volatile lock;     /*!< Lock state */
+} gmx_spinlock_t;
+/* Static initializer producing an unlocked spinlock. */
+#define GMX_SPINLOCK_INITIALIZER { 0 }
+/* Plain (non-interlocked) load of an atomic variable. */
+#define gmx_atomic_read(a) ((a)->value)
+/* Plain (non-interlocked) store to an atomic variable. */
+#define gmx_atomic_set(a,i) (((a)->value) = (i))
+/* Atomic add returning the value held before the addition. */
+#define gmx_atomic_fetch_add(a, i)  \
+    InterlockedExchangeAdd((LONG volatile *)(a), (LONG)(i))
+/* Atomic add returning the new value. Note that i is evaluated twice, so
+ * do not pass an expression with side effects. */
+#define gmx_atomic_add_return(a, i)  \
+    ( (i) + InterlockedExchangeAdd((LONG volatile *)(a), (LONG)(i)) )
+/* Compare-and-exchange; evaluates to the value held before the call. */
+#define gmx_atomic_cmpxchg(a, oldval, newval) \
+    InterlockedCompareExchange((LONG volatile *)(a), (LONG)(newval), (LONG)(oldval))
+/* Put a spinlock into the unlocked state (provided for parity with the
+ * other platform sections, which all define it). */
+#define gmx_spinlock_init(x)   ((x)->lock = 0)
+/* Spin until the lock word could be changed from 0 to 1. x is a pointer to
+ * the spinlock, so the interlocked operation must target the lock word it
+ * points to, not the address of the pointer variable itself. The trailing
+ * semicolon supplied by the caller is the (empty) loop body. */
+# define gmx_spinlock_lock(x)   \
+    while((InterlockedCompareExchange((LONG volatile *)&(x)->lock, 1, 0))!=0)
+/* Single attempt to take the lock; evaluates to 0 if it was acquired. */
+#define gmx_spinlock_trylock(x)   \
+    InterlockedCompareExchange((LONG volatile *)&(x)->lock, 1, 0)
+/* Release the spinlock by storing zero into its lock word. */
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *   x)
+{
+    (*x).lock = 0;
+}
+/* Report the lock state without changing it.
+ * Only the first byte of the lock word is inspected, through a volatile
+ * signed char pointer; returns 1 if that byte is nonzero and 0 otherwise.
+ */
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    volatile signed char *first_byte = (volatile signed char *)(&(x)->lock);
+    return (*first_byte != 0);
+}
+/* Wait until the spinlock is observed free, calling Sleep(0) between
+ * checks; the lock is not acquired. */
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    for (;;)
+    {
+        if (!gmx_spinlock_islocked(x))
+        {
+            break;
+        }
+        Sleep(0);
+    }
+}
+#elif defined(__xlC__) && defined (_AIX)
+/* IBM xlC compiler on AIX */
+#include <sys/atomic_op.h>
+/* The memory barrier expands to nothing in this section. */
+#define gmx_atomic_memory_barrier()
+/* Atomic integer. The payload is volatile so that the compiler reloads it
+ * from memory on every access instead of keeping a cached copy. */
+typedef struct gmx_atomic
+{
+    volatile int       value;      /*!< The stored integer */
+}
+gmx_atomic_t;
+/* Spinlock word: zero when free, nonzero while held. */
+typedef struct gmx_spinlock
+{
+    volatile unsigned int      lock;      /*!< Lock state */
+}
+gmx_spinlock_t;
+/* The functions of this section use gmx_atomic_read(), and the barrier code
+ * at the end of the file uses gmx_atomic_set(); define them here together
+ * with the static spinlock initializer, exactly as the other native
+ * sections do.
+ */
+#define GMX_SPINLOCK_INITIALIZER   { 0 }
+#define gmx_atomic_read(a)   ((a)->value)
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+/* Compare-and-exchange built on the AIX __check_lock() primitive.
+ *
+ * Returns oldval when the replacement succeeded. When it did not (the value
+ * in memory had changed), the value is read again and that fresh reading is
+ * returned instead.
+ */
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *    a,
+                   int               oldval,
+                   int               newval)
+{
+    /* A zero result from __check_lock() means the replacement succeeded */
+    if (!__check_lock((atomic_p)&a->value, oldval, newval))
+    {
+        return oldval;
+    }
+    /* Not successful - hand back what is in memory now */
+    return a->value;
+}
+/* Add i to the atomic variable a and return the resulting (new) value.
+ * Declared int because the new value is returned to the caller.
+ */
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *       a,
+                      int                  i)
+{
+    int oldval,newval;
+    /* Retry until __check_lock() reports that the swap went through */
+    do
+    {
+        oldval = gmx_atomic_read(a);
+        newval = oldval + i;
+    }
+    while(__check_lock((atomic_p)&a->value, oldval, newval));
+    return newval;
+}
+/* Add i to the atomic variable a and return the value it held BEFORE the
+ * addition. Declared int because that value is returned to the caller.
+ */
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *       a,
+                     int                  i)
+{
+    int oldval,newval;
+    /* Retry until __check_lock() reports that the swap went through */
+    do
+    {
+        oldval = gmx_atomic_read(a);
+        newval = oldval + i;
+    }
+    while(__check_lock((atomic_p)&a->value, oldval, newval));
+    return oldval;
+}
+/* Put a spinlock into the unlocked state via __clear_lock(). */
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *   x)
+{
+    atomic_p word = (atomic_p)x;
+    __clear_lock(word, 0);
+}
+/* Acquire the spinlock: keep trying to change the lock word from 0 to 1
+ * until __check_lock() reports success (a zero result). */
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *   x)
+{
+    while (__check_lock((atomic_p)x, 0, 1))
+    {
+        /* busy-wait */
+    }
+}
+/* Make a single attempt to take the lock.
+ * Returns 0 if we got the lock and 1 if it was already held; the function
+ * is therefore declared int.
+ */
+static inline int
+gmx_spinlock_trylock(gmx_spinlock_t *   x)
+{
+    /* Return 0 if we got the lock */
+    return (__check_lock((atomic_p)x, 0, 1) != 0);
+}
+/* Release the spinlock via __clear_lock(), storing zero in the lock word. */
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *   x)
+{
+    atomic_p word = (atomic_p)x;
+    __clear_lock(word, 0);
+}
+/* Report the lock state without changing it: nonzero if held, 0 if free.
+ * Declared int since the result is returned and tested by
+ * gmx_spinlock_wait().
+ */
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return (*((atomic_p)x) != 0);
+}
+/* Busy-wait until the spinlock is observed free; the lock is not acquired. */
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *    x)
+{
+    for (;;)
+    {
+        if (!gmx_spinlock_islocked(x))
+        {
+            break;
+        }
+    }
+}
+#else
+/* No atomic operations, use mutex fallback. Documentation is in x86 section */
+/* The memory barrier expands to nothing in the mutex fallback. */
+#define gmx_atomic_memory_barrier()
+/* One system mutex shared by all atomic variables; the functions below hold
+ * it while they modify a value, which is what makes the updates atomic. */
+static pthread_mutex_t gmx_atomic_mutex = PTHREAD_MUTEX_INITIALIZER;
+/* Atomic integer; protected by gmx_atomic_mutex rather than by volatile. */
+typedef struct gmx_atomic {
+    int value;    /*!< The stored integer */
+} gmx_atomic_t;
+/* In the fallback a spinlock simply is a pthread mutex. */
+#define gmx_spinlock_t pthread_mutex_t
+#define GMX_SPINLOCK_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+/* Reading is a plain load that does not take the mutex. */
+#define gmx_atomic_read(a) ((a)->value)
+/* Store i in the atomic variable a while holding the global atomic mutex. */
+static inline void
+gmx_atomic_set(gmx_atomic_t *   a,
+               int              i)
+{
+    pthread_mutex_t *guard = &gmx_atomic_mutex;
+    /* The lock/unlock pair is what makes the store visible to other threads */
+    pthread_mutex_lock(guard);
+    a->value = i;
+    pthread_mutex_unlock(guard);
+}
+/* Add i to a under the global atomic mutex and return the new value. */
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *   a,
+                      int              i)
+{
+    pthread_mutex_t *guard = &gmx_atomic_mutex;
+    int              sum;
+    pthread_mutex_lock(guard);
+    sum      = a->value + i;
+    a->value = sum;
+    pthread_mutex_unlock(guard);
+    return sum;
+}
+/* Add i to a under the global atomic mutex and return the value that was
+ * stored before the addition. */
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *   a,
+                     int              i)
+{
+    pthread_mutex_t *guard = &gmx_atomic_mutex;
+    int              before;
+    pthread_mutex_lock(guard);
+    before   = a->value;
+    a->value = before + i;
+    pthread_mutex_unlock(guard);
+    return before;
+}
+/* Compare-and-exchange under the global atomic mutex.
+ * If a holds oldv it is replaced by newv; in every case the value that was
+ * found in a is returned.
+ */
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *           a,
+                   int                      oldv,
+                   int                      newv)
+{
+    pthread_mutex_t *guard = &gmx_atomic_mutex;
+    int              seen;
+    pthread_mutex_lock(guard);
+    seen = a->value;
+    if (seen == oldv)
+    {
+        a->value = newv;
+    }
+    pthread_mutex_unlock(guard);
+    return seen;
+}
+/* Spinlock operations map straight onto the pthread mutex calls.
+ * pthread_mutex_init() takes an attribute pointer as second argument;
+ * NULL requests the default mutex attributes.
+ */
+#define gmx_spinlock_init(lock)       pthread_mutex_init(lock, NULL)
+#define gmx_spinlock_lock(lock)       pthread_mutex_lock(lock)
+#define gmx_spinlock_trylock(lock)    pthread_mutex_trylock(lock)
+#define gmx_spinlock_unlock(lock)     pthread_mutex_unlock(lock)
+/* Report whether the lock is currently held: 1 if so, 0 otherwise.
+ * A mutex cannot be inspected directly, so the state is probed with a
+ * trylock; if that succeeds the lock is released again right away.
+ */
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    if(gmx_spinlock_trylock(x) != 0)
+    {
+        /* It was locked */
+        return 1;
+    }
+    else
+    {
+        /* We just locked it */
+        gmx_spinlock_unlock(x);
+        return 0;
+    }
+}
+/* Block until the lock is free by acquiring it and releasing it again. */
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    gmx_spinlock_lock(x);
+    /* Got the lock now, so the waiting is over */
+    gmx_spinlock_unlock(x);
+}
+#endif
+/*! \brief Barrier built on atomics and busy-waiting
+ *
+ *  Provides the same functionality as the standard gmx_thread_barrier_t,
+ *  but waiting threads spin instead of sleeping: synchronization is
+ *  faster, at the price of busy-waiting.
+ *
+ *  Initialize variables of this type with gmx_spinlock_barrier_init(),
+ *  which sets the number of threads that are to be synchronized.
+ */
+typedef struct gmx_spinlock_barrier
+{
+    gmx_atomic_t   count;      /*!< Number of threads still to arrive       */
+    int            threshold;  /*!< Total number of threads synchronized    */
+    volatile int   cycle;      /*!< Current cycle, alternating between 0/1  */
+}
+gmx_spinlock_barrier_t;
+/*! \brief Set up a spinlock-based barrier
+ *
+ *  \param barrier  Pointer to a _spinlock_ barrier. This is a different
+ *                  datatype from the full, thread based, barrier.
+ *  \param count    Number of threads to synchronize. All of them are
+ *                  released once gmx_spinlock_barrier_wait() has been
+ *                  called \a count times.
+ */
+static inline void
+gmx_spinlock_barrier_init(gmx_spinlock_barrier_t *         barrier,
+                          int                              count)
+{
+    /* Remember the group size, start in cycle 0 ... */
+    (*barrier).threshold = count;
+    (*barrier).cycle     = 0;
+    /* ... and arm the countdown with the full number of threads */
+    gmx_atomic_set(&((*barrier).count), count);
+}
+/*! \brief Perform busy-waiting barrier synchronization
+*
+*  This routine blocks until it has been called N times,
+*  where N is the count value the barrier was initialized with.
+*  After N total calls all threads return. The barrier automatically
+*  cycles, and thus requires another N calls to unblock another time.
+*
+*  Note that spinlock-based barriers are completely different from
+*  standard ones (using mutexes and condition variables), only the 
+*  functionality and names are similar.
+*
+*  \param barrier  Pointer to a previously initialized barrier.
+*
+*  \return The last thread to arrive returns -1, all the others 0.
+*/
+static inline int
+gmx_spinlock_barrier_wait(gmx_spinlock_barrier_t *   barrier)
+{
+  int    cycle;
+  int    status;
+  /* We don't need to lock or use atomic ops here, since the cycle index
+   * cannot change until after the last thread has performed the check
+   * further down. Further, they cannot reach this point in the next
+   * barrier iteration until all of them have been released, and that
+   * happens after the cycle value has been updated.
+   *
+   * No synchronization == fast synchronization.
+   */
+  cycle = barrier->cycle;
+  /* Decrement the count atomically and check if it is zero.
+   * This will only be true for the last thread calling us.
+   */
+  if( gmx_atomic_add_return( &(barrier->count), -1 ) == 0)
+  { 
+	/* Last arrival: re-arm the counter for the next round first, and only
+	 * then flip the cycle, which is what releases the spinning threads.
+	 */
+	gmx_atomic_set(&(barrier->count), barrier->threshold);
+	barrier->cycle = !barrier->cycle;
+	status = -1;
+  }
+  else
+  {
+	/* Wait until the last thread changes the cycle index.
+	 * We are both using a memory barrier, and explicit
+	 * volatile pointer cast to make sure the compiler
+	 * doesn't try to be smart and cache the contents.
+	 */
+	do
+	{ 
+	  gmx_atomic_memory_barrier();
+	} 
+	while( *(volatile int *)(&(barrier->cycle)) == cycle);
+	status = 0;
+  }
+  return status;
+}
+#ifdef __cplusplus
+}
+#endif
+#endif /* _GMX_ATOMIC_H_ */
--- a/platforms/cpu/tests/CMakeLists.txt
+++ b/platforms/cpu/tests/CMakeLists.txt
@@ -11,7 +11,6 @@ IF( INCLUDE_SERIALIZATION )
    INCLUDE_DIRECTORIES(${OPENMM_DIR}/serialization/include)
    SET( SHARED_OPENMM_SERIALIZATION "OpenMMSerialization" )
 ENDIF( INCLUDE_SERIALIZATION )
-GET_PROPERTY(COMPILE_FLAGS GLOBAL PROPERTY COMPILE_FLAGS)
 # Automatically create tests using files named "Test*.cpp"
 FILE(GLOB TEST_PROGS "*Test*.cpp")
@@ -21,7 +20,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    # Link with shared library
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET})
-    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS}" LINK_FLAGS "${COMPILE_FLAGS}")
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
 ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -35,6 +35,7 @@
 #include "openmm/internal/AssertionUtilities.h"
 #include "openmm/internal/ThreadPool.h"
+#include "AlignedArray.h"
 #include "CpuNeighborList.h"
 #include "CpuPlatform.h"
 #include "sfmt/SFMT.h"
@@ -52,7 +53,7 @@ void testNeighborList(bool periodic) {
    const float boxSize[3] = {20.0f, 15.0f, 22.0f};
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
-    vector<float> positions(4*numParticles);
+    AlignedArray<float> positions(4*numParticles);
    for (int i = 0; i < 4*numParticles; i++)
        if (i%4 < 3)
            positions[i] = boxSize[i%4]*genrand_real2(sfmt);