Merge github.com:SimTk/openmm

19d2885a · Lee-Ping · 99ef4344 · 57a6768e · 19d2885a · 19d2885a
Commit 19d2885a authored Jan 23, 2014 by Lee-Ping
20 changed files
--- a/openmmapi/include/openmm/internal/ThreadPool.h
+++ b/openmmapi/include/openmm/internal/ThreadPool.h
+#ifndef OPENMM_THREAD_POOL_H_
+#define OPENMM_THREAD_POOL_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#define NOMINMAX
+#include "windowsExport.h"
+#include <pthread.h>
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * A ThreadPool creates a set of worker threads that can be used to execute tasks in parallel.
+ * After creating a ThreadPool, call execute() to start a task running then waitForThreads()
+ * to block until all threads have finished.  You also can synchronize the threads in the middle
+ * of the task by having them call syncThreads().  In this case, the parent thread should call
+ * waitForThreads() an additional time; each call waits until all worker threads have reached the
+ * next syncThreads(), and the final call waits until they exit from the Task's execute() method.
+ * After calling waitForThreads() to block at a synchronization point, the parent thread should
+ * call resumeThreads() to instruct the worker threads to resume.
+ */
+class OPENMM_EXPORT ThreadPool {
+public:
+    class Task;
+    class ThreadData;
+    /**
+     * Create a ThreadPool.  The implementation in ThreadPool.cpp starts getNumProcessors() worker
+     * threads and does not return until every one of them is blocked waiting for its first task.
+     */
+    ThreadPool();
+    /**
+     * Tell the worker threads to exit, wait for them to finish, and release the mutex and
+     * condition variables.
+     */
+    ~ThreadPool();
+    /**
+     * Get the number of worker threads in the pool.
+     */
+    int getNumThreads() const;
+    /**
+     * Execute a Task in parallel on the worker threads.
+     */
+    void execute(Task& task);
+    /**
+     * This is called by the worker threads to block until all threads have reached the same point
+     * and the master thread instructs them to continue by calling resumeThreads().
+     */
+    void syncThreads();
+    /**
+     * This is called by the master thread to wait until all threads have completed the Task.  Alternatively,
+     * if the threads call syncThreads(), this blocks until all threads have reached the synchronization point.
+     */
+    void waitForThreads();
+    /**
+     * Instruct the threads to resume running after blocking at a synchronization point.
+     */
+    void resumeThreads();
+private:
+    // NOTE(review): ThreadPool.cpp as added in this change never reads or writes this member; the
+    // workers check the per-thread ThreadData::isDeleted flag instead.
+    bool isDeleted;
+    // numThreads: number of worker threads.  waitCount: number of workers that have entered
+    // syncThreads() since the last resumeThreads(); it is only modified while holding lock.
+    int numThreads, waitCount;
+    // Handles of the worker threads, joined by the destructor.
+    std::vector<pthread_t> thread;
+    // Per-worker state.  Each entry is deleted by its own worker thread when that thread exits.
+    std::vector<ThreadData*> threadData;
+    // startCondition: workers block on this until resumeThreads() or the destructor broadcasts.
+    // endCondition: signaled by workers in syncThreads(); waited on by the constructor and waitForThreads().
+    pthread_cond_t startCondition, endCondition;
+    // Mutex used together with both condition variables.
+    pthread_mutex_t lock;
+};
+
+/**
+ * This defines a task that can be executed in parallel by the worker threads.
+ * Subclasses implement execute(); after ThreadPool::execute() is called, every worker
+ * thread invokes it once with its own thread index.
+ */
+class OPENMM_EXPORT ThreadPool::Task {
+public:
+    /**
+     * Execute the task on each thread.
+     * 
+     * @param pool         the ThreadPool being used to execute the task
+     * @param threadIndex  the index of the thread invoking this method
+     */
+    virtual void execute(ThreadPool& pool, int threadIndex) = 0;
+    /**
+     * Virtual destructor, so that a Task subclass is destroyed correctly even when it is
+     * deleted through a pointer or reference to Task.
+     */
+    virtual ~Task() {
+    }
+};
+
+} // namespace OpenMM
+
+#endif // OPENMM_THREAD_POOL_H_
--- a/openmmapi/include/openmm/internal/hardware.h
+++ b/openmmapi/include/openmm/internal/hardware.h
@@ -44,6 +44,7 @@
   #include <dlfcn.h>
 #else
   #ifdef WIN32
+      #define NOMINMAX
      #include <windows.h>
   #else
      #include <dlfcn.h>

--- a/openmmapi/include/openmm/internal/vectorize.h
+++ b/openmmapi/include/openmm/internal/vectorize.h
@@ -55,58 +55,62 @@ public:
        return val;
    }
    float operator[](int i) const {
-        int resultBits = _mm_extract_ps(val, i);
-        return *((float*) &resultBits);
+        float result[4];
+        store(result);
+        return result[i];
    }
    void store(float* v) const {
        _mm_storeu_ps(v, val);
    }
-    fvec4 operator+(fvec4 other) const {
+    fvec4 operator+(const fvec4& other) const {
        return _mm_add_ps(val, other);
    }
-    fvec4 operator-(fvec4 other) const {
+    fvec4 operator-(const fvec4& other) const {
        return _mm_sub_ps(val, other);
    }
-    fvec4 operator*(fvec4 other) const {
+    fvec4 operator*(const fvec4& other) const {
        return _mm_mul_ps(val, other);
    }
-    fvec4 operator/(fvec4 other) const {
+    fvec4 operator/(const fvec4& other) const {
        return _mm_div_ps(val, other);
    }
-    void operator+=(fvec4 other) {
+    void operator+=(const fvec4& other) {
        val = _mm_add_ps(val, other);
    }
-    void operator-=(fvec4 other) {
+    void operator-=(const fvec4& other) {
        val = _mm_sub_ps(val, other);
    }
-    void operator*=(fvec4 other) {
+    void operator*=(const fvec4& other) {
        val = _mm_mul_ps(val, other);
    }
-    void operator/=(fvec4 other) {
+    void operator/=(const fvec4& other) {
        val = _mm_div_ps(val, other);
    }
    fvec4 operator-() const {
        return _mm_sub_ps(_mm_set1_ps(0.0f), val);
    }
-    fvec4 operator&(fvec4 other) const {
+    fvec4 operator&(const fvec4& other) const {
        return _mm_and_ps(val, other);
    }
-    fvec4 operator==(fvec4 other) const {
+    fvec4 operator|(const fvec4& other) const {
+        return _mm_or_ps(val, other);
+    }
+    fvec4 operator==(const fvec4& other) const {
        return _mm_cmpeq_ps(val, other);
    }
-    fvec4 operator!=(fvec4 other) const {
+    fvec4 operator!=(const fvec4& other) const {
        return _mm_cmpneq_ps(val, other);
    }
-    fvec4 operator>(fvec4 other) const {
+    fvec4 operator>(const fvec4& other) const {
        return _mm_cmpgt_ps(val, other);
    }
-    fvec4 operator<(fvec4 other) const {
+    fvec4 operator<(const fvec4& other) const {
        return _mm_cmplt_ps(val, other);
    }
-    fvec4 operator>=(fvec4 other) const {
+    fvec4 operator>=(const fvec4& other) const {
        return _mm_cmpge_ps(val, other);
    }
-    fvec4 operator<=(fvec4 other) const {
+    fvec4 operator<=(const fvec4& other) const {
        return _mm_cmple_ps(val, other);
    }
    operator ivec4() const;
@@ -128,38 +132,58 @@ public:
        return val;
    }
    int operator[](int i) const {
-        return _mm_extract_epi32(val, i);
+        int result[4];
+        store(result);
+        return result[i];
    }
    void store(int* v) const {
        _mm_storeu_si128((__m128i*) v, val);
    }
-    ivec4 operator+(ivec4 other) const {
+    ivec4 operator+(const ivec4& other) const {
        return _mm_add_epi32(val, other);
    }
-    ivec4 operator-(ivec4 other) const {
+    ivec4 operator-(const ivec4& other) const {
        return _mm_sub_epi32(val, other);
    }
-    ivec4 operator*(ivec4 other) const {
+    ivec4 operator*(const ivec4& other) const {
        return _mm_mul_epi32(val, other);
    }
-    void operator+=(ivec4 other) {
+    void operator+=(const ivec4& other) {
        val = _mm_add_epi32(val, other);
    }
-    void operator-=(ivec4 other) {
+    void operator-=(const ivec4& other) {
        val = _mm_sub_epi32(val, other);
    }
-    void operator*=(ivec4 other) {
+    void operator*=(const ivec4& other) {
        val = _mm_mul_epi32(val, other);
    }
    ivec4 operator-() const {
        return _mm_sub_epi32(_mm_set1_epi32(0), val);
    }
-    ivec4 operator&(ivec4 other) const {
+    ivec4 operator&(const ivec4& other) const {
        return _mm_and_si128(val, other);
    }
-    ivec4 operator==(ivec4 other) const {
+    ivec4 operator|(const ivec4& other) const {
+        return _mm_or_si128(val, other);
+    }
+    ivec4 operator==(const ivec4& other) const {
        return _mm_cmpeq_epi32(val, other);
    }
+    ivec4 operator!=(const ivec4& other) const {
+        return _mm_xor_si128(*this==other, _mm_set1_epi32(0xFFFFFFFF));
+    }
+    ivec4 operator>(const ivec4& other) const {
+        return _mm_cmpgt_epi32(val, other);
+    }
+    ivec4 operator<(const ivec4& other) const {
+        return _mm_cmplt_epi32(val, other);
+    }
+    ivec4 operator>=(const ivec4& other) const {
+        return _mm_xor_si128(_mm_cmplt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
+    }
+    ivec4 operator<=(const ivec4& other) const {
+        return _mm_xor_si128(_mm_cmpgt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
+    }
    operator fvec4() const;
 };

@@ -175,74 +199,88 @@ inline ivec4::operator fvec4() const {

 // Functions that operate on fvec4s.

-static inline fvec4 floor(fvec4 v) {
+static inline fvec4 floor(const fvec4& v) {
    return fvec4(_mm_floor_ps(v.val));
 }

-static inline fvec4 ceil(fvec4 v) {
+static inline fvec4 ceil(const fvec4& v) {
    return fvec4(_mm_ceil_ps(v.val));
 }

-static inline fvec4 round(fvec4 v) {
+static inline fvec4 round(const fvec4& v) {
    return fvec4(_mm_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
 }

-static inline fvec4 min(fvec4 v1, fvec4 v2) {
+static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
    return fvec4(_mm_min_ps(v1.val, v2.val));
 }

-static inline fvec4 max(fvec4 v1, fvec4 v2) {
+static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
    return fvec4(_mm_max_ps(v1.val, v2.val));
 }

-static inline fvec4 abs(fvec4 v) {
+static inline fvec4 abs(const fvec4& v) {
    static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    return fvec4(_mm_and_ps(v.val, mask));
 }

-static inline fvec4 sqrt(fvec4 v) {
+static inline fvec4 sqrt(const fvec4& v) {
    return fvec4(_mm_sqrt_ps(v.val));
 }

-static inline float dot3(fvec4 v1, fvec4 v2) {
+static inline float dot3(const fvec4& v1, const fvec4& v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
 }

-static inline float dot4(fvec4 v1, fvec4 v2) {
+static inline float dot4(const fvec4& v1, const fvec4& v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
 }

+static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
+    _MM_TRANSPOSE4_PS(v1, v2, v3, v4);
+}
+
 // Functions that operate on ivec4s.

-static inline ivec4 min(ivec4 v1, ivec4 v2) {
+static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
    return ivec4(_mm_min_epi32(v1.val, v2.val));
 }

-static inline ivec4 max(ivec4 v1, ivec4 v2) {
+static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
    return ivec4(_mm_max_epi32(v1.val, v2.val));
 }

-static inline ivec4 abs(ivec4 v) {
+static inline ivec4 abs(const ivec4& v) {
    return ivec4(_mm_abs_epi32(v.val));
 }

+static inline bool any(const ivec4& v) {
+    return !_mm_test_all_zeros(v, _mm_set1_epi32(0xFFFFFFFF));
+}
+
 // Mathematical operators involving a scalar and a vector.

-static inline fvec4 operator+(float v1, fvec4 v2) {
+static inline fvec4 operator+(float v1, const fvec4& v2) {
    return fvec4(v1)+v2;
 }

-static inline fvec4 operator-(float v1, fvec4 v2) {
+static inline fvec4 operator-(float v1, const fvec4& v2) {
    return fvec4(v1)-v2;
 }

-static inline fvec4 operator*(float v1, fvec4 v2) {
+static inline fvec4 operator*(float v1, const fvec4& v2) {
    return fvec4(v1)*v2;
 }

-static inline fvec4 operator/(float v1, fvec4 v2) {
+static inline fvec4 operator/(float v1, const fvec4& v2) {
    return fvec4(v1)/v2;
 }

+// Operations for blending fvec4s based on an ivec4.
+
+static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
+    return fvec4(_mm_blendv_ps(v1.val, v2.val, _mm_castsi128_ps(mask.val)));
+}
+
 #endif /*OPENMM_VECTORIZE_H_*/

--- a/openmmapi/include/openmm/internal/vectorize8.h
+++ b/openmmapi/include/openmm/internal/vectorize8.h
+#ifndef OPENMM_VECTORIZE8_H_
+#define OPENMM_VECTORIZE8_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013-2014 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "vectorize.h"
+#include <immintrin.h>
+
+// This file defines classes and functions to simplify vectorizing code with AVX.
+
+class ivec8;
+
+/**
+ * An eight element vector of floats.
+ *
+ * This is a thin wrapper around an AVX __m256 value.  Arithmetic operators act element-wise.
+ * Comparison operators return a mask vector: an element has all bits set where the comparison
+ * is true and all bits clear where it is false.
+ */
+class fvec8 {
+public:
+    __m256 val;
+    
+    /** Create a vector whose contents are not initialized. */
+    fvec8() {}
+    /** Create a vector with every element set to v. */
+    fvec8(float v) : val(_mm256_set1_ps(v)) {}
+    /** Create a vector from eight values; v1 becomes element 0 and v8 becomes element 7. */
+    fvec8(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) : val(_mm256_set_ps(v8, v7, v6, v5, v4, v3, v2, v1)) {}
+    /** Wrap an existing __m256 value. */
+    fvec8(__m256 v) : val(v) {}
+    /** Load eight consecutive floats from memory.  The pointer does not need to be aligned. */
+    fvec8(const float* v) : val(_mm256_loadu_ps(v)) {}
+    operator __m256() const {
+        return val;
+    }
+    /** Get elements 0-3 as an fvec4. */
+    fvec4 lowerVec() const {
+        return _mm256_castps256_ps128(val);
+    }
+    /** Get elements 4-7 as an fvec4. */
+    fvec4 upperVec() const {
+        return _mm256_extractf128_ps(val, 1);
+    }
+    /** Write the eight elements to memory.  The pointer does not need to be aligned. */
+    void store(float* v) const {
+        _mm256_storeu_ps(v, val);
+    }
+    fvec8 operator+(const fvec8& other) const {
+        return _mm256_add_ps(val, other);
+    }
+    fvec8 operator-(const fvec8& other) const {
+        return _mm256_sub_ps(val, other);
+    }
+    fvec8 operator*(const fvec8& other) const {
+        return _mm256_mul_ps(val, other);
+    }
+    fvec8 operator/(const fvec8& other) const {
+        return _mm256_div_ps(val, other);
+    }
+    void operator+=(const fvec8& other) {
+        val = _mm256_add_ps(val, other);
+    }
+    void operator-=(const fvec8& other) {
+        val = _mm256_sub_ps(val, other);
+    }
+    void operator*=(const fvec8& other) {
+        val = _mm256_mul_ps(val, other);
+    }
+    void operator/=(const fvec8& other) {
+        val = _mm256_div_ps(val, other);
+    }
+    /** Negate every element, computed as 0-val. */
+    fvec8 operator-() const {
+        return _mm256_sub_ps(_mm256_set1_ps(0.0f), val);
+    }
+    /** Bitwise AND of the raw bits, typically used to apply a comparison mask. */
+    fvec8 operator&(const fvec8& other) const {
+        return _mm256_and_ps(val, other);
+    }
+    /** Bitwise OR of the raw bits, typically used to combine comparison masks. */
+    fvec8 operator|(const fvec8& other) const {
+        return _mm256_or_ps(val, other);
+    }
+    fvec8 operator==(const fvec8& other) const {
+        return _mm256_cmp_ps(val, other, _CMP_EQ_OQ);
+    }
+    fvec8 operator!=(const fvec8& other) const {
+        // Use the unordered predicate so that an element holding NaN compares as "not equal".
+        // This matches fvec4::operator!= (_mm_cmpneq_ps) and the behavior of != on scalar floats;
+        // the ordered form would report NaN != x as false.
+        return _mm256_cmp_ps(val, other, _CMP_NEQ_UQ);
+    }
+    fvec8 operator>(const fvec8& other) const {
+        return _mm256_cmp_ps(val, other, _CMP_GT_OQ);
+    }
+    fvec8 operator<(const fvec8& other) const {
+        return _mm256_cmp_ps(val, other, _CMP_LT_OQ);
+    }
+    fvec8 operator>=(const fvec8& other) const {
+        return _mm256_cmp_ps(val, other, _CMP_GE_OQ);
+    }
+    fvec8 operator<=(const fvec8& other) const {
+        return _mm256_cmp_ps(val, other, _CMP_LE_OQ);
+    }
+    /** Convert each element to an int; defined after ivec8 is complete. */
+    operator ivec8() const;
+};
+
+/**
+ * An eight element vector of ints.
+ *
+ * This wraps an AVX __m256i value.  Only loading, storing, splitting into halves, bitwise
+ * AND/OR, and conversion to fvec8 are provided.
+ */
+class ivec8 {
+public:
+    __m256i val;
+    
+    // Leaves val uninitialized.
+    ivec8() {}
+    // Sets every element to v.
+    ivec8(int v) : val(_mm256_set1_epi32(v)) {}
+    // v1 becomes element 0 and v8 becomes element 7 (the intrinsic takes the highest element first).
+    ivec8(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) : val(_mm256_set_epi32(v8, v7, v6, v5, v4, v3, v2, v1)) {}
+    ivec8(__m256i v) : val(v) {}
+    // Loads eight consecutive ints using an unaligned load.
+    ivec8(const int* v) : val(_mm256_loadu_si256((const __m256i*) v)) {}
+    operator __m256i() const {
+        return val;
+    }
+    // Elements 0-3.
+    ivec4 lowerVec() const {
+        return _mm256_castsi256_si128(val);
+    }
+    // Elements 4-7.
+    ivec4 upperVec() const {
+        return _mm256_extractf128_si256(val, 1);
+    }
+    // Writes the eight elements using an unaligned store.
+    void store(int* v) const {
+        _mm256_storeu_si256((__m256i*) v, val);
+    }
+    // The bitwise operators reinterpret the bits as floats and use the float AND/OR instructions;
+    // presumably this is because 256 bit integer logic instructions require AVX2 -- confirm.
+    ivec8 operator&(const ivec8& other) const {
+        return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val)));
+    }
+    ivec8 operator|(const ivec8& other) const {
+        return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val)));
+    }
+    // Converts each element to a float; defined below with the other conversion operator.
+    operator fvec8() const;
+};
+
+// Conversion operators.
+
+inline fvec8::operator ivec8() const {
+    // Convert each float to an int, truncating toward zero.
+    const __m256i truncated = _mm256_cvttps_epi32(val);
+    return ivec8(truncated);
+}
+
+inline ivec8::operator fvec8() const {
+    // Convert each int to the corresponding float.
+    const __m256 converted = _mm256_cvtepi32_ps(val);
+    return fvec8(converted);
+}
+
+// Functions that operate on fvec8s.
+
+static inline fvec8 floor(const fvec8& v) {
+    // Round each element toward negative infinity with exceptions suppressed (mode value 0x09).
+    return _mm256_round_ps(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+
+static inline fvec8 ceil(const fvec8& v) {
+    // Round each element toward positive infinity with exceptions suppressed (mode value 0x0A).
+    return _mm256_round_ps(v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+static inline fvec8 round(const fvec8& v) {
+    // Round each element to the nearest integer value.
+    const __m256 nearest = _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
+    return fvec8(nearest);
+}
+
+static inline fvec8 min(const fvec8& v1, const fvec8& v2) {
+    // Element-wise minimum of the two vectors.
+    const __m256 smaller = _mm256_min_ps(v1, v2);
+    return fvec8(smaller);
+}
+
+static inline fvec8 max(const fvec8& v1, const fvec8& v2) {
+    // Element-wise maximum of the two vectors.
+    const __m256 larger = _mm256_max_ps(v1, v2);
+    return fvec8(larger);
+}
+
+static inline fvec8 abs(const fvec8& v) {
+    // Clear the sign bit of every element; all other bits pass through unchanged.
+    const __m256 clearSign = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+    return _mm256_and_ps(v, clearSign);
+}
+
+static inline fvec8 sqrt(const fvec8& v) {
+    // Element-wise square root.
+    const __m256 roots = _mm256_sqrt_ps(v);
+    return fvec8(roots);
+}
+
+static inline float dot8(const fvec8& v1, const fvec8& v2) {
+    // _mm256_dp_ps works on each 128 bit half independently, leaving the dot product of that
+    // half in its first element.  Add the two partial sums to get the eight element result.
+    const __m256 partial = _mm256_dp_ps(v1, v2, 0xF1);
+    const float lowerSum = _mm_cvtss_f32(_mm256_castps256_ps128(partial));
+    const float upperSum = _mm_cvtss_f32(_mm256_extractf128_ps(partial, 1));
+    return lowerSum+upperSum;
+}
+
+/**
+ * Transpose eight fvec4s into four fvec8s.  On return, element j of output k holds element k
+ * of input j: out1 = (in1[0], in2[0], ..., in8[0]), out2 = (in1[1], ..., in8[1]), and so on.
+ */
+static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in3, const fvec4& in4, const fvec4& in5, const fvec4& in6, const fvec4& in7, const fvec4& in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
+    // _MM_TRANSPOSE4_PS modifies its arguments in place, so work on copies of the const inputs.
+    fvec4 i1 = in1, i2 = in2, i3 = in3, i4 = in4;
+    fvec4 i5 = in5, i6 = in6, i7 = in7, i8 = in8;
+    _MM_TRANSPOSE4_PS(i1, i2, i3, i4);
+    _MM_TRANSPOSE4_PS(i5, i6, i7, i8);
+    // The transposed first four inputs become the lower halves of the outputs.
+#ifdef _MSC_VER
+    // Visual Studio has a bug in _mm256_castps128_ps256, so we have to use the more expensive _mm256_insertf128_ps.
+    out1 = _mm256_insertf128_ps(out1, i1, 0);
+    out2 = _mm256_insertf128_ps(out2, i2, 0);
+    out3 = _mm256_insertf128_ps(out3, i3, 0);
+    out4 = _mm256_insertf128_ps(out4, i4, 0);
+#else
+    out1 = _mm256_castps128_ps256(i1);
+    out2 = _mm256_castps128_ps256(i2);
+    out3 = _mm256_castps128_ps256(i3);
+    out4 = _mm256_castps128_ps256(i4);
+#endif
+    // The transposed last four inputs become the upper halves of the outputs.
+    out1 = _mm256_insertf128_ps(out1, i5, 1);
+    out2 = _mm256_insertf128_ps(out2, i6, 1);
+    out3 = _mm256_insertf128_ps(out3, i7, 1);
+    out4 = _mm256_insertf128_ps(out4, i8, 1);
+}
+
+/**
+ * Transpose four fvec8s into eight fvec4s.  On return, output k holds element k-1 of each
+ * input: out1 = (in1[0], in2[0], in3[0], in4[0]), ..., out8 = (in1[7], in2[7], in3[7], in4[7]).
+ */
+static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) {
+    // Elements 0-3 of the inputs: copy them into the first four outputs and transpose in place.
+    out1 = in1.lowerVec();
+    out2 = in2.lowerVec();
+    out3 = in3.lowerVec();
+    out4 = in4.lowerVec();
+    _MM_TRANSPOSE4_PS(out1, out2, out3, out4);
+    // Elements 4-7 of the inputs: same treatment for the last four outputs.
+    out5 = in1.upperVec();
+    out6 = in2.upperVec();
+    out7 = in3.upperVec();
+    out8 = in4.upperVec();
+    _MM_TRANSPOSE4_PS(out5, out6, out7, out8);
+}
+
+// Functions that operate on ivec8s.
+
+static inline bool any(const ivec8& v) {
+    // _mm256_testz_si256 returns 1 only when (v AND mask) is all zeros, so with an all-ones
+    // mask a result of 0 means at least one bit of v is set.
+    const __m256i allBits = _mm256_set1_epi32(0xFFFFFFFF);
+    return _mm256_testz_si256(v, allBits) == 0;
+}
+
+// Mathematical operators involving a scalar and a vector.
+
+static inline fvec8 operator+(float v1, const fvec8& v2) {
+    // Broadcast the scalar to all eight elements, then add element-wise.
+    const fvec8 broadcast(v1);
+    return broadcast+v2;
+}
+
+static inline fvec8 operator-(float v1, const fvec8& v2) {
+    // Broadcast the scalar to all eight elements, then subtract v2 element-wise.
+    const fvec8 broadcast(v1);
+    return broadcast-v2;
+}
+
+static inline fvec8 operator*(float v1, const fvec8& v2) {
+    // Broadcast the scalar to all eight elements, then multiply element-wise.
+    const fvec8 broadcast(v1);
+    return broadcast*v2;
+}
+
+static inline fvec8 operator/(float v1, const fvec8& v2) {
+    // Broadcast the scalar to all eight elements, then divide by v2 element-wise.
+    const fvec8 broadcast(v1);
+    return broadcast/v2;
+}
+
+// Operations for blending fvec8s based on an ivec8.
+
+static inline fvec8 blend(const fvec8& v1, const fvec8& v2, const ivec8& mask) {
+    // For each element, take the value from v2 when the top bit of the corresponding mask
+    // element is set, and from v1 otherwise.
+    const __m256 selector = _mm256_castsi256_ps(mask);
+    return _mm256_blendv_ps(v1, v2, selector);
+}
+
+#endif /*OPENMM_VECTORIZE8_H_*/
--- a/openmmapi/src/ContextImpl.cpp
+++ b/openmmapi/src/ContextImpl.cpp
@@ -39,6 +39,7 @@
 #include "openmm/State.h"
 #include "openmm/VirtualSite.h"
 #include "openmm/Context.h"
+#include <algorithm>
 #include <iostream>
 #include <map>
 #include <utility>
@@ -75,11 +76,27 @@ ContextImpl::ContextImpl(Context& owner, const System& system, Integrator& integ
            throw OpenMMException("A constraint cannot involve a massless particle");
    }
    
+    // Validate the list of properties.
+
+    const vector<string>& platformProperties = platform->getPropertyNames();
+    for (map<string, string>::const_iterator iter = properties.begin(); iter != properties.end(); ++iter) {
+        bool valid = false;
+        for (int i = 0; i < (int) platformProperties.size(); i++)
+            if (platformProperties[i] == iter->first) {
+                valid = true;
+                break;
+            }
+        if (!valid)
+            throw OpenMMException("Illegal property name: "+iter->first);
+    }
+    
    // Find the list of kernels required.
    
    vector<string> kernelNames;
    kernelNames.push_back(CalcForcesAndEnergyKernel::Name());
    kernelNames.push_back(UpdateStateDataKernel::Name());
+    kernelNames.push_back(ApplyConstraintsKernel::Name());
+    kernelNames.push_back(VirtualSitesKernel::Name());
    for (int i = 0; i < system.getNumForces(); ++i) {
        forceImpls.push_back(system.getForce(i).createImpl());
        map<string, double> forceParameters = forceImpls[forceImpls.size()-1]->getDefaultParameters();
@@ -90,14 +107,40 @@ ContextImpl::ContextImpl(Context& owner, const System& system, Integrator& integ
    hasInitializedForces = true;
    vector<string> integratorKernels = integrator.getKernelNames();
    kernelNames.insert(kernelNames.begin(), integratorKernels.begin(), integratorKernels.end());
-    if (platform == 0)
-        this->platform = platform = &Platform::findPlatform(kernelNames);
-    else if (!platform->supportsKernels(kernelNames))
-        throw OpenMMException("Specified a Platform for a Context which does not support all required kernels");
+    
+    // Select a platform to use.
+    
+    vector<pair<double, Platform*> > candidatePlatforms;
+    if (platform == NULL) {
+        for (int i = 0; i < Platform::getNumPlatforms(); i++) {
+            Platform& p = Platform::getPlatform(i);
+            if (p.supportsKernels(kernelNames))
+                candidatePlatforms.push_back(make_pair(p.getSpeed(), &p));
+        }
+        if (candidatePlatforms.size() == 0)
+            throw OpenMMException("No Platform supports all the requested kernels");
+        sort(candidatePlatforms.begin(), candidatePlatforms.end());
+    }
+    else {
+        if (!platform->supportsKernels(kernelNames))
+            throw OpenMMException("Specified a Platform for a Context which does not support all required kernels");
+        candidatePlatforms.push_back(make_pair(platform->getSpeed(), platform));
+    }
+    for (int i = candidatePlatforms.size()-1; i >= 0; i--) {
+        try {
+            this->platform = platform = candidatePlatforms[i].second;
+            platform->contextCreated(*this, properties);
+            break;
+        }
+        catch (...) {
+            if (i > 0)
+                continue;
+            throw;
+        }
+    }
    
    // Create and initialize kernels and other objects.
    
-    platform->contextCreated(*this, properties);
    initializeForcesKernel = platform->createKernel(CalcForcesAndEnergyKernel::Name(), *this);
    initializeForcesKernel.getAs<CalcForcesAndEnergyKernel>().initialize(system);
    updateStateDataKernel = platform->createKernel(UpdateStateDataKernel::Name(), *this);

--- a/openmmapi/src/ThreadPool.cpp
+++ b/openmmapi/src/ThreadPool.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/internal/ThreadPool.h"
+#include "openmm/internal/hardware.h"
+
+using namespace std;
+
+namespace OpenMM {
+
+/**
+ * The state handed to one worker thread when it is created.  The worker deletes its own
+ * ThreadData when it exits.
+ */
+class ThreadPool::ThreadData {
+public:
+    /**
+     * @param owner  the ThreadPool the worker belongs to
+     * @param index  the index of the worker within the pool
+     */
+    ThreadData(ThreadPool& owner, int index) : owner(owner), index(index), isDeleted(false), currentTask(NULL) {
+    }
+    ThreadPool& owner;
+    int index;
+    // Set by the ThreadPool destructor to tell the worker to exit.
+    bool isDeleted;
+    // The task to run next; NULL until ThreadPool::execute() assigns one.
+    Task* currentTask;
+};
+
+/**
+ * Entry point of every worker thread.  The worker repeatedly waits in syncThreads() and then
+ * runs the task currently assigned to it, until its ThreadData is flagged as deleted.
+ *
+ * @param args  the ThreadData for this worker; it is deleted here before the thread exits
+ */
+static void* threadBody(void* args) {
+    ThreadPool::ThreadData* self = reinterpret_cast<ThreadPool::ThreadData*>(args);
+
+    // Wait for the signal to start running.
+    self->owner.syncThreads();
+    while (!self->isDeleted) {
+        self->currentTask->execute(self->owner, self->index);
+
+        // Report completion and wait for the next signal.
+        self->owner.syncThreads();
+    }
+    delete self;
+    return 0;
+}
+
+/**
+ * Start one worker thread per processor reported by getNumProcessors(), then block until all
+ * of them have checked in through syncThreads() and are waiting for a task.
+ */
+ThreadPool::ThreadPool() {
+    numThreads = getNumProcessors();
+    pthread_cond_init(&startCondition, NULL);
+    pthread_cond_init(&endCondition, NULL);
+    pthread_mutex_init(&lock, NULL);
+    thread.resize(numThreads);
+
+    // Hold the lock while launching the workers; they cannot get through syncThreads() until
+    // this thread releases it by waiting on endCondition.
+    pthread_mutex_lock(&lock);
+    waitCount = 0;
+    for (int worker = 0; worker < numThreads; worker++) {
+        // The ThreadData constructor already clears isDeleted.
+        threadData.push_back(new ThreadData(*this, worker));
+        pthread_create(&thread[worker], NULL, threadBody, threadData.back());
+    }
+    while (waitCount < numThreads)
+        pthread_cond_wait(&endCondition, &lock);
+    pthread_mutex_unlock(&lock);
+}
+
+/**
+ * Shut down the pool: flag every worker for deletion, wake them all, join them, then destroy
+ * the mutex and condition variables.
+ */
+ThreadPool::~ThreadPool() {
+    // Each worker checks its own flag as soon as it returns from syncThreads().
+    for (int i = 0; i < (int) threadData.size(); i++)
+        threadData[i]->isDeleted = true;
+    // NOTE(review): the broadcast only reaches workers that are already blocked on startCondition;
+    // presumably callers never destroy the pool while a task is still running -- confirm.
+    pthread_mutex_lock(&lock);
+    pthread_cond_broadcast(&startCondition);
+    pthread_mutex_unlock(&lock);
+    // Wait for every worker to exit before tearing down the objects they use.
+    for (int i = 0; i < (int) thread.size(); i++)
+        pthread_join(thread[i], NULL);
+    pthread_mutex_destroy(&lock);
+    pthread_cond_destroy(&startCondition);
+    pthread_cond_destroy(&endCondition);
+}
+
+/**
+ * Report how many worker threads were created by the constructor.
+ */
+int ThreadPool::getNumThreads() const {
+    return this->numThreads;
+}
+
+/**
+ * Assign the task to every worker, then release the workers so that each one runs it.
+ */
+void ThreadPool::execute(Task& task) {
+    Task* const assigned = &task;
+    for (std::vector<ThreadData*>::iterator worker = threadData.begin(); worker != threadData.end(); ++worker)
+        (*worker)->currentTask = assigned;
+    resumeThreads();
+}
+
+/**
+ * Called by a worker thread: record that it has reached a synchronization point, notify the
+ * thread waiting on endCondition, and block until startCondition is signaled.
+ */
+void ThreadPool::syncThreads() {
+    pthread_mutex_lock(&lock);
+    waitCount++;
+    // Wake whoever is waiting on endCondition (the constructor or waitForThreads()) so it can
+    // re-check waitCount.
+    pthread_cond_signal(&endCondition);
+    // NOTE(review): this wait is not wrapped in a predicate loop, so a spurious wakeup (which
+    // POSIX permits) would let the worker continue before resumeThreads() is called.
+    pthread_cond_wait(&startCondition, &lock);
+    pthread_mutex_unlock(&lock);
+}
+
+/**
+ * Called by the master thread: block until every worker has entered syncThreads() since the
+ * last call to resumeThreads().
+ */
+void ThreadPool::waitForThreads() {
+    pthread_mutex_lock(&lock);
+    // Re-check the count after every wakeup; each worker signals endCondition once as it arrives.
+    while (waitCount < numThreads)
+        pthread_cond_wait(&endCondition, &lock);
+    pthread_mutex_unlock(&lock);
+}
+
+/**
+ * Release every worker that is blocked in syncThreads() and reset the arrival count for the
+ * next synchronization point.
+ */
+void ThreadPool::resumeThreads() {
+    pthread_mutex_lock(&lock);
+    // Reset before waking the workers, so their next syncThreads() calls count from zero.
+    waitCount = 0;
+    pthread_cond_broadcast(&startCondition);
+    pthread_mutex_unlock(&lock);
+}
+
+} // namespace OpenMM
--- a/platforms/cpu/CMakeLists.txt
+++ b/platforms/cpu/CMakeLists.txt
@@ -14,10 +14,6 @@
 #   libOpenMMCPU_static[_d].a
 #----------------------------------------------------

-IF (APPLE)
-    SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.6")
-ENDIF (APPLE)
-
 SUBDIRS (tests)

 # The source is organized into subdirectories, but we handle them all from
@@ -36,9 +32,9 @@ SET(STATIC_TARGET ${OPENMMCPU_LIBRARY_NAME}_static)

 # Ensure that debug libraries have "_d" appended to their names.
 # CMake gets this right on Windows automatically with this definition.
-IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+IF (MSVC)
    SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
-ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+ENDIF (MSVC)

 # But on Unix or Cygwin we have to add the suffix manually
 IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)

--- a/platforms/cpu/include/AlignedArray.h
+++ b/platforms/cpu/include/AlignedArray.h
+#ifndef OPENMM_ALIGNEDARRAY_H_
+#define OPENMM_ALIGNEDARRAY_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+namespace OpenMM {
+
+/**
+ * This class represents an array in memory whose starting point is guaranteed to
+ * be aligned with a 16 byte boundary.  This can improve the performance of vectorized
+ * code, since loads and stores are more efficient.
+ *
+ * The storage is raw memory obtained with new char[]: no constructor or destructor of T
+ * is ever run on the elements, so T is expected to be a plain data type such as float.
+ * Copying an AlignedArray makes a deep, byte for byte copy, so every array owns its own
+ * buffer and can safely be stored in standard containers.
+ */
+template <class T>
+class AlignedArray {
+public:
+    /**
+     * Default constructor, to allow AlignedArrays to be used inside collections.
+     * The array is empty and owns no memory.
+     */
+    AlignedArray() : dataSize(0), baseData(0), data(0) {
+    }
+    /**
+     * Create an Aligned array that contains a specified number of elements.
+     * The elements are not initialized.
+     */
+    AlignedArray(int size) : dataSize(0), baseData(0), data(0) {
+        allocate(size);
+    }
+    /**
+     * Create an AlignedArray that is a deep copy of another one.
+     */
+    AlignedArray(const AlignedArray& other) : dataSize(0), baseData(0), data(0) {
+        copyFrom(other);
+    }
+    ~AlignedArray() {
+        // delete[] on a null pointer does nothing, so no check is needed.
+        delete[] baseData;
+    }
+    /**
+     * Replace the contents of this array with a deep copy of another one.
+     */
+    AlignedArray& operator=(const AlignedArray& other) {
+        if (this != &other)
+            copyFrom(other);
+        return *this;
+    }
+    /**
+     * Get the number of elements in the array.
+     */
+    int size() const {
+        return dataSize;
+    }
+    /**
+     * Change the size of the array.  This may cause all contents to be lost.
+     */
+    void resize(int size) {
+        if (dataSize == size)
+            return;
+        // Release first and leave the array empty, so that a failed allocation cannot
+        // leave baseData pointing at memory that has already been freed.
+        release();
+        allocate(size);
+    }
+    /**
+     * Get a reference to an element of the array.  The index is not range checked.
+     */
+    T& operator[](int i) {
+        return data[i];
+    }
+    /**
+     * Get a const reference to an element of the array.  The index is not range checked.
+     */
+    const T& operator[](int i) const {
+        return data[i];
+    }
+private:
+    /**
+     * Allocate storage for the specified number of elements.  Any previous buffer must
+     * already have been released.  The members are only updated once the allocation
+     * has succeeded.
+     */
+    void allocate(int size) {
+        // Over-allocate by 16 bytes so that an address aligned to 16 bytes always exists
+        // inside the buffer, then round baseData+15 down to that address.
+        char* newBase = new char[size*sizeof(T)+16];
+        char* offsetData = newBase+15;
+        offsetData -= (long long)offsetData&0xF;
+        baseData = newBase;
+        data = (T*) offsetData;
+        dataSize = size;
+    }
+    /**
+     * Free the buffer (if any) and return the array to the empty state.
+     */
+    void release() {
+        delete[] baseData;
+        baseData = 0;
+        data = 0;
+        dataSize = 0;
+    }
+    /**
+     * Make this array a deep copy of another one.  The caller must ensure that
+     * other is not this object.
+     */
+    void copyFrom(const AlignedArray& other) {
+        if (other.baseData == 0) {
+            release();
+            return;
+        }
+        if (baseData == 0 || dataSize != other.dataSize) {
+            release();
+            allocate(other.dataSize);
+        }
+        // Copy the raw bytes, since the elements are never constructed as objects of type T.
+        const unsigned char* from = (const unsigned char*) other.data;
+        const unsigned char* const end = from+dataSize*sizeof(T);
+        unsigned char* to = (unsigned char*) data;
+        while (from != end)
+            *to++ = *from++;
+    }
+    int dataSize;
+    char* baseData;
+    T* data;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_ALIGNEDARRAY_H_*/
+
--- a/platforms/cpu/include/CpuBondForce.h
+++ b/platforms/cpu/include/CpuBondForce.h
+#ifndef OPENMM_CPUBONDFORCE_H_
+#define OPENMM_CPUBONDFORCE_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2014 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "ReferenceBondIxn.h"
+#include "windowsExportCpu.h"
+#include "openmm/internal/ThreadPool.h"
+#include <list>
+#include <set>
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * This class parallelizes the calculation of bonded forces.  initialize() decides which
+ * bonds each thread computes, and calculateForce() then evaluates all bonds through a
+ * ReferenceBondIxn supplied by the caller.
+ */
+class OPENMM_EXPORT_CPU CpuBondForce {
+public:
+    // Forward declaration; the class is defined in the implementation file.
+    class ComputeForceTask;
+    CpuBondForce();
+    /**
+     * Analyze the set of bonds and decide which to compute with each thread.
+     *
+     * @param numAtoms         the number of atoms in the system
+     * @param numBonds         the number of bonds
+     * @param numAtomsPerBond  the number of atoms involved in each bond
+     * @param bondAtoms        the atoms of each bond (presumably bondAtoms[i] holds the
+     *                         numAtomsPerBond atom indices of bond i -- TODO confirm)
+     * @param threads          the thread pool used to compute the forces
+     */
+    void initialize(int numAtoms, int numBonds, int numAtomsPerBond, int** bondAtoms, ThreadPool& threads);
+    /**
+     * Compute the forces from all bonds.
+     *
+     * @param atomCoordinates   the atom coordinates
+     * @param parameters        the bond parameters (presumably one array per bond -- TODO confirm)
+     * @param forces            the forces on the atoms
+     * @param totalEnergy       the total energy
+     * @param referenceBondIxn  the interaction used to evaluate each bond
+     */
+    void calculateForce(std::vector<OpenMM::RealVec>& atomCoordinates, RealOpenMM** parameters, std::vector<OpenMM::RealVec>& forces, 
+            RealOpenMM* totalEnergy, ReferenceBondIxn& referenceBondIxn);
+    /**
+     * This routine contains the code executed by each thread.
+     *
+     * @param threads      the thread pool executing the computation
+     * @param threadIndex  the index of the thread calling this routine
+     *
+     * The remaining arguments have the same meaning as in calculateForce().
+     */
+    void threadComputeForce(ThreadPool& threads, int threadIndex, std::vector<OpenMM::RealVec>& atomCoordinates, RealOpenMM** parameters,
+            std::vector<OpenMM::RealVec>& forces, RealOpenMM* totalEnergy, ReferenceBondIxn& referenceBondIxn);
+private:
+    // Helpers used while distributing bonds among threads.  Presumably atomThread records
+    // which thread each atom has been assigned to -- confirm against the implementation.
+    bool canAssignBond(int bond, int thread, std::vector<int>& atomThread);
+    void assignBond(int bond, int thread, std::vector<int>& atomThread, std::vector<int>& bondThread, std::vector<std::set<int> >& atomBonds, std::list<int>& candidateBonds);
+    int numBonds, numAtomsPerBond;
+    int** bondAtoms;
+    ThreadPool* threads;
+    // Presumably threadBonds[i] lists the bonds computed by thread i, and extraBonds holds
+    // the bonds that could not be assigned to a single thread -- TODO confirm.
+    std::vector<std::vector<int> > threadBonds;
+    std::vector<int> extraBonds;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CPUBONDFORCE_H_*/
--- a/platforms/cpu/include/CpuGBSAOBCForce.h
+++ b/platforms/cpu/include/CpuGBSAOBCForce.h
+
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef OPENMM_CPU_GBSAOBC_FORCE_H__
+#define OPENMM_CPU_GBSAOBC_FORCE_H__
+
+#include "AlignedArray.h"
+#include "openmm/internal/ThreadPool.h"
+#include "openmm/internal/vectorize.h"
+#include <set>
+#include <utility>
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * This class performs the force and energy computation for the GBSA OBC interaction,
+ * dividing the work among the threads of a ThreadPool.
+ */
+class CpuGBSAOBCForce {
+public:
+    // Forward declaration; the class is defined in the implementation file.
+    class ComputeTask;
+    CpuGBSAOBCForce();
+
+    /**
+     * Set the force to use a cutoff.
+     * 
+     * @param distance    the cutoff distance
+     */
+    void setUseCutoff(float distance);
+
+    /**
+     * Set the force to use periodic boundary conditions.  This requires that a cutoff has
+     * already been set, and the smallest side of the periodic box is at least twice the cutoff
+     * distance.
+     *
+     * @param periodicBoxSize     the X, Y, and Z widths of the periodic box
+     */
+    void setPeriodic(float* periodicBoxSize);
+
+    /**
+     * Set the solute dielectric constant.
+     */
+    void setSoluteDielectric(float dielectric);
+
+    /**
+     * Set the solvent dielectric constant.
+     */
+    void setSolventDielectric(float dielectric);
+    
+    /**
+     * Get the per-particle parameters (offset radius, scaled radius).
+     */
+    const std::vector<std::pair<float, float> >& getParticleParameters() const;
+    
+    /**
+     * Set the per-particle parameters (offset radius, scaled radius).
+     */
+    void setParticleParameters(const std::vector<std::pair<float, float> >& params);
+
+    /**
+     * Calculate the GBSA OBC interaction.
+     *
+     * @param posq             atom coordinates and charges
+     * @param threadForce      force arrays (forces added); presumably one array per thread -- TODO confirm
+     * @param totalEnergy      total energy
+     * @param threads          the thread pool to use
+     */
+    void computeForce(const AlignedArray<float>& posq, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
+
+    /**
+     * This routine contains the code executed by each thread.
+     *
+     * @param threads          the thread pool executing the computation
+     * @param threadIndex      the index of the thread calling this routine
+     */
+    void threadComputeForce(ThreadPool& threads, int threadIndex);
+
+private:
+    bool cutoff;
+    bool periodic;
+    float periodicBoxSize[3];
+    float cutoffDistance, soluteDielectric, solventDielectric;
+    std::vector<std::pair<float, float> > particleParams;        
+    std::vector<float> bornRadii;
+    // Presumably the vectors whose names start with "thread" hold one entry per thread,
+    // which are combined after the threads finish -- TODO confirm.
+    std::vector<std::vector<float> > threadBornForces;
+    std::vector<float> obcChain;
+    std::vector<double> threadEnergy;
+    // Presumably the lookup table used by fastLog(), with logDX the spacing between
+    // table points and logDXInv its inverse -- TODO confirm.
+    std::vector<float> logTable;
+    float logDX, logDXInv;
+    // The following variables are used to make information accessible to the individual threads.
+    float const* posq;
+    std::vector<AlignedArray<float> >* threadForce;
+    bool includeEnergy;
+    // NOTE(review): opaque pointer; presumably a counter shared by the threads to hand out work -- confirm.
+    void* atomicCounter;
+  
+    static const int NUM_TABLE_POINTS;
+    static const float TABLE_MIN;
+    static const float TABLE_MAX;
+
+    /**
+     * Compute the displacement and squared distance between a collection of points, optionally using
+     * periodic boundary conditions.
+     *
+     * @param posI         the position of the reference point
+     * @param x            the X coordinates of the other points
+     * @param y            the Y coordinates of the other points
+     * @param z            the Z coordinates of the other points
+     * @param dx           on exit, the X components of the displacements
+     * @param dy           on exit, the Y components of the displacements
+     * @param dz           on exit, the Z components of the displacements
+     * @param r2           on exit, the squared distances
+     * @param periodic     whether to apply periodic boundary conditions
+     * @param boxSize      the widths of the periodic box
+     * @param invBoxSize   presumably the inverse of each box width -- TODO confirm
+     */
+    void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+    
+    /**
+     * Evaluate log(x) using a lookup table for speed.
+     */
+    fvec4 fastLog(const fvec4& x);
+};
+
+} // namespace OpenMM
+
+// ---------------------------------------------------------------------------------------
+
+#endif // OPENMM_CPU_GBSAOBC_FORCE_H__
--- a/platforms/cpu/include/CpuKernels.h
+++ b/platforms/cpu/include/CpuKernels.h
@@ -32,22 +32,147 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

-#include "CpuPlatform.h"
+#include "CpuBondForce.h"
+#include "CpuGBSAOBCForce.h"
+#include "CpuLangevinDynamics.h"
 #include "CpuNeighborList.h"
 #include "CpuNonbondedForce.h"
+#include "CpuPlatform.h"
 #include "openmm/kernels.h"
 #include "openmm/System.h"

 namespace OpenMM {

+/**
+ * This kernel is invoked at the beginning and end of force and energy computations.  It gives the
+ * Platform a chance to clear buffers and do other initialization at the beginning, and to do any
+ * necessary work at the end to determine the final results.
+ */
+class CpuCalcForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
+public:
+    class InitForceTask; // forward declaration; presumably the task run by beginComputation() -- confirm
+    class SumForceTask; // forward declaration; presumably the task run by finishComputation() -- confirm
+    CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context);
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     */
+    void initialize(const System& system);
+    /**
+     * This is called at the beginning of each force/energy computation, before calcForcesAndEnergy() has been called on
+     * any ForceImpl.
+     *
+     * @param context       the context in which to execute this kernel
+     * @param includeForce  true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @param groups        a set of bit flags for which force groups to include
+     */
+    void beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
+    /**
+     * This is called at the end of each force/energy computation, after calcForcesAndEnergy() has been called on
+     * every ForceImpl.
+     *
+     * @param context       the context in which to execute this kernel
+     * @param includeForce  true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @param groups        a set of bit flags for which force groups to include
+     * @return the potential energy of the system.  This value is added to all values returned by ForceImpls'
+     * calcForcesAndEnergy() methods.  That is, each force kernel may <i>either</i> return its contribution to the
+     * energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
+     */
+    double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
+private:
+    CpuPlatform::PlatformData& data; // the platform data passed to the constructor
+    Kernel referenceKernel; // presumably the Reference platform kernel this class delegates to -- confirm
+};
+
+/**
+ * This kernel is invoked by PeriodicTorsionForce to calculate the forces acting on the system and the energy of the system.
+ */
+class CpuCalcPeriodicTorsionForceKernel : public CalcPeriodicTorsionForceKernel {
+public:
+    CpuCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data) :
+            CalcPeriodicTorsionForceKernel(name, platform), data(data), torsionIndexArray(NULL), torsionParamArray(NULL) {
+    }
+    ~CpuCalcPeriodicTorsionForceKernel();
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param force      the PeriodicTorsionForce this kernel will be used for
+     */
+    void initialize(const System& system, const PeriodicTorsionForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the PeriodicTorsionForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force);
+private:
+    CpuPlatform::PlatformData& data; // the platform data passed to the constructor
+    int numTorsions; // NOTE(review): not set by the constructor; presumably assigned in initialize() -- confirm
+    int **torsionIndexArray; // NULL until allocated; presumably the atom indices of each torsion -- confirm
+    RealOpenMM **torsionParamArray; // NULL until allocated; presumably the parameters of each torsion -- confirm
+    CpuBondForce bondForce; // parallelizes the calculation of the bonded forces across threads
+};
+
+/**
+ * This kernel is invoked by RBTorsionForce to calculate the forces acting on the system and the energy of the system.
+ */
+class CpuCalcRBTorsionForceKernel : public CalcRBTorsionForceKernel {
+public:
+    CpuCalcRBTorsionForceKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data) :
+            CalcRBTorsionForceKernel(name, platform), data(data), torsionIndexArray(NULL), torsionParamArray(NULL) {
+    }
+    ~CpuCalcRBTorsionForceKernel();
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param force      the RBTorsionForce this kernel will be used for
+     */
+    void initialize(const System& system, const RBTorsionForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the RBTorsionForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const RBTorsionForce& force);
+private:
+    CpuPlatform::PlatformData& data; // the platform data passed to the constructor
+    int numTorsions; // NOTE(review): not set by the constructor; presumably assigned in initialize() -- confirm
+    int **torsionIndexArray; // NULL until allocated; presumably the atom indices of each torsion -- confirm
+    RealOpenMM **torsionParamArray; // NULL until allocated; presumably the parameters of each torsion -- confirm
+    CpuBondForce bondForce; // parallelizes the calculation of the bonded forces across threads
+};
+
 /**
 * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
 */
 class CpuCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
-    CpuCalcNonbondedForceKernel(std::string name, const Platform& platform) : CalcNonbondedForceKernel(name, platform),
-            bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false) {
-    }
+    CpuCalcNonbondedForceKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data);
    ~CpuCalcNonbondedForceKernel();
    /**
     * Initialize the kernel.
@@ -76,6 +201,7 @@ public:
    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
 private:
    class PmeIO;
+    CpuPlatform::PlatformData& data;
    int numParticles, num14;
    int **bonded14IndexArray;
    double **bonded14ParamArray;
@@ -84,15 +210,88 @@ private:
    bool useSwitchingFunction, useOptimizedPme, hasInitializedPme;
    std::vector<std::set<int> > exclusions;
    std::vector<std::pair<float, float> > particleParams;
-    std::vector<float> posq;
-    std::vector<float> forces;
    std::vector<RealVec> lastPositions;
    NonbondedMethod nonbondedMethod;
-    CpuNeighborList neighborList;
-    CpuNonbondedForce nonbonded;
+    CpuNeighborList* neighborList;
+    CpuNonbondedForce* nonbonded;
    Kernel optimizedPme;
 };

+/**
+ * This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
+ */
+class CpuCalcGBSAOBCForceKernel : public CalcGBSAOBCForceKernel {
+public:
+    CpuCalcGBSAOBCForceKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcGBSAOBCForceKernel(name, platform),
+            data(data) {
+    }
+    ~CpuCalcGBSAOBCForceKernel();
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param force      the GBSAOBCForce this kernel will be used for
+     */
+    void initialize(const System& system, const GBSAOBCForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the GBSAOBCForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
+private:
+    CpuPlatform::PlatformData& data; // the platform data passed to the constructor
+    std::vector<std::pair<float, float> > particleParams; // presumably (offset radius, scaled radius) per particle, as in CpuGBSAOBCForce -- confirm
+    CpuGBSAOBCForce obc; // the object that performs the actual computation
+};
+
+/**
+ * This kernel is invoked by LangevinIntegrator to take one time step.
+ */
+class CpuIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
+public:
+    CpuIntegrateLangevinStepKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data) : IntegrateLangevinStepKernel(name, platform),
+        data(data), dynamics(NULL) {
+    }
+    ~CpuIntegrateLangevinStepKernel();
+    /**
+     * Initialize the kernel, setting up the particle masses.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param integrator the LangevinIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const LangevinIntegrator& integrator);
+    /**
+     * Execute the kernel.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the LangevinIntegrator this kernel is being used for
+     */
+    void execute(ContextImpl& context, const LangevinIntegrator& integrator);
+    /**
+     * Compute the kinetic energy.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the LangevinIntegrator this kernel is being used for
+     */
+    double computeKineticEnergy(ContextImpl& context, const LangevinIntegrator& integrator);
+private:
+    CpuPlatform::PlatformData& data; // the platform data passed to the constructor
+    CpuLangevinDynamics* dynamics; // NULL until created; presumably owned and deleted by this kernel -- confirm
+    std::vector<RealOpenMM> masses; // the particle masses set up by initialize()
+    double prevTemp, prevFriction, prevStepSize; // NOTE(review): not set by the constructor; presumably the settings last used -- confirm
+};
+
 } // namespace OpenMM

 #endif /*OPENMM_CPUKERNELS_H_*/

--- a/platforms/cpu/include/CpuLangevinDynamics.h
+++ b/platforms/cpu/include/CpuLangevinDynamics.h
+
+/* Portions copyright (c) 2013 Stanford University and Simbios.
+ * Authors: Peter Eastman
+ * Contributors: 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __CPU_LANGEVIN_DYNAMICS_H__
+#define __CPU_LANGEVIN_DYNAMICS_H__
+
+#include "ReferenceStochasticDynamics.h"
+#include "CpuRandom.h"
+#include "openmm/internal/ThreadPool.h"
+#include "sfmt/SFMT.h"
+
+// ---------------------------------------------------------------------------------------
+
+/**
+ * This class extends ReferenceStochasticDynamics, declaring its own versions of the two
+ * update steps that use a ThreadPool to divide the work among threads.
+ */
+class CpuLangevinDynamics : public ReferenceStochasticDynamics {
+public:
+    // Forward declarations; the classes are defined in the implementation file.
+    class Update1Task;
+    class Update2Task;
+    /**
+     * Constructor.
+     *
+     * @param numberOfAtoms  number of atoms
+     * @param deltaT         delta t for dynamics
+     * @param tau            viscosity
+     * @param temperature    temperature
+     * @param threads        thread pool for parallelizing computation
+     * @param random         random number generator
+     */
+    CpuLangevinDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM tau, RealOpenMM temperature, OpenMM::ThreadPool& threads, OpenMM::CpuRandom& random);
+
+    /**
+     * Destructor.
+     */
+    ~CpuLangevinDynamics();
+
+    /**
+     * First update step.
+     * 
+     * @param numberOfAtoms       number of atoms
+     * @param atomCoordinates     atom coordinates
+     * @param velocities          velocities
+     * @param forces              forces
+     * @param inverseMasses       inverse atom masses
+     * @param xPrime              xPrime
+     */
+    void updatePart1(int numberOfAtoms, std::vector<OpenMM::RealVec>& atomCoordinates, std::vector<OpenMM::RealVec>& velocities,
+                     std::vector<OpenMM::RealVec>& forces, std::vector<RealOpenMM>& inverseMasses, std::vector<OpenMM::RealVec>& xPrime);
+      
+    /**
+     * Second update step.
+     * 
+     * @param numberOfAtoms       number of atoms
+     * @param atomCoordinates     atom coordinates
+     * @param velocities          velocities
+     * @param forces              forces
+     * @param inverseMasses       inverse atom masses
+     * @param xPrime              xPrime
+     */
+    void updatePart2(int numberOfAtoms, std::vector<OpenMM::RealVec>& atomCoordinates, std::vector<OpenMM::RealVec>& velocities,
+                     std::vector<OpenMM::RealVec>& forces, std::vector<RealOpenMM>& inverseMasses, std::vector<OpenMM::RealVec>& xPrime);
+
+private:
+    // The code executed by each thread for the first and second update steps;
+    // threadIndex is the index of the thread making the call.
+    void threadUpdate1(int threadIndex);
+    void threadUpdate2(int threadIndex);
+    OpenMM::ThreadPool& threads;
+    OpenMM::CpuRandom& random;
+    // NOTE(review): presumably one random number generator state per thread -- confirm.
+    std::vector<OpenMM_SFMT::SFMT> threadRandom;
+    // The following variables are used to make information accessible to the individual threads.
+    int numberOfAtoms;
+    OpenMM::RealVec* atomCoordinates;
+    OpenMM::RealVec* velocities;
+    OpenMM::RealVec* forces;
+    RealOpenMM* inverseMasses;
+    OpenMM::RealVec* xPrime;
+};
+
+// ---------------------------------------------------------------------------------------
+
+#endif // __CPU_LANGEVIN_DYNAMICS_H__
--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
 #ifndef OPENMM_CPU_NEIGHBORLIST_H_
 #define OPENMM_CPU_NEIGHBORLIST_H_

+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "AlignedArray.h"
 #include "windowsExportCpu.h"
-#include <pthread.h>
+#include "openmm/internal/ThreadPool.h"
 #include <set>
 #include <utility>
 #include <vector>
@@ -11,13 +43,11 @@ namespace OpenMM {
    
 class OPENMM_EXPORT_CPU CpuNeighborList {
 public:
-    class ThreadData;
+    class ThreadTask;
    class Voxels;
-    static const int BlockSize;
-    CpuNeighborList();
-    ~CpuNeighborList();
-    void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
-            const float* periodicBoxSize, bool usePeriodic, float maxDistance);
+    CpuNeighborList(int blockSize);
+    void computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
+            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
    int getNumBlocks() const;
    const std::vector<int>& getSortedAtoms() const;
    const std::vector<int>& getBlockNeighbors(int blockIndex) const;
@@ -25,25 +55,13 @@ public:
    /**
     * This routine contains the code executed by each thread.
     */
+    void threadComputeNeighborList(ThreadPool& threads, int threadIndex);
    void runThread(int index);
 private:
-    /**
-     * This is called by the worker threads to wait until the master thread instructs them to advance.
-     */
-    void threadWait();
-    /**
-     * This is called by the master thread to instruct all the worker threads to advance.
-     */
-    void advanceThreads();
-    bool isDeleted;
-    int numThreads, waitCount;
+    int blockSize;
    std::vector<int> sortedAtoms;
    std::vector<std::vector<int> > blockNeighbors;
    std::vector<std::vector<char> > blockExclusions;
-    std::vector<pthread_t> thread;
-    std::vector<ThreadData*> threadData;
-    pthread_cond_t startCondition, endCondition;
-    pthread_mutex_t lock;
    // The following variables are used to make information accessible to the individual threads.
    float minx, maxx, miny, maxy, minz, maxz;
    std::vector<std::pair<int, int> > atomBins;
@@ -58,4 +76,4 @@ private:

 } // namespace OpenMM

-#endif // OPENMM_REFERENCE_NEIGHBORLIST_H_
+#endif // OPENMM_CPU_NEIGHBORLIST_H_
--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -25,10 +25,11 @@
 #ifndef OPENMM_CPU_NONBONDED_FORCE_H__
 #define OPENMM_CPU_NONBONDED_FORCE_H__

+#include "AlignedArray.h"
 #include "CpuNeighborList.h"
 #include "ReferencePairIxn.h"
+#include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
-#include <pthread.h>
 #include <set>
 #include <utility>
 #include <vector>
@@ -38,7 +39,7 @@ namespace OpenMM {

 class CpuNonbondedForce {
    public:
-        class ThreadData;
+        class ComputeDirectTask;

      /**---------------------------------------------------------------------------------------
      
@@ -47,15 +48,13 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */

       CpuNonbondedForce();
+       
+        /**
+         * Virtual destructor.
+         */

-      /**---------------------------------------------------------------------------------------
-      
-         Destructor
-      
-         --------------------------------------------------------------------------------------- */
-
-       ~CpuNonbondedForce();
-
+        virtual ~CpuNonbondedForce();
+        
      /**---------------------------------------------------------------------------------------
      
         Set the force to use a cutoff.
@@ -130,9 +129,9 @@ class CpuNonbondedForce {
            
         --------------------------------------------------------------------------------------- */
          
-      void calculateReciprocalIxn(int numberOfAtoms, float* posq, std::vector<RealVec>& atomCoordinates,
+      void calculateReciprocalIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates,
                            const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
-                            std::vector<RealVec>& forces, float* totalEnergy) const;
+                            std::vector<RealVec>& forces, double* totalEnergy) const;
      
      /**---------------------------------------------------------------------------------------
      
@@ -140,28 +139,31 @@ class CpuNonbondedForce {
      
         @param numberOfAtoms    number of atoms
         @param posq             atom coordinates and charges
+         @param atomCoordinates  atom coordinates (periodic boundary conditions not applied)
         @param atomParameters   atom parameters (sigma/2, 2*sqrt(epsilon))
         @param exclusions       atom exclusion indices
                                 exclusions[atomIndex] contains the list of exclusions for that atom
         @param forces           force array (forces added)
         @param totalEnergy      total energy
+         @param threads          the thread pool to use
      
         --------------------------------------------------------------------------------------- */
          
-      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<std::pair<float, float> >& atomParameters,
-            const std::vector<std::set<int> >& exclusions, float* forces, float* totalEnergy);
+      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
+            const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);

    /**
     * This routine contains the code executed by each thread.
     */
-    void runThread(int index, std::vector<float>& threadForce, double& threadEnergy);
+    void threadComputeDirect(ThreadPool& threads, int threadIndex);

-private:
+protected:
        bool cutoff;
        bool useSwitch;
        bool periodic;
        bool ewald;
        bool pme;
+        bool tableIsValid;
        const CpuNeighborList* neighborList;
        float periodicBoxSize[3];
        float cutoffDistance, switchingDistance;
@@ -171,18 +173,16 @@ private:
        int meshDim[3];
        std::vector<float> ewaldScaleTable;
        float ewaldDX, ewaldDXInv;
-        bool isDeleted;
-        int numThreads, waitCount;
-        std::vector<pthread_t> thread;
-        std::vector<ThreadData*> threadData;
-        pthread_cond_t startCondition, endCondition;
-        pthread_mutex_t lock;
+        std::vector<double> threadEnergy;
        // The following variables are used to make information accessible to the individual threads.
        int numberOfAtoms;
        float* posq;
+        RealVec const* atomCoordinates;
        std::pair<float, float> const* atomParameters;        
        std::set<int> const* exclusions;
+        std::vector<AlignedArray<float> >* threadForce;
        bool includeEnergy;
+        void* atomicCounter;

        static const float TWO_OVER_SQRT_PI;
        static const int NUM_TABLE_POINTS;
@@ -210,7 +210,7 @@ private:
            
         --------------------------------------------------------------------------------------- */
          
-      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      virtual void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) = 0;
            
      /**---------------------------------------------------------------------------------------
      
@@ -222,7 +222,7 @@ private:
            
         --------------------------------------------------------------------------------------- */
          
-      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      virtual void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) = 0;

      /**
       * Compute the displacement and squared distance between two points, optionally using
@@ -230,20 +230,15 @@ private:
       */
      void getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;

-      /**
-       * Compute a fast approximation to erfc(x).
-       */
-      static fvec4 erfcApprox(fvec4 x);
-
      /**
       * Create a lookup table for the scale factor used with Ewald and PME.
       */
      void tabulateEwaldScaleFactor();
-      
+
      /**
-       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
+       * Compute a fast approximation to erfc(x).
       */
-      fvec4 ewaldScaleFunction(fvec4 x);
+      static float erfcApprox(float x);
 };

 } // namespace OpenMM

--- a/platforms/cpu/include/CpuNonbondedForceVec4.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec4.h
+
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
+#define OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
+
+#include "CpuNonbondedForce.h"
+// ---------------------------------------------------------------------------------------
+
+namespace OpenMM {
+
+/**
+ * CpuNonbondedForce implementation whose helper routines operate on fvec4 values.
+ * It provides the block interaction routines that CpuNonbondedForce declares as
+ * pure virtual (calculateBlockIxn and calculateBlockEwaldIxn).
+ */
+class CpuNonbondedForceVec4 : public CpuNonbondedForce {
+public:
+      /**---------------------------------------------------------------------------------------
+      
+         Constructor
+      
+         --------------------------------------------------------------------------------------- */
+
+       CpuNonbondedForceVec4();
+
+protected:
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate all the interactions for one atom block.
+         Implements the pure virtual CpuNonbondedForce::calculateBlockIxn().
+      
+         @param blockIndex       the index of the atom block
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+         @param boxSize          the periodic box size (NOTE(review): presumably only meaningful
+                                 when periodic boundary conditions are enabled; confirm in the .cpp)
+         @param invBoxSize       NOTE(review): presumably the elementwise reciprocal of boxSize; confirm
+            
+         --------------------------------------------------------------------------------------- */
+          
+      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+            
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate all the interactions for one atom block; Ewald counterpart of calculateBlockIxn().
+         Implements the pure virtual CpuNonbondedForce::calculateBlockEwaldIxn().
+         NOTE(review): presumably selected when Ewald or PME is in use; confirm in the .cpp.
+      
+         @param blockIndex       the index of the atom block
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+         @param boxSize          the periodic box size
+         @param invBoxSize       NOTE(review): presumably the elementwise reciprocal of boxSize; confirm
+            
+         --------------------------------------------------------------------------------------- */
+          
+      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+
+      /**
+       * Compute the displacement and squared distance between a collection of points, optionally using
+       * periodic boundary conditions. The results are written to dx, dy, dz and r2.
+       *
+       * This shares its name with CpuNonbondedForce::getDeltaR() but takes a different argument list.
+       * NOTE(review): presumably posI is one atom's position and x/y/z hold the coordinates of four
+       * other atoms, one per vector lane; confirm in the .cpp.
+       */
+      void getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+
+      /**
+       * Compute a fast approximation to erfc(x) for every element of an fvec4.
+       */
+      static fvec4 erfcApprox(const fvec4& x);
+      
+      /**
+       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
+       */
+      fvec4 ewaldScaleFunction(const fvec4& x);
+};
+
+} // namespace OpenMM
+
+// ---------------------------------------------------------------------------------------
+
+#endif // OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
--- a/platforms/cpu/include/CpuNonbondedForceVec8.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec8.h
+
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
+#define OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
+
+#include "CpuNonbondedForce.h"
+
+#ifdef __AVX__
+
+#include "openmm/internal/vectorize8.h"
+
+// ---------------------------------------------------------------------------------------
+
+namespace OpenMM {
+
+/**
+ * CpuNonbondedForce implementation whose helper routines operate on fvec8 values.
+ * It provides the block interaction routines that CpuNonbondedForce declares as
+ * pure virtual. This header only declares the class when __AVX__ is defined.
+ */
+class CpuNonbondedForceVec8 : public CpuNonbondedForce {
+public:
+      /**
+       * Constructor.
+       */
+       CpuNonbondedForceVec8();
+
+protected:            
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate all the interactions for one atom block.
+         Implements the pure virtual CpuNonbondedForce::calculateBlockIxn().
+      
+         @param blockIndex       the index of the atom block
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+         @param boxSize          the periodic box size (still an fvec4 in this 8-wide variant)
+         @param invBoxSize       NOTE(review): presumably the elementwise reciprocal of boxSize; confirm
+            
+         --------------------------------------------------------------------------------------- */
+          
+      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+            
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate all the interactions for one atom block; Ewald counterpart of calculateBlockIxn().
+         Implements the pure virtual CpuNonbondedForce::calculateBlockEwaldIxn().
+         NOTE(review): presumably selected when Ewald or PME is in use; confirm in the .cpp.
+      
+         @param blockIndex       the index of the atom block
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+         @param boxSize          the periodic box size (still an fvec4 in this 8-wide variant)
+         @param invBoxSize       NOTE(review): presumably the elementwise reciprocal of boxSize; confirm
+            
+         --------------------------------------------------------------------------------------- */
+          
+      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+
+      /**
+       * Compute the displacement and squared distance between a collection of points, optionally using
+       * periodic boundary conditions. The results are written to dx, dy, dz and r2.
+       *
+       * This shares its name with CpuNonbondedForce::getDeltaR() but takes a different argument list.
+       * NOTE(review): presumably posI is one atom's position and x/y/z hold the coordinates of eight
+       * other atoms, one per vector lane; confirm in the .cpp.
+       */
+      void getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+
+      /**
+       * Compute a fast approximation to erfc(x) for every element of an fvec8.
+       */
+      static fvec8 erfcApprox(const fvec8& x);
+      
+      /**
+       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
+       */
+      fvec8 ewaldScaleFunction(const fvec8& x);
+};
+
+} // namespace OpenMM
+
+// ---------------------------------------------------------------------------------------
+
+#endif // __AVX__
+
+#endif // OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
--- a/platforms/cpu/include/CpuPlatform.h
+++ b/platforms/cpu/include/CpuPlatform.h
@@ -32,8 +32,13 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

+#include "AlignedArray.h"
+#include "CpuRandom.h"
 #include "ReferencePlatform.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/ThreadPool.h"
 #include "windowsExportCpu.h"
+#include <map>

 namespace OpenMM {
    
@@ -43,6 +48,7 @@ namespace OpenMM {

 class OPENMM_EXPORT_CPU CpuPlatform : public ReferencePlatform {
 public:
+    class PlatformData;
    CpuPlatform();
    const std::string& getName() const {
        static const std::string name = "CPU";
@@ -51,6 +57,25 @@ public:
    double getSpeed() const;
    bool supportsDoublePrecision() const;
    static bool isProcessorSupported();
+    void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
+    void contextDestroyed(ContextImpl& context) const;
+    /**
+     * We cannot use the standard mechanism for platform data, because that is already used by the superclass.
+     * Instead, we maintain a table that maps each ContextImpl to its PlatformData.
+     */
+    static PlatformData& getPlatformData(ContextImpl& context);
+private:
+    static std::map<ContextImpl*, PlatformData*> contextData;
+};
+
+class CpuPlatform::PlatformData { // Per-context data; CpuPlatform maps each ContextImpl to one of these (see getPlatformData()).
+public:
+    PlatformData(int numParticles);
+    AlignedArray<float> posq; // NOTE(review): presumably packed positions and charges, like the posq arguments of CpuNonbondedForce; confirm
+    std::vector<AlignedArray<float> > threadForce; // NOTE(review): presumably one force buffer per worker thread; confirm
+    ThreadPool threads; // thread pool owned by this context's data
+    bool isPeriodic; // NOTE(review): presumably whether periodic boundary conditions are in use; confirm who sets it
+    CpuRandom random; // multithreaded random number generator (see CpuRandom.h)
 };

 } // namespace OpenMM

--- a/platforms/cpu/include/CpuRandom.h
+++ b/platforms/cpu/include/CpuRandom.h
+#ifndef OPENMM_CPURANDOM_H_
+#define OPENMM_CPURANDOM_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "sfmt/SFMT.h"
+#include "windowsExportCpu.h"
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * This class provides a multithreaded random number generator.
+ *
+ * Every query takes the index of the calling thread, and the internal state is kept in
+ * vectors (threadRandom, nextGaussian, nextGaussianIsValid).
+ * NOTE(review): presumably those vectors hold one entry per thread so that threads never
+ * share generator state; confirm in CpuRandom.cpp.
+ *
+ * NOTE(review): the class stores raw SFMT pointers and declares a destructor but no copy
+ * constructor or assignment operator. If the destructor frees those pointers, copying a
+ * CpuRandom would be unsafe; confirm before passing instances by value.
+ */
+class OPENMM_EXPORT_CPU CpuRandom {
+public:
+    CpuRandom();
+    ~CpuRandom();
+    /**
+     * Prepare the generator for use.
+     *
+     * @param seed        the random number seed
+     * @param numThreads  the number of threads that will request random numbers
+     */
+    void initialize(int seed, int numThreads);
+    /**
+     * Get a Gaussian-distributed random number for the given thread.
+     * NOTE(review): the mean and variance are not visible in this header; confirm in the .cpp.
+     *
+     * @param threadIndex  the index of the calling thread
+     */
+    float getGaussianRandom(int threadIndex);
+    /**
+     * Get a uniformly distributed random number for the given thread.
+     * NOTE(review): the range (and whether its endpoints are included) is not visible in this header.
+     *
+     * @param threadIndex  the index of the calling thread
+     */
+    float getUniformRandom(int threadIndex);
+private:
+    // NOTE(review): presumably set once initialize() has run; confirm.
+    bool hasInitialized;
+    int randomSeed;
+    // SFMT generator objects, held by raw pointer.
+    std::vector<OpenMM_SFMT::SFMT*> threadRandom;
+    // NOTE(review): presumably a cached Gaussian value per thread plus a flag saying whether it is usable; confirm.
+    std::vector<float> nextGaussian;
+    std::vector<int> nextGaussianIsValid;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CPURANDOM_H_*/
--- a/platforms/cpu/include/CpuSETTLE.h
+++ b/platforms/cpu/include/CpuSETTLE.h
+#ifndef OPENMM_CPUSETTLE_H_
+#define OPENMM_CPUSETTLE_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "ReferenceSETTLEAlgorithm.h"
+#include "windowsExportCpu.h"
+#include "openmm/System.h"
+#include "openmm/internal/ThreadPool.h"
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * This class uses multiple ReferenceSETTLEAlgorithm objects to execute the algorithm in parallel.
+ *
+ * NOTE(review): the class stores raw ReferenceSETTLEAlgorithm pointers and a ThreadPool reference,
+ * and declares a destructor but no copy constructor. Confirm that instances are never copied.
+ */
+class OPENMM_EXPORT_CPU CpuSETTLE : public ReferenceConstraintAlgorithm {
+public:
+    class ApplyToPositionsTask;
+    class ApplyToVelocitiesTask;
+    /**
+     * Create a CpuSETTLE.
+     *
+     * @param system   the System being simulated
+     * @param settle   a ReferenceSETTLEAlgorithm describing the constraints
+     *                 (NOTE(review): presumably its constraints are divided among the threads; confirm)
+     * @param threads  the thread pool to use; stored by reference, so it must outlive this object
+     */
+    CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads);
+    ~CpuSETTLE();
+
+    /**
+     * Apply the constraint algorithm.
+     * 
+     * @param atomCoordinates  the original atom coordinates
+     * @param atomCoordinatesP the new atom coordinates
+     * @param inverseMasses    1/mass
+     * @param tolerance        the constraint tolerance
+     */
+    void apply(std::vector<OpenMM::RealVec>& atomCoordinates, std::vector<OpenMM::RealVec>& atomCoordinatesP, std::vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance);
+
+    /**
+     * Apply the constraint algorithm to velocities.
+     * 
+     * @param atomCoordinates  the atom coordinates
+     * @param velocities       the velocities to modify
+     * @param inverseMasses    1/mass
+     * @param tolerance        the constraint tolerance
+     */
+    void applyToVelocities(std::vector<OpenMM::RealVec>& atomCoordinates, std::vector<OpenMM::RealVec>& velocities, std::vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance);
+private:
+    // The ReferenceSETTLEAlgorithm objects used to do the work, held by raw pointer.
+    std::vector<ReferenceSETTLEAlgorithm*> threadSettle;
+    // The thread pool passed to the constructor (not owned).
+    ThreadPool& threads;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CPUSETTLE_H_*/
--- a/platforms/cpu/sharedTarget/CMakeLists.txt
+++ b/platforms/cpu/sharedTarget/CMakeLists.txt
-SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "-msse4.1")
+FOREACH(file ${SOURCE_FILES}) # Compile flags are chosen per source file so that only the Vec8 sources are built with AVX.
+    IF (file MATCHES ".*Vec8.*") # any source whose path contains "Vec8"
+		IF (MSVC)
+            SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX /D__AVX__") # __AVX__ is defined by hand; CpuNonbondedForceVec8.h is guarded by #ifdef __AVX__
+		ELSE (MSVC)
+            SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1 -mavx")
+		ENDIF (MSVC)
+    ELSE (file MATCHES ".*Vec8.*")
+		IF (NOT MSVC) # with MSVC, the remaining sources get no per-file flags here
+            SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
+		ENDIF (NOT MSVC)
+    ENDIF (file MATCHES ".*Vec8.*")
+ENDFOREACH(file)
 ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})

 IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
@@ -7,6 +19,6 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
 ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB})
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_CPU_BUILDING_SHARED_LIBRARY")
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CPU_BUILDING_SHARED_LIBRARY")

 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})