Conflict resolution in TestSplineFilter.cpp

0b5d58d7 · Charlles Abreu · 9026dbe7 · b0d13582 · 0b5d58d7 · 0b5d58d7
Commit 0b5d58d7 authored May 27, 2020 by Charlles Abreu
20 changed files
--- a/openmmapi/include/openmm/internal/vectorize.h
+++ b/openmmapi/include/openmm/internal/vectorize.h
@@ -32,7 +32,7 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

-#if defined(__ARM__)
+#if defined(__ARM__) || defined(__ARM64__)
    #include "vectorize_neon.h"
 #elif defined(__PPC__)
    #include "vectorize_ppc.h"

--- a/openmmapi/include/openmm/internal/vectorize8.h
+++ b/openmmapi/include/openmm/internal/vectorize8.h
@@ -46,11 +46,23 @@ class fvec8 {
 public:
    __m256 val;

-    fvec8() {}
+    fvec8() = default;
    fvec8(float v) : val(_mm256_set1_ps(v)) {}
    fvec8(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) : val(_mm256_set_ps(v8, v7, v6, v5, v4, v3, v2, v1)) {}
    fvec8(__m256 v) : val(v) {}
    fvec8(const float* v) : val(_mm256_loadu_ps(v)) {}
+
+    /** Create a vector by gathering individual indexes of data from a table. Element i of the vector will
+     * be loaded from table[idx[i]].
+     * @param table The table from which to do a lookup.
+     * @param indexes The indexes to gather.
+     */
+    fvec8(const float* table, const int idx[8]) {
+        // :TODO: Using int32_t explicitly as the index type could allow the real gather instruction to be used.
+        // Use gather and static assert? Conditional code?
+        val = _mm256_setr_ps(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]], table[idx[4]], table[idx[5]], table[idx[6]], table[idx[7]]);
+    }
+
    operator __m256() const {
        return val;
    }
@@ -115,6 +127,12 @@ public:
        return _mm256_cmp_ps(val, other, _CMP_LE_OQ);
    }
    operator ivec8() const;
+
+    /**
+     * Convert an integer bitmask into a full vector of elements which can be used
+     * by the blend function.
+     */
+    static fvec8 expandBitsToMask(int bitmask);
 };

 /**
@@ -160,6 +178,19 @@ inline ivec8::operator fvec8() const {
    return _mm256_cvtepi32_ps(val);
 }

+inline fvec8 fvec8::expandBitsToMask(int bitmask) {
+    // Put a copy of bit 0 in the first element, bit 1 in the second, and so on.
+    const auto expandedBits =
+      _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi8(bitmask)),
+                    _mm256_castsi256_ps(_mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128)));
+
+    // The individual bits are essentially extremely small floating-point values. By comparing against zero
+    //  (even a floating-point zero), the individual bits are turned into a complete element mask.
+    const auto elementMask = _mm256_cmp_ps(expandedBits, __m256(), _CMP_NEQ_OQ);
+
+    return elementMask;
+}
+
 // Functions that operate on fvec8s.

 static inline fvec8 floor(const fvec8& v) {
@@ -208,6 +239,11 @@ static inline float dot8(const fvec8& v1, const fvec8& v2) {
    return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());
 }

+static inline float reduceAdd(const fvec8 v) {
+    // :TODO: There are more efficient ways to do this.
+    return dot8(v, fvec8(1.0f));
+}
+
 static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in3, const fvec4& in4, const fvec4& in5, const fvec4& in6, const fvec4& in7, const fvec4& in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
    fvec4 i1 = in1, i2 = in2, i3 = in3, i4 = in4;
    fvec4 i5 = in5, i6 = in6, i7 = in7, i8 = in8;
@@ -231,6 +267,14 @@ static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in
    out4 = _mm256_insertf128_ps(out4, i8, 1);
 }

+/** Given a vec4[8] input array, generate 4 vec8 outputs. The first output contains all the first elements
+ * the second output the second elements, and so on. Note that the prototype is essentially differing only
+ * in output type so it can be overloaded in other SIMD fvec types.
+ */
+static inline void transpose(const fvec4 in[8], fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
+    transpose(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out1, out2, out3, out4);
+}
+
 static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) {
    out1 = in1.lowerVec();
    out2 = in2.lowerVec();
@@ -244,6 +288,13 @@ static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in
    _MM_TRANSPOSE4_PS(out5, out6, out7, out8);
 }

+/**
+ * Given 4 input vectors of 8 elements, transpose them to form 8 output vectors of 4 elements.
+ */
+static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4 out[8]) {
+    transpose(in1, in2, in3, in4, out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
+}
+
 // Functions that operate on ivec8s.

 static inline bool any(const ivec8& v) {
@@ -268,10 +319,100 @@ static inline fvec8 operator/(float v1, const fvec8& v2) {
    return fvec8(v1)/v2;
 }

-// Operations for blending fvec8s based on an ivec8.
+// Operation for blending fvec8 from a full bitmask.
+static inline fvec8 blend(const fvec8& v1, const fvec8& v2, const fvec8& mask) {
+    return fvec8(_mm256_blendv_ps(v1.val, v2.val, mask.val));
+}
+
+static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) {
+    return blend(0.0f, v, mask);
+}

-static inline fvec8 blend(const fvec8& v1, const fvec8& v2, const ivec8& mask) {
-    return fvec8(_mm256_blendv_ps(v1.val, v2.val, _mm256_castsi256_ps(mask.val)));
+/**
+ * Given a table of floating-point values and a set of indexes, perform a gather read into a pair
+ * of vectors. The first result vector contains the values at the given indexes, and the second
+ * result vector contains the values from each respective index+1.
+ */
+static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& out0, fvec8& out1) {
+
+    const auto lower = index.lowerVec();
+    const auto upper = index.upperVec();
+
+    // Gather all the separate memory data together. Each vector will have two values
+    // which get used, and two which are ultimately discarded.
+    fvec4 t0(table + lower[0]);
+    fvec4 t1(table + lower[1]);
+    fvec4 t2(table + lower[2]);
+    fvec4 t3(table + lower[3]);
+    fvec4 t4(table + upper[0]);
+    fvec4 t5(table + upper[1]);
+    fvec4 t6(table + upper[2]);
+    fvec4 t7(table + upper[3]);
+
+    // Tranposing the 8 vectors above will put all the first elements into one output
+    // vector, all the second elements into the next vector and so on.
+    fvec8 discard0, discard1;
+    transpose(t0, t1, t2, t3, t4, t5, t6, t7, out0, out1, discard0, discard1);
+}
+
+/**
+ * Given 3 vectors of floating-point data, reduce them to a single 3-element position
+ * value by adding all the elements in each vector. Given inputs of:
+ *   X0 X1 X2 X3 X4 X5 X6 X7
+ *   Y0 Y1 Y2 Y3 Y4 Y5 Y6 Y7
+ *   Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7
+ * Each vector of values needs to be summed into a single value, and then stored into
+ * the output vector:
+ *   output[0] = (X0 + X1 + X2 + ...)
+ *   output[1] = (Y0 + Y1 + Y2 + ...)
+ *   output[2] = (Z0 + Z1 + Z2 + ...)
+ *   output[3] = undefined
+ */
+static inline fvec4 reduceToVec3(const fvec8 x, const fvec8 y, const fvec8 z) {
+    // The general strategy for a vector reduce-add operation is to take values from
+    // different parts of the vector and overlap them a different part of the vector and then
+    // add together. Repeat this several times until all values have been summed. Initially 8
+    // values can be reduced to 4, 4 to 2, and 2 to 1. The following code essentially does this
+    // but exploits two things:
+    //   - having multiple inputs means that some vectors can be combined together to amortise the
+    //     cost of shuffling.
+    //   - the output destinations are part of anther vector, so accumulate into the correct
+    //     offsets to start with, instead of reducing to position 0 and re-inserting to the correct
+    //     output location.
+    //
+    // As far as possible, accumulate x, y and z into their output positions in both the top and
+    // bottom 128-bits to exploit in-lane permutes as much as possible early on.
+
+    // Shuffle X and Z together to form one reduced vector.
+    //   X2 X3 Z0 Z1 X6 X7 Z4 Z5
+    const auto xzshuf = _mm256_shuffle_ps(x, z, 0b01001110);
+    // Blend X and Z together to form another reduced vector, overlapping the previous.
+    //   X0 X1 Z2 Z3 X4 X5 Z6 Z7
+    const auto xzblend = _mm256_blend_ps(x, z, 0b11001100);
+    // Add them together to form:
+    // (X0 + X2) (X1 + X3) (Z0 + Z2) (Z1 + Z3) etc.
+    const auto xz0 = _mm256_add_ps(xzshuf, xzblend);
+
+    // Now there's only one vector containing all values. Shuffle again to form another overlap,
+    // and then add.
+    const auto xz1 = _mm256_permute_ps(xz0, 0b00110001);
+    const auto xz2 = _mm256_add_ps(xz0, xz1);
+
+    // Work on Z on its own as there's nothing else to work with. Start by permuting it to
+    // form some overlaps, and then add:
+    //   (Y0 + Y2) (Y1 + Y3) - - (Y4 + Y6) (Y5 + Y7) - -
+    const auto yshuf = _mm256_permute_ps(y, 0b11101110);
+    const auto y0 = _mm256_add_ps(yshuf, y);
+
+    // Shift the bottom float of each pair to the right, into the correct Y location.
+    const auto y1 = _mm256_permute_ps(y0, 0b00000000);
+    const auto y2 = _mm256_add_ps(y0, y1);
+
+    // Blend the results together to give a complete set of XYZ in the correct respective positions
+    // of both top and bottom 128-bit lanes.
+    const auto laneResult = fvec8(_mm256_blend_ps(xz2, y2, 0b00100010));
+
+    return laneResult.lowerVec() + laneResult.upperVec();
 }

 #endif /*OPENMM_VECTORIZE8_H_*/
--- a/openmmapi/include/openmm/internal/vectorize_neon.h
+++ b/openmmapi/include/openmm/internal/vectorize_neon.h
@@ -32,7 +32,12 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

+#ifdef __ANDROID__
 #include <cpu-features.h>
+#else
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+#endif
 #include <arm_neon.h>
 #include <cmath>

@@ -48,8 +53,16 @@ float32x4_t log_ps(float32x4_t);
 * Determine whether ivec4 and fvec4 are supported on this processor.
 */
 static bool isVec4Supported() {
+#ifdef __ANDROID__
    uint64_t features = android_getCpuFeatures();
    return (features & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
+#elif defined(__ARM__)
+    unsigned long features = getauxval(AT_HWCAP);
+    return (features & HWCAP_NEON) != 0;
+#else
+    unsigned long features = getauxval(AT_HWCAP);
+    return (features & HWCAP_ASIMD) != 0;
+#endif
 }

 class ivec4;
@@ -61,7 +74,7 @@ class fvec4 {
 public:
    float32x4_t val;

-    fvec4() {}
+    fvec4() = default;
    fvec4(float v) : val(vdupq_n_f32(v)) {}
    fvec4(float v1, float v2, float v3, float v4) {
        float v[] = {v1, v2, v3, v4};
@@ -72,6 +85,16 @@ public:
    operator float32x4_t() const {
        return val;
    }
+
+    /**
+      * Create a vector by gathering individual indexes of data from a table. Element i of the vector will
+      * be loaded from table[idx[i]].
+      * @param table The table from which to do a lookup.
+      * @param indexes The indexes to gather.
+      */
+    fvec4(const float* table, const int idx[4])
+        : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
+
    float operator[](int i) const {
        switch (i) {
            case 0:
@@ -88,6 +111,16 @@ public:
    void store(float* v) const {
        vst1q_f32(v, val);
    }
+
+    /**
+     * Store only the lower three elements of the vector.
+     */
+    void storeVec3(float* v) const {
+        v[0] = vgetq_lane_f32(val, 0);
+        v[1] = vgetq_lane_f32(val, 1);
+        v[2] = vgetq_lane_f32(val, 2);
+    }
+
    fvec4 operator+(const fvec4& other) const {
        return vaddq_f32(val, other);
    }
@@ -146,6 +179,13 @@ public:
        return vcvtq_f32_s32(vreinterpretq_s32_u32(vcleq_f32(val, other)));
    }
    operator ivec4() const;
+
+    /**
+     * Convert an integer bitmask into a full vector of elements which can be used
+     * by the blend function.
+     */
+    static ivec4 expandBitsToMask(int bitmask);
+
 };

 /**
@@ -241,6 +281,12 @@ inline ivec4::operator fvec4() const {
    return fvec4(vcvtq_f32_s32(val));
 }

+inline ivec4 fvec4::expandBitsToMask(int bitmask) {
+    return ivec4(bitmask & 1 ? -1 : 0,
+                 bitmask & 2 ? -1 : 0,
+                 bitmask & 4 ? -1 : 0,
+                 bitmask & 8 ? -1 : 0);
+}
 // Functions that operate on fvec4s.

 static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
@@ -284,6 +330,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
    return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
 }

+static inline float reduceAdd(const fvec4 v) {
+    return dot4(v, fvec4(1.0f));
+}
+
 static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
    return fvec4(v1[1]*v2[2] - v1[2]*v2[1],
                 v1[2]*v2[0] - v1[0]*v2[2],
@@ -301,6 +351,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
    v4 = t4.val[1];
 }

+/**
+ * Out-of-place transpose from an array into named variables.
+ */
+static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
+    v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
+    transpose(v0, v1, v2, v3);
+}
+
+/**
+ * Out-of-place transpose from named variables into an array.
+ */
+static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
+    transpose(out[0], out[1], out[2], out[3]);
+}
+
 // Functions that operate on ivec4s.

 static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
@@ -343,6 +409,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
    return vbslq_f32(vreinterpretq_u32_s32(mask), v2, v1);
 }

+static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
+    return blend(0.0f, v, mask);
+}
+
 // These are at the end since they involve other functions defined above.

 static inline fvec4 round(const fvec4& v) {
@@ -361,4 +431,38 @@ static inline fvec4 ceil(const fvec4& v) {
    return rounded + blend(0.0f, 1.0f, rounded<v);
 }

+/* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
+ * of vectors. The first result vector contains the values at the given indexes, and the second
+ * result vector contains the values from each respective index+1.
+ */
+static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+    fvec4 t0(table + index[0]);
+    fvec4 t1(table + index[1]);
+    fvec4 t2(table + index[2]);
+    fvec4 t3(table + index[3]);
+    transpose(t0, t1, t2, t3);
+    out0 = t0;
+    out1 = t1;
+}
+
+/**
+ * Given 3 vectors of floating-point data, reduce them to a single 3-element position
+ * value by adding all the elements in each vector. Given inputs of:
+ *   X0 X1 X2 X3
+ *   Y0 Y1 Y2 Y3
+ *   Z0 Z1 Z2 Z3
+ * Each vector of values needs to be summed into a single value, and then stored into
+ * the output vector:
+ *   output[0] = (X0 + X1 + X2 + X3)
+ *   output[1] = (Y0 + Y1 + Y2 + Y3)
+ *   output[2] = (Z0 + Z1 + Z2 + Z3)
+ *   output[3] = undefined
+ */
+static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+    const auto nx = reduceAdd(x);
+    const auto ny = reduceAdd(y);
+    const auto nz = reduceAdd(z);
+    return fvec4(nx, ny, nz, 0.0);
+}
+
 #endif /*OPENMM_VECTORIZE_NEON_H_*/
--- a/openmmapi/include/openmm/internal/vectorize_pnacl.h
+++ b/openmmapi/include/openmm/internal/vectorize_pnacl.h
@@ -56,7 +56,7 @@ class fvec4 {
 public:
    __m128 val;
    
-    fvec4() {}
+    fvec4() = default;
    fvec4(float v) {
        val = {v, v, v, v};
    }
@@ -67,6 +67,16 @@ public:
    fvec4(const float* v) {
        val = *((__m128*) v);
    }
+
+    /**
+     * Create a vector by gathering individual indexes of data from a table. Element i of the vector will
+     * be loaded from table[idx[i]].
+     * @param table The table from which to do a lookup.
+     * @param indexes The indexes to gather.
+     */
+    fvec4(const float* table, const int idx[4])
+        : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
+
    operator __m128() const {
        return val;
    }
@@ -76,6 +86,15 @@ public:
    void store(float* v) const {
        *((__m128*) v) = val;
    }
+
+    /**
+     * Store only the lower three elements of the vector.
+     */
+    void storeVec3(float* v) const {
+        v[0] = val[0];
+        v[1] = val[1];
+        v[2] = val[2];
+    }
    fvec4 operator+(const fvec4& other) const {
        return val+other;
    }
@@ -116,6 +135,13 @@ public:
    ivec4 operator>=(const fvec4& other) const;
    ivec4 operator<=(const fvec4& other) const;
    operator ivec4() const;
+
+    /**
+     * Convert an integer bitmask into a full vector of elements which can be used
+     * by the blend function.
+     */
+    static ivec4 expandBitsToMask(int bitmask);
+
 };

 /**
@@ -227,6 +253,13 @@ inline ivec4::operator fvec4() const {
    return __builtin_convertvector(val, __m128);
 }

+inline ivec4 fvec4::expandBitsToMask(int bitmask) {
+    return ivec4(bitmask & 1 ? -1 : 0,
+                 bitmask & 2 ? -1 : 0,
+                 bitmask & 4 ? -1 : 0,
+                 bitmask & 8 ? -1 : 0);
+}
+
 // Functions that operate on fvec4s.

 static inline fvec4 abs(const fvec4& v) {
@@ -252,6 +285,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
    return temp[0]+temp[1];
 }

+static inline float reduceAdd(const fvec4 v) {
+    return dot4(v, fvec4(1.0f));
+}
+
 static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
    __m128 temp = v2.val*__builtin_shufflevector(v1.val, v1.val, 2, 0, 1, 3) -
                  v1.val*__builtin_shufflevector(v2.val, v2.val, 2, 0, 1, 3);
@@ -269,6 +306,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
    v4 = __builtin_shufflevector(a2, a4, 2, 3, 6, 7);
 }

+/**
+ * Out-of-place transpose from an array into named variables.
+ */
+static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
+    v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
+    transpose(v0, v1, v2, v3);
+}
+
+/**
+ * Out-of-place transpose from named variables into an array.
+ */
+static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
+    transpose(out[0], out[1], out[2], out[3]);
+}
+
 // Functions that operate on ivec4s.

 static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
@@ -312,6 +365,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask)
    return (__m128) ((mask&(__m128i)v2) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1));
 }

+static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
+    return blend(0.0f, v, mask);
+}
+
 // These are at the end since they involve other functions defined above.

 static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
@@ -358,5 +415,40 @@ static inline fvec4 sqrt(const fvec4& v) {
    return rsqrt(v)*v;
 }

+/**
+ * Given a table of floating-point values and a set of indexes, perform a gather read into a pair
+ * of vectors. The first result vector contains the values at the given indexes, and the second
+ * result vector contains the values from each respective index+1.
+ */
+static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+    fvec4 t0(table + index[0]);
+    fvec4 t1(table + index[1]);
+    fvec4 t2(table + index[2]);
+    fvec4 t3(table + index[3]);
+    transpose(t0, t1, t2, t3);
+    out0 = t0;
+    out1 = t1;
+}
+
+/**
+ * Given 3 vectors of floating-point data, reduce them to a single 3-element position
+ * value by adding all the elements in each vector. Given inputs of:
+ *   X0 X1 X2 X3
+ *   Y0 Y1 Y2 Y3
+ *   Z0 Z1 Z2 Z3
+ * Each vector of values needs to be summed into a single value, and then stored into
+ * the output vector:
+ *   output[0] = (X0 + X1 + X2 + X3)
+ *   output[1] = (Y0 + Y1 + Y2 + Y3)
+ *   output[2] = (Z0 + Z1 + Z2 + Z3)
+ *   output[3] = undefined
+ */
+static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+    const auto nx = reduceAdd(x);
+    const auto ny = reduceAdd(y);
+    const auto nz = reduceAdd(z);
+    return fvec4(nx, ny, nz, 0.0);
+}
+
 #endif /*OPENMM_VECTORIZE_PNACL_H_*/

--- a/openmmapi/include/openmm/internal/vectorize_ppc.h
+++ b/openmmapi/include/openmm/internal/vectorize_ppc.h
@@ -57,7 +57,7 @@ class fvec4 {
 public:
    __m128 val;
    
-    fvec4() {}
+    fvec4() = default;
    fvec4(float v) {
        val = (__m128) {v, v, v, v};
    }
@@ -68,6 +68,16 @@ public:
    fvec4(const float* v) {
        val = *((__m128*) v);
    }
+
+    /**
+      * Create a vector by gathering individual indexes of data from a table. Element i of the vector will
+      * be loaded from table[idx[i]].
+      * @param table The table from which to do a lookup.
+      * @param indexes The indexes to gather.
+      */
+    fvec4(const float* table, const int idx[4])
+        : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
+
    operator __m128() const {
        return val;
    }
@@ -77,6 +87,16 @@ public:
    void store(float* v) const {
        *((__m128*) v) = val;
    }
+
+    /**
+      * Store only the lower three elements of the vector.
+      */
+    void storeVec3(float* v) const {
+        v[0] = val[0];
+        v[1] = val[1];
+        v[2] = val[2];
+    }
+
    fvec4 operator+(const fvec4& other) const {
        return vec_add(val, other.val);
    }
@@ -117,6 +137,13 @@ public:
    ivec4 operator>=(const fvec4& other) const;
    ivec4 operator<=(const fvec4& other) const;
    operator ivec4() const;
+
+    /***
+      * Convert an integer bitmask into a full vector of elements which can be used
+      * by the blend function.
+      */
+    static ivec4 expandBitsToMask(int bitmask);
+
 };

 /**
@@ -228,6 +255,13 @@ inline ivec4::operator fvec4() const {
    return (__m128) {(float)val[0], (float)val[1], (float)val[2], (float)val[3]};
 }

+inline ivec4 fvec4::expandBitsToMask(int bitmask) {
+    return ivec4(bitmask & 1 ? -1 : 0,
+                 bitmask & 2 ? -1 : 0,
+                 bitmask & 4 ? -1 : 0,
+                 bitmask & 8 ? -1 : 0);
+}
+
 // Functions that operate on fvec4s.

 static inline fvec4 abs(const fvec4& v) {
@@ -253,6 +287,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
    return temp[0]+temp[1];
 }

+static inline float reduceAdd(const fvec4 v) {
+    return dot4(v, fvec4(1.0f));
+}
+
 static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
    vector unsigned char perm = (vector unsigned char) {8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15};
    __m128 temp = v2.val*vec_perm(v1.val, v1.val, perm) -
@@ -275,6 +313,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
    v4 = vec_perm(a2, a4, perm4);
 }

+/**
+ * Out-of-place transpose from an array into named variables.
+ */
+static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
+    v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
+    transpose(v0, v1, v2, v3);
+}
+
+/**
+ * Out-of-place transpose from named variables into an array.
+ */
+static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
+    transpose(out[0], out[1], out[2], out[3]);
+}
+
 // Functions that operate on ivec4s.

 static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
@@ -289,8 +343,8 @@ static inline ivec4 abs(const ivec4& v) {
    return vec_abs(v.val);
 }

-static inline bool any(const __m128i& v) {
-    return !vec_all_eq(v, ivec4(0).val);
+static inline bool any(const ivec4 v) {
+    return !vec_all_eq(v.val, ivec4(0).val);
 }

 // Mathematical operators involving a scalar and a vector.
@@ -317,6 +371,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask)
    return (__m128) ((mask&(__m128i)v2.val) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1.val).val);
 }

+static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
+    return blend(0.0f, v, mask);
+}
+
 // These are at the end since they involve other functions defined above.

 static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
@@ -355,5 +413,39 @@ static inline fvec4 sqrt(const fvec4& v) {
    return vec_sqrt(v.val);
 }

+/* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
+ * of vectors. The first result vector contains the values at the given indexes, and the second
+ * result vector contains the values from each respective index+1.
+ */
+static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+    fvec4 t0(table + index[0]);
+    fvec4 t1(table + index[1]);
+    fvec4 t2(table + index[2]);
+    fvec4 t3(table + index[3]);
+    transpose(t0, t1, t2, t3);
+    out0 = t0;
+    out1 = t1;
+}
+
+/**
+ * Given 3 vectors of floating-point data, reduce them to a single 3-element position
+ * value by adding all the elements in each vector. Given inputs of:
+ *   X0 X1 X2 X3
+ *   Y0 Y1 Y2 Y3
+ *   Z0 Z1 Z2 Z3
+ * Each vector of values needs to be summed into a single value, and then stored into
+ * the output vector:
+ *   output[0] = (X0 + X1 + X2 + X3)
+ *   output[1] = (Y0 + Y1 + Y2 + Y3)
+ *   output[2] = (Z0 + Z1 + Z2 + Z3)
+ *   output[3] = undefined
+ */
+static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+    const auto nx = reduceAdd(x);
+    const auto ny = reduceAdd(y);
+    const auto nz = reduceAdd(z);
+    return fvec4(nx, ny, nz, 0.0);
+}
+
 #endif /*OPENMM_VECTORIZE_PPC_H_*/

--- a/openmmapi/include/openmm/internal/vectorize_sse.h
+++ b/openmmapi/include/openmm/internal/vectorize_sse.h
@@ -32,7 +32,12 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

+#ifdef __AVX__
+#include <immintrin.h>
+#else
 #include <smmintrin.h>
+#endif
+
 #include "hardware.h"

 // This file defines classes and functions to simplify vectorizing code with SSE.
@@ -63,11 +68,21 @@ class fvec4 {
 public:
    __m128 val;
    
-    fvec4() {}
+    fvec4() = default;
    fvec4(float v) : val(_mm_set1_ps(v)) {}
    fvec4(float v1, float v2, float v3, float v4) : val(_mm_set_ps(v4, v3, v2, v1)) {}
    fvec4(__m128 v) : val(v) {}
    fvec4(const float* v) : val(_mm_loadu_ps(v)) {}
+
+    /**
+     * Create a vector by gathering individual indexes of data from a table. Element i of the vector will
+     * be loaded from table[idx[i]].
+     * @param table The table from which to do a lookup.
+     * @param indexes The indexes to gather.
+     */
+    fvec4(const float* table, const int idx[4])
+        : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
+
    operator __m128() const {
        return val;
    }
@@ -79,6 +94,20 @@ public:
    void store(float* v) const {
        _mm_storeu_ps(v, val);
    }
+
+    /**
+     * Store only the lower three elements of the vector.
+     */
+    void storeVec3(float* v) const {
+        // This code could be called from objects compiled for better SIMD domains (e.g., AVX) so conditionally
+        // compile in the most efficient variant of the instruction.
+#ifdef  __AVX__
+        _mm_maskstore_ps(v, _mm_setr_epi32(-1, -1, -1, 0), val);
+#else
+        _mm_maskmoveu_si128 (_mm_castps_si128(val), _mm_setr_epi32(-1, -1, -1, 0), (char*)v);
+#endif
+    }
+
    fvec4 operator+(const fvec4& other) const {
        return _mm_add_ps(val, other);
    }
@@ -131,6 +160,12 @@ public:
        return _mm_cmple_ps(val, other);
    }
    operator ivec4() const;
+
+    /**
+     * Convert an integer bitmask into a full vector of elements which can be used
+     * by the blend function.
+     */
+    static fvec4 expandBitsToMask(int bitmask);
 };

 /**
@@ -214,6 +249,13 @@ inline ivec4::operator fvec4() const {
    return _mm_cvtepi32_ps(val);
 }

+inline fvec4 fvec4::expandBitsToMask(int bitmask) {
+    // Not optimal for SSE (see AVX implementation for better version)
+    // but useful as an example for other SIMD architectures.
+    const auto values = fvec4(bitmask & 1, bitmask & 2, bitmask & 4, bitmask & 8);
+    return values != fvec4(0.0f);
+}
+
 // Functions that operate on fvec4s.

 static inline fvec4 floor(const fvec4& v) {
@@ -273,6 +315,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
 }

+static inline float reduceAdd(const fvec4 v) {
+    return dot4(v, fvec4(1.0f));
+}
+
 static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
    fvec4 temp = fvec4(_mm_mul_ps(v1, _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 0, 2, 1)))) -
                 fvec4(_mm_mul_ps(v2, _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1))));
@@ -283,6 +329,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
    _MM_TRANSPOSE4_PS(v1, v2, v3, v4);
 }

+/**
+ * Out-of-place transpose from an array into named variables.
+ */
+static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
+    v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
+    transpose(v0, v1, v2, v3);
+}
+
+/**
+ * Out-of-place transpose from named variables into an array.
+ */
+static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
+    transpose(out[0], out[1], out[2], out[3]);
+}
+
 // Functions that operate on ivec4s.

 static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
@@ -319,10 +381,48 @@ static inline fvec4 operator/(float v1, const fvec4& v2) {
    return fvec4(v1)/v2;
 }

-// Operations for blending fvec4s based on an ivec4.
+// Operations for blending fvec4
+static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const fvec4& mask) {
+    return fvec4(_mm_blendv_ps(v1.val, v2.val, mask.val));
+}

-static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
-    return fvec4(_mm_blendv_ps(v1.val, v2.val, _mm_castsi128_ps(mask.val)));
+static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) {
+    return blend(0.0f, v, mask);
+}
+
+/* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
+ * of vectors. The first result vector contains the values at the given indexes, and the second
+ * result vector contains the values from each respective index+1.
+ */
+static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+    fvec4 t0(table + index[0]);
+    fvec4 t1(table + index[1]);
+    fvec4 t2(table + index[2]);
+    fvec4 t3(table + index[3]);
+    transpose(t0, t1, t2, t3);
+    out0 = t0;
+    out1 = t1;
+}
+
+/**
+ * Given 3 vectors of floating-point data, reduce them to a single 3-element position
+ * value by adding all the elements in each vector. Given inputs of:
+ *   X0 X1 X2 X3
+ *   Y0 Y1 Y2 Y3
+ *   Z0 Z1 Z2 Z3
+ * Each vector of values needs to be summed into a single value, and then stored into
+ * the output vector:
+ *   output[0] = (X0 + X1 + X2 + X3)
+ *   output[1] = (Y0 + Y1 + Y2 + Y3)
+ *   output[2] = (Z0 + Z1 + Z2 + Z3)
+ *   output[3] = undefined
+ */
+static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+    // :TODO: Could be made more efficient.
+    const auto nx = reduceAdd(x);
+    const auto ny = reduceAdd(y);
+    const auto nz = reduceAdd(z);
+    return fvec4(nx, ny, nz, 0.0);
 }

 #endif /*OPENMM_VECTORIZE_SSE_H_*/

--- a/openmmapi/src/CompoundIntegrator.cpp
+++ b/openmmapi/src/CompoundIntegrator.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2015-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2015-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -128,3 +128,15 @@ void CompoundIntegrator::stateChanged(State::DataType changed) {
 double CompoundIntegrator::computeKineticEnergy() {
    return integrators[currentIntegrator]->computeKineticEnergy();
 }
+
+void CompoundIntegrator::createCheckpoint(std::ostream& stream) const {
+    stream.write((char*) &currentIntegrator, sizeof(int));
+    for (int i = 0; i < integrators.size(); i++)
+        integrators[i]->createCheckpoint(stream);
+}
+
+void CompoundIntegrator::loadCheckpoint(std::istream& stream) {
+    stream.read((char*) &currentIntegrator, sizeof(int));
+    for (int i = 0; i < integrators.size(); i++)
+        integrators[i]->loadCheckpoint(stream);
+}
--- a/openmmapi/src/ContextImpl.cpp
+++ b/openmmapi/src/ContextImpl.cpp
@@ -454,6 +454,7 @@ void ContextImpl::createCheckpoint(ostream& stream) {
        stream.write((char*) &param.second, sizeof(double));
    }
    updateStateDataKernel.getAs<UpdateStateDataKernel>().createCheckpoint(*this, stream);
+    integrator.createCheckpoint(stream);
    stream.flush();
 }

@@ -480,6 +481,7 @@ void ContextImpl::loadCheckpoint(istream& stream) {
        parameters[name] = value;
    }
    updateStateDataKernel.getAs<UpdateStateDataKernel>().loadCheckpoint(*this, stream);
+    integrator.loadCheckpoint(stream);
    hasSetPositions = true;
    integrator.stateChanged(State::Positions);
    integrator.stateChanged(State::Velocities);

--- a/openmmapi/src/CustomIntegrator.cpp
+++ b/openmmapi/src/CustomIntegrator.cpp
@@ -114,6 +114,31 @@ bool CustomIntegrator::kineticEnergyRequiresForce() const {
    return keNeedsForce;
 }

+void CustomIntegrator::createCheckpoint(std::ostream& stream) const {
+    for (int i = 0; i < getNumGlobalVariables(); i++) {
+        double value = getGlobalVariable(i);
+        stream.write((char*) &value, sizeof(double));
+    }
+    vector<Vec3> values;
+    for (int i = 0; i < getNumPerDofVariables(); i++) {
+        getPerDofVariable(i, values);
+        stream.write((char*) values.data(), sizeof(Vec3)*values.size());
+    }
+}
+
+void CustomIntegrator::loadCheckpoint(std::istream& stream) {
+    double value;
+    for (int i = 0; i < getNumGlobalVariables(); i++) {
+        stream.read((char*) &value, sizeof(double));
+        setGlobalVariable(i, value);
+    }
+    vector<Vec3> values(context->getSystem().getNumParticles());
+    for (int i = 0; i < getNumPerDofVariables(); i++) {
+        stream.read((char*) values.data(), sizeof(Vec3)*values.size());
+        setPerDofVariable(i, values);
+    }
+}
+
 void CustomIntegrator::step(int steps) {
    if (context == NULL)
        throw OpenMMException("This Integrator is not bound to a context!");  

--- a/openmmapi/src/NoseHooverChain.cpp
+++ b/openmmapi/src/NoseHooverChain.cpp
@@ -55,7 +55,10 @@ std::vector<double> NoseHooverChain::getYoshidaSuzukiWeights() const {
        case 5:
            return {0.2967324292201065, 0.2967324292201065, -0.186929716880426, 0.2967324292201065,
                    0.2967324292201065};
+        case 7:
+            return {0.784513610477560, 0.235573213359357, -1.17767998417887, 1.31518632068391,
+                    -1.17767998417887, 0.235573213359357, 0.784513610477560};
        default:
-            throw OpenMMException("The number of Yoshida-Suzuki weights must be 1,3, or 5.");
+            throw OpenMMException("The number of Yoshida-Suzuki weights must be 1,3,5, or 7.");
    }
 }
--- a/openmmapi/src/NoseHooverIntegrator.cpp
+++ b/openmmapi/src/NoseHooverIntegrator.cpp
@@ -68,11 +68,6 @@ NoseHooverIntegrator::NoseHooverIntegrator(double temperature, double collisionF

 NoseHooverIntegrator::~NoseHooverIntegrator() {}

-std::pair<double, double> NoseHooverIntegrator::propagateChain(std::pair<double, double> kineticEnergy, int chainID) {
-    return nhcKernel.getAs<NoseHooverChainKernel>().propagateChain(*context, noseHooverChains.at(chainID), kineticEnergy, getStepSize());
-}
-
-
 int NoseHooverIntegrator::addThermostat(double temperature, double collisionFrequency,
                                        int chainLength, int numMTS, int numYoshidaSuzuki) {
    hasSubsystemThermostats_ = false;
@@ -271,10 +266,10 @@ double NoseHooverIntegrator::computeKineticEnergy() {
    double kE = 0.0;
    if(noseHooverChains.size() > 0) {
        for (const auto &nhc: noseHooverChains){
-            kE += nhcKernel.getAs<NoseHooverChainKernel>().computeMaskedKineticEnergy(*context, nhc, true).first;
+            kE += kernel.getAs<IntegrateNoseHooverStepKernel>().computeMaskedKineticEnergy(*context, nhc, true).first;
        }
    } else {
-        kE = vvKernel.getAs<IntegrateVelocityVerletStepKernel>().computeKineticEnergy(*context, *this);
+        kE = kernel.getAs<IntegrateNoseHooverStepKernel>().computeKineticEnergy(*context, *this);
    }
    return kE;
 }
@@ -287,7 +282,7 @@ double NoseHooverIntegrator::computeHeatBathEnergy() {
    double energy = 0;
    for(auto &nhc : noseHooverChains) {
        if (context && (nhc.getNumDegreesOfFreedom() > 0)) {
-            energy += nhcKernel.getAs<NoseHooverChainKernel>().computeHeatBathEnergy(*context, nhc);
+            energy += kernel.getAs<IntegrateNoseHooverStepKernel>().computeHeatBathEnergy(*context, nhc);
        }
    }
    return energy;
@@ -300,10 +295,8 @@ void NoseHooverIntegrator::initialize(ContextImpl& contextRef) {
    context = &contextRef;
    const System& system = context->getSystem();
    owner = &contextRef.getOwner();
-    vvKernel = context->getPlatform().createKernel(IntegrateVelocityVerletStepKernel::Name(), contextRef);
-    vvKernel.getAs<IntegrateVelocityVerletStepKernel>().initialize(contextRef.getSystem(), *this);
-    nhcKernel = context->getPlatform().createKernel(NoseHooverChainKernel::Name(), contextRef);
-    nhcKernel.getAs<NoseHooverChainKernel>().initialize();
+    kernel = context->getPlatform().createKernel(IntegrateNoseHooverStepKernel::Name(), contextRef);
+    kernel.getAs<IntegrateNoseHooverStepKernel>().initialize(contextRef.getSystem(), *this);
    forcesAreValid = false;

    // check for drude particles and build the Nose-Hoover Chains
@@ -329,34 +322,30 @@ void NoseHooverIntegrator::initialize(ContextImpl& contextRef) {
 }

 void NoseHooverIntegrator::cleanup() {
-    vvKernel = Kernel();
-    nhcKernel = Kernel();
+    kernel = Kernel();
 }

 vector<string> NoseHooverIntegrator::getKernelNames() {
    std::vector<std::string> names;
-    names.push_back(NoseHooverChainKernel::Name());
-    names.push_back(IntegrateVelocityVerletStepKernel::Name());
+    names.push_back(IntegrateNoseHooverStepKernel::Name());
    return names;
 }

 void NoseHooverIntegrator::step(int steps) {
    if (context == NULL)
        throw OpenMMException("This Integrator is not bound to a context!");
-    std::pair<double, double> scale, kineticEnergy;
    for (int i = 0; i < steps; ++i) {
        if(context->updateContextState())
            forcesAreValid = false;
-        for(auto &nhc : noseHooverChains) {
-            kineticEnergy = nhcKernel.getAs<NoseHooverChainKernel>().computeMaskedKineticEnergy(*context, nhc, false);
-            scale = nhcKernel.getAs<NoseHooverChainKernel>().propagateChain(*context, nhc, kineticEnergy, getStepSize());
-            nhcKernel.getAs<NoseHooverChainKernel>().scaleVelocities(*context, nhc, scale);
-        }
-        vvKernel.getAs<IntegrateVelocityVerletStepKernel>().execute(*context, *this, forcesAreValid);
-        for(auto &nhc : noseHooverChains) {
-            kineticEnergy = nhcKernel.getAs<NoseHooverChainKernel>().computeMaskedKineticEnergy(*context, nhc, false);
-            scale = nhcKernel.getAs<NoseHooverChainKernel>().propagateChain(*context, nhc, kineticEnergy, getStepSize());
-            nhcKernel.getAs<NoseHooverChainKernel>().scaleVelocities(*context, nhc, scale);
-        }
+        context->calcForcesAndEnergy(true, false);
+        kernel.getAs<IntegrateNoseHooverStepKernel>().execute(*context, *this, forcesAreValid);
    }
 }
+
+void NoseHooverIntegrator::createCheckpoint(std::ostream& stream) const {
+    kernel.getAs<IntegrateNoseHooverStepKernel>().createCheckpoint(*context, stream);
+}
+
+void NoseHooverIntegrator::loadCheckpoint(std::istream& stream) {
+    kernel.getAs<IntegrateNoseHooverStepKernel>().loadCheckpoint(*context, stream);
+}
\ No newline at end of file
--- a/openmmapi/src/SplineFitter.cpp
+++ b/openmmapi/src/SplineFitter.cpp
@@ -86,8 +86,8 @@ void SplineFitter::createPeriodicSpline(const vector<double>& x, const vector<do

    // Create the system of equations to solve.

-    vector<double> a(n), b(n), c(n), rhs(n);
-    a[0] = 0.0;
+    vector<double> a(n-1), b(n-1), c(n-1), rhs(n-1);
+    a[0] = x[n-1]-x[n-2];
    b[0] = 2.0*(x[1]-x[0]+x[n-1]-x[n-2]);
    c[0] = x[1]-x[0];
    rhs[0] = 6.0*((y[1]-y[0])/(x[1]-x[0]) - (y[n-1]-y[n-2])/(x[n-1]-x[n-2]));
@@ -97,17 +97,14 @@ void SplineFitter::createPeriodicSpline(const vector<double>& x, const vector<do
        c[i] = x[i+1]-x[i];
        rhs[i] = 6.0*((y[i+1]-y[i])/(x[i+1]-x[i]) - (y[i]-y[i-1])/(x[i]-x[i-1]));
    }
-    a[n-1] = 0.0;
-    b[n-1] = 1.0;
-    c[n-1] = 0.0;
-    rhs[n-1] = 0.0;
-    double beta = x[n-1]-x[n-2];
-    double alpha = -1.0;
+    double beta = a[0];
+    double alpha = c[n-2];
    double gamma = -b[0];

    // This is a cyclic tridiagonal matrix.  We solve it using the Sherman-Morrison method,
    // which involves solving two tridiagonal systems.

+    n--;
    b[0] -= gamma;
    b[n-1] -= alpha*beta/gamma;
    solveTridiagonalMatrix(a, b, c, rhs, deriv);
@@ -118,6 +115,7 @@ void SplineFitter::createPeriodicSpline(const vector<double>& x, const vector<do
    double scale = (deriv[0]+beta*deriv[n-1]/gamma)/(1.0+z[0]+beta*z[n-1]/gamma);
    for (int i = 0; i < n; i++)
        deriv[i] -= scale*z[i];
+    deriv[n] = deriv[0];
 }

 double SplineFitter::evaluateSpline(const vector<double>& x, const vector<double>& y, const vector<double>& deriv, double t) {

--- a/platforms/common/include/openmm/common/CommonKernels.h
+++ b/platforms/common/include/openmm/common/CommonKernels.h
@@ -944,11 +944,13 @@ private:
 /*
 * This kernel is invoked by NoseHooverIntegrator to take one time step.
 */
-class CommonIntegrateVelocityVerletStepKernel : public IntegrateVelocityVerletStepKernel {
+class CommonIntegrateNoseHooverStepKernel : public IntegrateNoseHooverStepKernel {
 public:
-    CommonIntegrateVelocityVerletStepKernel(std::string name, const Platform& platform, ComputeContext& cc) :
-                                  IntegrateVelocityVerletStepKernel(name, platform), cc(cc), hasInitializedKernels(false) { }
-    ~CommonIntegrateVelocityVerletStepKernel() {}
+    CommonIntegrateNoseHooverStepKernel(std::string name, const Platform& platform, ComputeContext& cc) :
+                                  IntegrateNoseHooverStepKernel(name, platform), cc(cc), hasInitializedKernels(false),
+                                  hasInitializedKineticEnergyKernel(false), hasInitializedHeatBathEnergyKernel(false),
+                                  hasInitializedScaleVelocitiesKernel(false), hasInitializedPropagateKernel(false) {}
+    ~CommonIntegrateNoseHooverStepKernel() {}
    /**
     * Initialize the kernel.
     *
@@ -972,39 +974,16 @@ public:
     * @param integrator the NoseHooverIntegrator this kernel is being used for
     */
    double computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator);
-private:
-    ComputeContext& cc;
-    float prevMaxPairDistance;
-    ComputeArray maxPairDistanceBuffer, pairListBuffer, atomListBuffer, pairTemperatureBuffer;
-    ComputeKernel kernel1, kernel2, kernel3, kernelHardWall;
-    bool hasInitializedKernels;
-};
-
-/**
- * This kernel is invoked by NoseHooverChain at the start of each time step to adjust the thermostat
- * and update the associated particle velocities.
- */
-class CommonNoseHooverChainKernel : public NoseHooverChainKernel {
-public:
-    CommonNoseHooverChainKernel(std::string name, const Platform& platform, ComputeContext& cc) :
-                 NoseHooverChainKernel(name, platform), cc(cc), hasInitializedPropagateKernel(false),
-                 hasInitializedKineticEnergyKernel(false), hasInitializedHeatBathEnergyKernel(false),
-                 hasInitializedScaleVelocitiesKernel(false) {}
-    ~CommonNoseHooverChainKernel() {}
-    /**
-     * Initialize the kernel.
-     */
-    void initialize();
    /**
     * Execute the kernel that propagates the Nose Hoover chain and determines the velocity scale factor.
     * 
     * @param context  the context in which to execute this kernel
     * @param noseHooverChain the object describing the chain to be propagated.
-     * @param kineticEnergies the {absolute, relative} kineticEnergy of the particles being thermostated by this chain.
+     * @param kineticEnergy the {center of mass, relative} kineticEnergies of the particles being thermostated by this chain.
     * @param timeStep the time step used by the integrator.
-     * @return the {absolute, relative} velocity scale factor to apply to the particles associated with this heat bath.
+     * @return the velocity scale factor to apply to the particles associated with this heat bath.
     */
-    std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep);
+    std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> kineticEnergy, double timeStep);
    /**
     * Execute the kernal that computes the total (kinetic + potential) heat bath energy.
     *
@@ -1012,7 +991,7 @@ public:
     * @param noseHooverChain the chain whose energy is to be determined.
     * @return the total heat bath energy.
     */
-    double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc);
+    double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain);
    /**
     * Execute the kernel that computes the kinetic energy for a subset of atoms,
     * or the relative kinetic energy of Drude particles with respect to their parent atoms
@@ -1020,22 +999,37 @@ public:
     * @param context the context in which to execute this kernel
     * @param noseHooverChain the chain whose energy is to be determined.
     * @param downloadValue whether the computed value should be downloaded and returned.
-     *
     */
-    std::pair<double,double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);
-
+    std::pair<double, double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);
    /**
     * Execute the kernel that scales the velocities of particles associated with a nose hoover chain
     *
     * @param context the context in which to execute this kernel
     * @param noseHooverChain the chain whose energy is to be determined.
-     * @param scaleFactors the {absolute, relative} multiplicative factor by which velocities are scaled.
+     * @param scaleFactor the multiplicative factor by which {absolute, relative} velocities are scaled.
     */
-    void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactors);
-
+    void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactor);
+    /**
+     * Write the chain states to a checkpoint.
+     */
+    void createCheckpoint(ContextImpl& context, std::ostream& stream) const;
+    /**
+     * Load the chain states from a checkpoint.
+     */
+    void loadCheckpoint(ContextImpl& context, std::istream& stream);
 private:
-    int sumWorkGroupSize;
    ComputeContext& cc;
+    float prevMaxPairDistance;
+    ComputeArray maxPairDistanceBuffer, pairListBuffer, atomListBuffer, pairTemperatureBuffer, oldDelta;
+    std::map<int, ComputeArray> chainState;
+    ComputeKernel kernel1, kernel2, kernel3, kernel4, kernelHardWall;
+    bool hasInitializedKernels;
+    ComputeKernel reduceEnergyKernel;
+    ComputeKernel computeHeatBathEnergyKernel;
+    ComputeKernel computeAtomsKineticEnergyKernel;
+    ComputeKernel computePairsKineticEnergyKernel;
+    ComputeKernel scaleAtomsVelocitiesKernel;
+    ComputeKernel scalePairsVelocitiesKernel;
    ComputeArray energyBuffer, scaleFactorBuffer, kineticEnergyBuffer, chainMasses, chainForces, heatBathEnergy;
    std::map<int, ComputeArray> atomlists, pairlists;
    std::map<int, ComputeKernel> propagateKernels;
@@ -1043,12 +1037,6 @@ private:
    bool hasInitializedKineticEnergyKernel;
    bool hasInitializedHeatBathEnergyKernel;
    bool hasInitializedScaleVelocitiesKernel;
-    ComputeKernel reduceEnergyKernel;
-    ComputeKernel computeHeatBathEnergyKernel;
-    ComputeKernel computeAtomsKineticEnergyKernel;
-    ComputeKernel computePairsKineticEnergyKernel;
-    ComputeKernel scaleAtomsVelocitiesKernel;
-    ComputeKernel scalePairsVelocitiesKernel;
 };

 /**

--- a/platforms/common/include/openmm/common/IntegrationUtilities.h
+++ b/platforms/common/include/openmm/common/IntegrationUtilities.h
@@ -130,12 +130,6 @@ public:
     * @param timeShift   the amount by which to shift the velocities in time
     */
    double computeKineticEnergy(double timeShift);
-    /**
-     * Get the data structure that holds the state of all Nose-Hoover chains
-     */
-    std::map<int, ComputeArray>& getNoseHooverChainState() {
-        return noseHooverChainState;
-    }
 protected:
    virtual void applyConstraintsImpl(bool constrainVelocities, double tol) = 0;
    ComputeContext& context;
@@ -174,7 +168,6 @@ protected:
    ComputeArray vsiteLocalCoordsWeights;
    ComputeArray vsiteLocalCoordsPos;
    ComputeArray vsiteLocalCoordsStartIndex;
-    std::map<int, ComputeArray> noseHooverChainState;
    int randomPos, lastSeed, numVsites;
    bool hasOverlappingVsites;
    mm_double2 lastStepSize;

--- a/platforms/common/src/CommonKernels.cpp
+++ b/platforms/common/src/CommonKernels.cpp
@@ -5640,20 +5640,68 @@ double CommonIntegrateLangevinMiddleStepKernel::computeKineticEnergy(ContextImpl
    return cc.getIntegrationUtilities().computeKineticEnergy(0.0);
 }

-void CommonIntegrateVelocityVerletStepKernel::initialize(const System& system, const NoseHooverIntegrator& integrator) {
+void CommonIntegrateNoseHooverStepKernel::initialize(const System& system, const NoseHooverIntegrator& integrator) {
    cc.initializeContexts();
+    bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
    map<string, string> defines;
    defines["BOLTZ"] = cc.doubleToString(BOLTZ);
-    ComputeProgram program = cc.compileProgram(CommonKernelSources::velocityVerlet, defines);
-    kernel1 = program->createKernel("integrateVelocityVerletPart1");
-    kernel2 = program->createKernel("integrateVelocityVerletPart2");
-    kernel3 = program->createKernel("integrateVelocityVerletPart3");
-    kernelHardWall = program->createKernel("integrateVelocityVerletHardWall");
+    ComputeProgram program = cc.compileProgram(CommonKernelSources::noseHooverIntegrator, defines);
+    kernel1 = program->createKernel("integrateNoseHooverMiddlePart1");
+    kernel2 = program->createKernel("integrateNoseHooverMiddlePart2");
+    kernel3 = program->createKernel("integrateNoseHooverMiddlePart3");
+    kernel4 = program->createKernel("integrateNoseHooverMiddlePart4");
+    if (useDouble) {
+        oldDelta.initialize<mm_double4>(cc, cc.getPaddedNumAtoms(), "oldDelta");
+    } else {
+        oldDelta.initialize<mm_float4>(cc, cc.getPaddedNumAtoms(), "oldDelta");
+    }
+    kernelHardWall = program->createKernel("integrateNoseHooverHardWall");
    prevMaxPairDistance = -1.0f;
    maxPairDistanceBuffer.initialize<float>(cc, 1, "maxPairDistanceBuffer");
+
+    int workGroupSize = std::min(cc.getMaxThreadBlockSize(), 512);
+    defines["WORK_GROUP_SIZE"] = std::to_string(workGroupSize);
+
+    defines["BEGIN_YS_LOOP"] = "const real arr[1] = {1.0};"
+                               "for(int i=0;i<1;++i) {"
+                               "const real ys = arr[i];";
+    defines["END_YS_LOOP"] = "}";
+    program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
+    propagateKernels[1] = program->createKernel("propagateNoseHooverChain");
+
+    defines["BEGIN_YS_LOOP"] = "const real arr[3] = {0.828981543588751, -0.657963087177502, 0.828981543588751};"
+                               "for(int i=0;i<3;++i) {"
+                               "const real ys = arr[i];";
+    program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
+    propagateKernels[3] = program->createKernel("propagateNoseHooverChain");
+
+    defines["BEGIN_YS_LOOP"] = "const real arr[5] = {0.2967324292201065, 0.2967324292201065, -0.186929716880426, 0.2967324292201065, 0.2967324292201065};"
+                               "for(int i=0;i<5;++i) {"
+                               "const real ys = arr[i];";
+    program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
+    propagateKernels[5] = program->createKernel("propagateNoseHooverChain");
+
+    defines["BEGIN_YS_LOOP"] = "const real arr[7] = {0.784513610477560, 0.235573213359357, -1.17767998417887, 1.31518632068391,-1.17767998417887, 0.235573213359357, 0.784513610477560};"
+                               "for(int i=0;i<7;++i) {"
+                               "const real ys = arr[i];";
+    program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
+    propagateKernels[7] = program->createKernel("propagateNoseHooverChain");
+    program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
+    reduceEnergyKernel = program->createKernel("reduceEnergyPair");
+
+    computeHeatBathEnergyKernel = program->createKernel("computeHeatBathEnergy");
+    computeAtomsKineticEnergyKernel = program->createKernel("computeAtomsKineticEnergy");
+    computePairsKineticEnergyKernel = program->createKernel("computePairsKineticEnergy");
+    scaleAtomsVelocitiesKernel = program->createKernel("scaleAtomsVelocities");
+    scalePairsVelocitiesKernel = program->createKernel("scalePairsVelocities");
+    int energyBufferSize = cc.getEnergyBuffer().getSize();
+    if (cc.getUseDoublePrecision() || cc.getUseMixedPrecision())
+        energyBuffer.initialize<mm_double2>(cc, energyBufferSize, "energyBuffer");
+    else
+        energyBuffer.initialize<mm_float2>(cc, energyBufferSize, "energyBuffer");
 }

-void CommonIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid) {
+void CommonIntegrateNoseHooverStepKernel::execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid) {
    IntegrationUtilities& integration = cc.getIntegrationUtilities();
    int paddedNumAtoms = cc.getPaddedNumAtoms();
    double dt = integrator.getStepSize();
@@ -5700,32 +5748,39 @@ void CommonIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, cons
        pairListBuffer.upload(tmp);
        pairTemperatureBuffer.upload(tmp2);
    }
-
+    int totalAtoms = cc.getNumAtoms();
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;
        kernel1->addArg(numAtoms);
        kernel1->addArg(numPairs);
        kernel1->addArg(paddedNumAtoms);
-        kernel1->addArg(cc.getIntegrationUtilities().getStepSize());
-        kernel1->addArg(cc.getPosq());
        kernel1->addArg(cc.getVelm());
        kernel1->addArg(cc.getLongForceBuffer());
-        kernel1->addArg(integration.getPosDelta());
+        kernel1->addArg(integration.getStepSize());
        kernel1->addArg(numAtoms > 0 ? atomListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
        kernel1->addArg(numPairs > 0 ? pairListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
-        if (cc.getUseMixedPrecision())
-            kernel1->addArg(cc.getPosqCorrection());
-        kernel2->addArg(numParticles);
-        kernel2->addArg(cc.getIntegrationUtilities().getStepSize());
-        kernel2->addArg(cc.getPosq());
+        kernel2->addArg(totalAtoms);
        kernel2->addArg(cc.getVelm());
        kernel2->addArg(integration.getPosDelta());
+        kernel2->addArg(oldDelta);
+        kernel2->addArg(integration.getStepSize());
+        kernel3->addArg(totalAtoms);
+        kernel3->addArg(cc.getVelm());
+        kernel3->addArg(integration.getPosDelta());
+        kernel3->addArg(oldDelta);
+        kernel3->addArg(integration.getStepSize());
+        kernel4->addArg(totalAtoms);
+        kernel4->addArg(cc.getPosq());
+        kernel4->addArg(cc.getVelm());
+        kernel4->addArg(integration.getPosDelta());
+        kernel4->addArg(oldDelta);
+        kernel4->addArg(integration.getStepSize());
        if (cc.getUseMixedPrecision())
-            kernel2->addArg(cc.getPosqCorrection());
+            kernel4->addArg(cc.getPosqCorrection());
        if (numPairs > 0) {
            kernelHardWall->addArg(numPairs);
            kernelHardWall->addArg(maxPairDistanceBuffer);
-            kernelHardWall->addArg(cc.getIntegrationUtilities().getStepSize());
+            kernelHardWall->addArg(integration.getStepSize());
            kernelHardWall->addArg(cc.getPosq());
            kernelHardWall->addArg(cc.getVelm());
            kernelHardWall->addArg(pairListBuffer);
@@ -5733,78 +5788,50 @@ void CommonIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, cons
            if (cc.getUseMixedPrecision())
                kernelHardWall->addArg(cc.getPosqCorrection());
        }
-        kernel3->addArg(numAtoms);
-        kernel3->addArg(numPairs);
-        kernel3->addArg(paddedNumAtoms);
-        kernel3->addArg(cc.getIntegrationUtilities().getStepSize());
-        kernel3->addArg(cc.getPosq());
-        kernel3->addArg(cc.getVelm());
-        kernel3->addArg(cc.getLongForceBuffer());
-        kernel3->addArg(integration.getPosDelta());
-        kernel3->addArg(numAtoms > 0 ? atomListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
-        kernel3->addArg(numPairs > 0 ? pairListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
-        if (cc.getUseMixedPrecision())
-            kernel3->addArg(cc.getPosqCorrection());
    }

    /*
-     * Carry out the integration
+     * Carry out the LF-middle integration (c.f. J. Phys. Chem. A 2019, 123, 6056−6079)
     */
-
-    // Advance the velocities a half step
+    // Velocity update
    kernel1->execute(std::max(numAtoms, numPairs));
-    integration.applyConstraints(integrator.getConstraintTolerance());
-    // Advance particle positions a full step
+    integration.applyVelocityConstraints(integrator.getConstraintTolerance());
+    // Position update
    kernel2->execute(numParticles);
+    // Apply the thermostat
+    int numChains = integrator.getNumThermostats();
+    for(int chain = 0; chain < numChains; ++chain) {
+        const auto &thermostatChain = integrator.getThermostat(chain);
+        auto KEs = computeMaskedKineticEnergy(context, thermostatChain, false);
+        auto scaleFactors = propagateChain(context, thermostatChain, KEs, dt);
+        scaleVelocities(context, thermostatChain, scaleFactors);
+    }
+    // Position update
+    kernel3->execute(numParticles);
+    integration.applyConstraints(integrator.getConstraintTolerance());
+    // Apply constraint forces
+    kernel4->execute(numAtoms);
    // Make sure any Drude-like particles have not wandered too far from home
    if (numPairs > 0) kernelHardWall->execute(numPairs);
    integration.computeVirtualSites();
-    context.calcForcesAndEnergy(true, false);
-    forcesAreValid = true;
-    // Update velocities another half step
-    kernel3->execute(std::max(numAtoms, numPairs));
-    integration.applyVelocityConstraints(integrator.getConstraintTolerance());

+    // Update the time and step count.
    cc.setTime(cc.getTime()+dt);
    cc.setStepCount(cc.getStepCount()+1);
    cc.reorderAtoms();
+
+    // Reduce UI lag.
+#ifdef WIN32
+    cc.flushQueue();
+#endif
 }

-double CommonIntegrateVelocityVerletStepKernel::computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator) {
+double CommonIntegrateNoseHooverStepKernel::computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator) {
    return cc.getIntegrationUtilities().computeKineticEnergy(0);
 }

-void CommonNoseHooverChainKernel::initialize() {
-    bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
-    map<string, string> defines;
-    int workGroupSize = std::min(cc.getMaxThreadBlockSize(), 512);
-    defines["WORK_GROUP_SIZE"] = std::to_string(workGroupSize);
-    defines["BEGIN_YS_LOOP"] = "const real arr[1] = {1.0}; for(int i=0;i<1;++i) { const real ys = arr[i];";
-    defines["END_YS_LOOP"] = "}";
-    ComputeProgram program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
-    propagateKernels[1] = program->createKernel("propagateNoseHooverChain");
-    defines["BEGIN_YS_LOOP"] = "const real arr[3] = {0.828981543588751, -0.657963087177502, 0.828981543588751}; for(int i=0;i<3;++i) { const real ys = arr[i];";
-    program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
-    propagateKernels[3] = program->createKernel("propagateNoseHooverChain");
-    defines["BEGIN_YS_LOOP"] = "const real arr[5] = {0.2967324292201065, 0.2967324292201065, -0.186929716880426, 0.2967324292201065, 0.2967324292201065}; for(int i=0;i<5;++i) { const real ys = arr[i];";
-    program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
-    propagateKernels[5] = program->createKernel("propagateNoseHooverChain");
-    program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
-    reduceEnergyKernel = program->createKernel("reduceEnergyPair");
-
-    computeHeatBathEnergyKernel = program->createKernel("computeHeatBathEnergy");
-    computeAtomsKineticEnergyKernel = program->createKernel("computeAtomsKineticEnergy");
-    computePairsKineticEnergyKernel = program->createKernel("computePairsKineticEnergy");
-    scaleAtomsVelocitiesKernel = program->createKernel("scaleAtomsVelocities");
-    scalePairsVelocitiesKernel = program->createKernel("scalePairsVelocities");
-    int energyBufferSize = cc.getEnergyBuffer().getSize();
-    if (cc.getUseDoublePrecision() || cc.getUseMixedPrecision())
-        energyBuffer.initialize<mm_double2>(cc, energyBufferSize, "energyBuffer");
-    else
-        energyBuffer.initialize<mm_float2>(cc, energyBufferSize, "energyBuffer");
-}

-std::pair<double, double> CommonNoseHooverChainKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep) {
+std::pair<double, double> CommonIntegrateNoseHooverStepKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep) {
    bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
    int chainID = nhc.getChainID();
    int nAtoms = nhc.getThermostatedAtoms().size();
@@ -5813,12 +5840,10 @@ std::pair<double, double> CommonNoseHooverChainKernel::propagateChain(ContextImp
    int numYS = nhc.getNumYoshidaSuzukiTimeSteps();
    int numMTS = nhc.getNumMultiTimeSteps();

-    if (numYS != 1 && numYS != 3 && numYS != 5) {
-        throw OpenMMException("Number of Yoshida Suzuki time steps has to be 1, 3, or 5.");
+    if (numYS != 1 && numYS != 3 && numYS != 5 && numYS != 7) {
+        throw OpenMMException("Number of Yoshida Suzuki time steps has to be 1, 3, 5, or 7.");
    }

-    auto & chainState = cc.getIntegrationUtilities().getNoseHooverChainState();
-
    if (!scaleFactorBuffer.isInitialized() || scaleFactorBuffer.getSize() == 0) {
        if (useDouble) {
            std::vector<mm_double2> zeros{{0,0}};
@@ -5973,15 +5998,13 @@ std::pair<double, double> CommonNoseHooverChainKernel::propagateChain(ContextImp
    return {0, 0};
 }

-double CommonNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc) {
+double CommonIntegrateNoseHooverStepKernel::computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc) {

    bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();

    int chainID = nhc.getChainID();
    int chainLength = nhc.getChainLength();

-    auto & chainState = cc.getIntegrationUtilities().getNoseHooverChainState();
-
    bool absChainIsValid = chainState.count(2*chainID) != 0 &&
                           chainState[2*chainID].isInitialized() &&
                           chainState[2*chainID].getSize() == chainLength;
@@ -6058,7 +6081,7 @@ double CommonNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context,
        return *((float*) pinnedBuffer);
 }

-std::pair<double, double> CommonNoseHooverChainKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &nhc, bool downloadValue) {
+std::pair<double, double> CommonIntegrateNoseHooverStepKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &nhc, bool downloadValue) {

    bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();

@@ -6153,7 +6176,7 @@ std::pair<double, double> CommonNoseHooverChainKernel::computeMaskedKineticEnerg
    return KEs;
 }

-void CommonNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> scaleFactor) {
+void CommonIntegrateNoseHooverStepKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> scaleFactor) {
    // For now we assume that the atoms and pairs info is valid, because compute{Atoms|Pairs}KineticEnergy must have been
    // called before this kernel.  If that ever ceases to be true, some sanity checks are needed here.

@@ -6184,6 +6207,54 @@ void CommonNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const No
    }
 }

+void CommonIntegrateNoseHooverStepKernel::createCheckpoint(ContextImpl& context, ostream& stream) const {
+    int numChains = chainState.size();
+    bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
+    stream.write((char*) &numChains, sizeof(int));
+    for (auto& state : chainState){
+        int chainID = state.first;
+        int chainLength = state.second.getSize();
+        stream.write((char*) &chainID, sizeof(int));
+        stream.write((char*) &chainLength, sizeof(int));
+        if (useDouble) {
+            vector<mm_double2> stateVec;
+            state.second.download(stateVec);
+            stream.write((char*) stateVec.data(), sizeof(mm_double2)*chainLength);
+        }
+        else {
+            vector<mm_float2> stateVec;
+            state.second.download(stateVec);
+            stream.write((char*) stateVec.data(), sizeof(mm_float2)*chainLength);
+        }
+    }
+}
+
+void CommonIntegrateNoseHooverStepKernel::loadCheckpoint(ContextImpl& context, istream& stream) {
+    int numChains;
+    bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
+    stream.read((char*) &numChains, sizeof(int));
+    chainState.clear();
+    for (int i = 0; i < numChains; i++) {
+        int chainID, chainLength;
+        stream.read((char*) &chainID, sizeof(int));
+        stream.read((char*) &chainLength, sizeof(int));
+        if (useDouble) {
+            chainState[chainID] = ComputeArray();
+            chainState[chainID].initialize<mm_double2>(cc, chainLength, "chainState" + to_string(chainID));
+            vector<mm_double2> stateVec(chainLength);
+            stream.read((char*) &stateVec[0], sizeof(mm_double2)*chainLength);
+            chainState[chainID].upload(stateVec);
+        }
+        else {
+            chainState[chainID] = ComputeArray();
+            chainState[chainID].initialize<mm_float2>(cc, chainLength, "chainState" + to_string(chainID));
+            vector<mm_float2> stateVec(chainLength);
+            stream.read((char*) &stateVec[0], sizeof(mm_float2)*chainLength);
+            chainState[chainID].upload(stateVec);
+        }
+    }
+}
+
 void CommonIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) {
    cc.initializeContexts();
    cc.setAsCurrent();

--- a/platforms/common/src/IntegrationUtilities.cpp
+++ b/platforms/common/src/IntegrationUtilities.cpp
@@ -753,25 +753,6 @@ int IntegrationUtilities::prepareRandomNumbers(int numValues) {
 }

 void IntegrationUtilities::createCheckpoint(ostream& stream) {
-    int numChains = noseHooverChainState.size();
-    bool useDouble = context.getUseDoublePrecision() || context.getUseMixedPrecision();
-    stream.write((char*) &numChains, sizeof(int));
-    for (auto &chainState: noseHooverChainState){
-        int chainID = chainState.first;
-        int chainLength = chainState.second.getSize();
-        stream.write((char*) &chainID, sizeof(int));
-        stream.write((char*) &chainLength, sizeof(int));
-        if (useDouble) {
-            vector<mm_double2> stateVec;
-            chainState.second.download(stateVec);
-            stream.write((char*) stateVec.data(), sizeof(mm_double2)*chainLength);
-        }
-        else {
-            vector<mm_float2> stateVec;
-            chainState.second.download(stateVec);
-            stream.write((char*) stateVec.data(), sizeof(mm_float2)*chainLength);
-        }
-    }
    if (!random.isInitialized())
        return;
    stream.write((char*) &randomPos, sizeof(int));
@@ -784,29 +765,6 @@ void IntegrationUtilities::createCheckpoint(ostream& stream) {
 }

 void IntegrationUtilities::loadCheckpoint(istream& stream) {
-    int numChains;
-    bool useDouble = context.getUseDoublePrecision() || context.getUseMixedPrecision();
-    stream.read((char*) &numChains, sizeof(int));
-    noseHooverChainState.clear();
-    for (int i = 0; i < numChains; i++) {
-        int chainID, chainLength;
-        stream.read((char*) &chainID, sizeof(int));
-        stream.read((char*) &chainLength, sizeof(int));
-        if (useDouble) {
-            noseHooverChainState[chainID] = ComputeArray();
-            noseHooverChainState[chainID].initialize<mm_double2>(context, chainLength, "chainState" + to_string(chainID));
-            vector<mm_double2> stateVec(chainLength);
-            stream.read((char*) &stateVec[0], sizeof(mm_double2)*chainLength);
-            noseHooverChainState[chainID].upload(stateVec);
-        }
-        else {
-            noseHooverChainState[chainID] = ComputeArray();
-            noseHooverChainState[chainID].initialize<mm_float2>(context, chainLength, "chainState" + to_string(chainID));
-            vector<mm_float2> stateVec(chainLength);
-            stream.read((char*) &stateVec[0], sizeof(mm_float2)*chainLength);
-            noseHooverChainState[chainID].upload(stateVec);
-        }
-    }
    if (!random.isInitialized())
        return;
    stream.read((char*) &randomPos, sizeof(int));

--- a/platforms/common/src/kernels/cmapTorsionForce.cc
+++ b/platforms/common/src/kernels/cmapTorsionForce.cc
@@ -61,8 +61,8 @@ angleB = fmod(angleB+2.0f*PI, 2.0f*PI);
 int2 pos = MAP_POS[MAPS[index]];
 int size = pos.y;
 real delta = 2*PI/size;
-int s = (int) (angleA/delta);
-int t = (int) (angleB/delta);
+int s = (int) fmin(angleA/delta, size-1);
+int t = (int) fmin(angleB/delta, size-1);
 float4 c[4];
 int coeffIndex = pos.x+4*(s+size*t);
 c[0] = COEFF[coeffIndex];

--- a/platforms/common/src/kernels/customIntegratorPerDof.cc
+++ b/platforms/common/src/kernels/customIntegratorPerDof.cc
@@ -3,8 +3,8 @@ typedef double TempType;
 typedef double3 TempType3;
 typedef double4 TempType4;

-#define make_TempType3(a...) make_double3(a)
-#define make_TempType4(a...) make_double4(a)
+#define make_TempType3(...) make_double3(__VA_ARGS__)
+#define make_TempType4(...) make_double4(__VA_ARGS__)
 #define convertToTempType3(a) make_double3((a).x, (a).y, (a).z)
 #define convertToTempType4(a) make_double4((a).x, (a).y, (a).z, (a).w)

@@ -16,8 +16,8 @@ typedef float TempType;
 typedef float3 TempType3;
 typedef float4 TempType4;

-#define make_TempType3(a...) make_float3(a)
-#define make_TempType4(a...) make_float4(a)
+#define make_TempType3(...) make_float3(__VA_ARGS__)
+#define make_TempType4(...) make_float4(__VA_ARGS__)
 #define convertToTempType3(a) make_float3((a).x, (a).y, (a).z)
 #define convertToTempType4(a) make_float4((a).x, (a).y, (a).z, (a).w)
 #endif

--- a/platforms/common/src/kernels/langevinMiddle.cc
+++ b/platforms/common/src/kernels/langevinMiddle.cc
@@ -28,7 +28,7 @@ KERNEL void integrateLangevinMiddlePart2(int numAtoms, GLOBAL mixed4* RESTRICT v
        ) {
    mixed vscale = paramBuffer[VelScale];
    mixed noisescale = paramBuffer[NoiseScale];
-    mixed halfdt = 0.5*dt[0].y;
+    mixed halfdt = 0.5f*dt[0].y;
    int index = GLOBAL_ID;
    randomIndex += index;
    while (index < numAtoms) {

--- a/platforms/common/src/kernels/noseHooverChain.cc
+++ b/platforms/common/src/kernels/noseHooverChain.cc
+// Propagates a Nose Hoover chain a full timestep
 KERNEL void propagateNoseHooverChain(GLOBAL mixed2* RESTRICT chainData, GLOBAL const mixed2 * RESTRICT energySum, GLOBAL mixed2* RESTRICT scaleFactor,
                                     GLOBAL mixed* RESTRICT chainMasses, GLOBAL mixed* RESTRICT chainForces, int chainType, int chainLength, int numMTS,
                                     int numDOFs, float timeStep, mixed kT, float frequency){
@@ -15,30 +16,28 @@ KERNEL void propagateNoseHooverChain(GLOBAL mixed2* RESTRICT chainData, GLOBAL c
    for (int mts = 0; mts < numMTS; ++mts) {
        BEGIN_YS_LOOP
            mixed wdt = ys * timeOverMTS;
-            chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
+            chainData[chainLength-1].y += 0.5f * wdt * chainForces[chainLength-1];
            for (int bead = chainLength - 2; bead >= 0; --bead) {
-                mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y);
-                chainData[bead].y = aa * (chainData[bead].y * aa + 0.25f * wdt * chainForces[bead]);
+                mixed aa = exp(-0.25f * wdt * chainData[bead + 1].y);
+                chainData[bead].y = aa * (chainData[bead].y * aa + 0.5f * wdt * chainForces[bead]);
            }
            // update particle velocities
-            mixed aa = EXP(-0.5f * wdt * chainData[0].y);
-            scale *= aa;
+            scale *= (mixed) exp(-wdt * chainData[0].y);;
            // update the thermostat positions
            for (int bead = 0; bead < chainLength; ++bead) {
-                chainData[bead].x += 0.5f * chainData[bead].y * wdt;
+                chainData[bead].x += chainData[bead].y * wdt;
            }
            // update the forces
            chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0];
            // update thermostat velocities
            for (int bead = 0; bead < chainLength - 1; ++bead) {
-                mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y);
-                chainData[bead].y = aa * (aa * chainData[bead].y + 0.25f * wdt * chainForces[bead]);
+                mixed aa = exp(-0.25f * wdt * chainData[bead + 1].y);
+                chainData[bead].y = aa * (aa * chainData[bead].y + 0.5f * wdt * chainForces[bead]);
                chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
            }
-            chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
+            chainData[chainLength-1].y += 0.5f * wdt * chainForces[chainLength-1];
        END_YS_LOOP
    } // MTS loop
-
    if (chainType == 0) {
        scaleFactor[0].x = scale;
    } else {