[WIP] CPU: Refactored code to be generic across vector CPU platforms. (#2661)

* CPU: Refactored code to be generic across vector CPU platforms. Ewald and non-Ewald interactions now share a common code base, templated on their interaction type. The vec4 and vec8 implementations have been replaced by a single generic implementation class which is templated on SIMD type. Currently works for SIMD4 and SIMD8 types, but can be extended in future to support other types (e.g., AVX-512). Modified runtime CPU support to lay groundwork for future SIMD types. Pulled out some vector utility functions (gather pair, reduce), and refactored the AVX CPU code to make use of them. * CPU: Fixed coding standards and incorrect header include. * CPU: Fixed code review comments from PR #2661 * CPU: Fixed CI build issues. * CPU: Further CI fixes. * CPU: Fix for unit test failure on MacOS. Reverted optimised code to go back to a version which is thought to work on MacOS. The optimisation will be ...

[WIP] CPU: Refactored code to be generic across vector CPU platforms. (#2661)
* CPU: Refactored code to be generic across vector CPU platforms. Ewald and non-Ewald interactions now share a common code base, templated on their interaction type. The vec4 and vec8 implementations have been replaced by a single generic implementation class which is templated on SIMD type. Currently works for SIMD4 and SIMD8 types, but can be extended in future to support other types (e.g., AVX-512). Modified runtime CPU support to lay groundwork for future SIMD types. Pulled out some vector utility functions (gather pair, reduce), and refactored the AVX CPU code to make use of them. * CPU: Fixed coding standards and incorrect header include. * CPU: Fixed code review comments from PR #2661 * CPU: Fixed CI build issues. * CPU: Further CI fixes. * CPU: Fix for unit test failure on MacOS. Reverted optimised code to go back to a version which is thought to work on MacOS. The optimisation will be ...
91952b51 · dwtowner · GitHub · 01ead86e · 91952b51 · 91952b51
Unverified Commit 91952b51 authored Apr 30, 2020 by dwtowner Committed by GitHub Apr 30, 2020
16 changed files
--- a/openmmapi/include/openmm/internal/vectorize8.h
+++ b/openmmapi/include/openmm/internal/vectorize8.h
@@ -51,6 +51,18 @@ public:
    fvec8(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) : val(_mm256_set_ps(v8, v7, v6, v5, v4, v3, v2, v1)) {}
    fvec8(__m256 v) : val(v) {}
    fvec8(const float* v) : val(_mm256_loadu_ps(v)) {}
+    /** Create a vector by gathering individual indexes of data from a table. Element i of the vector will
+     * be loaded from table[idx[i]].
+     * @param table The table from which to do a lookup.
+     * @param indexes The indexes to gather.
+     */
+    fvec8(const float* table, const int idx[8]) {
+        // :TODO: Using int32_t explicitly as the index type could allow the real gather instruction to be used.
+        // Use gather and static assert? Conditional code?
+        val = _mm256_setr_ps(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]], table[idx[4]], table[idx[5]], table[idx[6]], table[idx[7]]);
+    }
    operator __m256() const {
        return val;
    }
@@ -115,6 +127,12 @@ public:
        return _mm256_cmp_ps(val, other, _CMP_LE_OQ);
    }
    operator ivec8() const;
+    /**
+     * Convert an integer bitmask into a full vector of elements which can be used
+     * by the blend function.
+     */
+    static fvec8 expandBitsToMask(int bitmask);
 };
 /**
@@ -160,6 +178,19 @@ inline ivec8::operator fvec8() const {
    return _mm256_cvtepi32_ps(val);
 }
+inline fvec8 fvec8::expandBitsToMask(int bitmask) {
+    // Put a copy of bit 0 in the first element, bit 1 in the second, and so on.
+    const auto expandedBits =
+      _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi8(bitmask)),
+                    _mm256_castsi256_ps(_mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128)));
+    // The individual bits are essentially extremely small floating-point values. By comparing against zero
+    //  (even a floating-point zero), the individual bits are turned into a complete element mask.
+    const auto elementMask = _mm256_cmp_ps(expandedBits, __m256(), _CMP_NEQ_OQ);
+    return elementMask;
+}
 // Functions that operate on fvec8s.
 static inline fvec8 floor(const fvec8& v) {
@@ -208,6 +239,11 @@ static inline float dot8(const fvec8& v1, const fvec8& v2) {
    return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());
 }
+static inline float reduceAdd(const fvec8 v) {
+    // :TODO: There are more efficient ways to do this.
+    return dot8(v, fvec8(1.0f));
+}
 static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in3, const fvec4& in4, const fvec4& in5, const fvec4& in6, const fvec4& in7, const fvec4& in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
    fvec4 i1 = in1, i2 = in2, i3 = in3, i4 = in4;
    fvec4 i5 = in5, i6 = in6, i7 = in7, i8 = in8;
@@ -231,6 +267,14 @@ static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in
    out4 = _mm256_insertf128_ps(out4, i8, 1);
 }
+/** Given a vec4[8] input array, generate 4 vec8 outputs. The first output contains all the first elements
+ * the second output the second elements, and so on. Note that the prototype is essentially differing only
+ * in output type so it can be overloaded in other SIMD fvec types.
+ */
+static inline void transpose(const fvec4 in[8], fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
+    transpose(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out1, out2, out3, out4);
+}
 static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) {
    out1 = in1.lowerVec();
    out2 = in2.lowerVec();
@@ -244,6 +288,13 @@ static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in
    _MM_TRANSPOSE4_PS(out5, out6, out7, out8);
 }
+/**
+ * Given 4 input vectors of 8 elements, transpose them to form 8 output vectors of 4 elements.
+ */
+static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4 out[8]) {
+    transpose(in1, in2, in3, in4, out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
+}
 // Functions that operate on ivec8s.
 static inline bool any(const ivec8& v) {
@@ -268,10 +319,97 @@ static inline fvec8 operator/(float v1, const fvec8& v2) {
    return fvec8(v1)/v2;
 }
-// Operations for blending fvec8s based on an ivec8.
+// Operation for blending fvec8 from a full bitmask.
+static inline fvec8 blend(const fvec8& v1, const fvec8& v2, const fvec8& mask) {
+    return fvec8(_mm256_blendv_ps(v1.val, v2.val, mask.val));
+}
+static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) {
+    return blend(0.0f, v, mask);
+}
-static inline fvec8 blend(const fvec8& v1, const fvec8& v2, const ivec8& mask) {
+/**
-    return fvec8(_mm256_blendv_ps(v1.val, v2.val, _mm256_castsi256_ps(mask.val)));
+ * Given a table of floating-point values and a set of indexes, perform a gather read into a pair
+ * of vectors. The first result vector contains the values at the given indexes, and the second
+ * result vector contains the values from each respective index+1.
+ */
+static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& out0, fvec8& out1) {
+    // Gather all the separate memory data together. Each vector will have two values
+    // which get used, and two which are ultimately discarded.
+    fvec4 t0(table + _mm256_extract_epi32(index, 0));
+    fvec4 t1(table + _mm256_extract_epi32(index, 1));
+    fvec4 t2(table + _mm256_extract_epi32(index, 2));
+    fvec4 t3(table + _mm256_extract_epi32(index, 3));
+    fvec4 t4(table + _mm256_extract_epi32(index, 4));
+    fvec4 t5(table + _mm256_extract_epi32(index, 5));
+    fvec4 t6(table + _mm256_extract_epi32(index, 6));
+    fvec4 t7(table + _mm256_extract_epi32(index, 7));
+    // Tranposing the 8 vectors above will put all the first elements into one output
+    // vector, all the second elements into the next vector and so on.
+    fvec8 discard0, discard1;
+    transpose(t0, t1, t2, t3, t4, t5, t6, t7, out0, out1, discard0, discard1);
+}
+/**
+ * Given 3 vectors of floating-point data, reduce them to a single 3-element position
+ * value by adding all the elements in each vector. Given inputs of:
+ *   X0 X1 X2 X3 X4 X5 X6 X7
+ *   Y0 Y1 Y2 Y3 Y4 Y5 Y6 Y7
+ *   Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7
+ * Each vector of values needs to be summed into a single value, and then stored into
+ * the output vector:
+ *   output[0] = (X0 + X1 + X2 + ...)
+ *   output[1] = (Y0 + Y1 + Y2 + ...)
+ *   output[2] = (Z0 + Z1 + Z2 + ...)
+ *   output[3] = undefined
+ */
+static inline fvec4 reduceToVec3(const fvec8 x, const fvec8 y, const fvec8 z) {
+    // The general strategy for a vector reduce-add operation is to take values from
+    // different parts of the vector and overlap them a different part of the vector and then
+    // add together. Repeat this several times until all values have been summed. Initially 8
+    // values can be reduced to 4, 4 to 2, and 2 to 1. The following code essentially does this
+    // but exploits two things:
+    //   - having multiple inputs means that some vectors can be combined together to amortise the
+    //     cost of shuffling.
+    //   - the output destinations are part of anther vector, so accumulate into the correct
+    //     offsets to start with, instead of reducing to position 0 and re-inserting to the correct
+    //     output location.
+    //
+    // As far as possible, accumulate x, y and z into their output positions in both the top and
+    // bottom 128-bits to exploit in-lane permutes as much as possible early on.
+    // Shuffle X and Z together to form one reduced vector.
+    //   X2 X3 Z0 Z1 X6 X7 Z4 Z5
+    const auto xzshuf = _mm256_shuffle_ps(x, z, 0b01001110);
+    // Blend X and Z together to form another reduced vector, overlapping the previous.
+    //   X0 X1 Z2 Z3 X4 X5 Z6 Z7
+    const auto xzblend = _mm256_blend_ps(x, z, 0b11001100);
+    // Add them together to form:
+    // (X0 + X2) (X1 + X3) (Z0 + Z2) (Z1 + Z3) etc.
+    const auto xz0 = _mm256_add_ps(xzshuf, xzblend);
+    // Now there's only one vector containing all values. Shuffle again to form another overlap,
+    // and then add.
+    const auto xz1 = _mm256_permute_ps(xz0, 0b00110001);
+    const auto xz2 = _mm256_add_ps(xz0, xz1);
+    // Work on Z on its own as there's nothing else to work with. Start by permuting it to
+    // form some overlaps, and then add:
+    //   (Y0 + Y2) (Y1 + Y3) - - (Y4 + Y6) (Y5 + Y7) - -
+    const auto yshuf = _mm256_permute_ps(y, 0b11101110);
+    const auto y0 = _mm256_add_ps(yshuf, y);
+    // Shift the bottom float of each pair to the right, into the correct Y location.
+    const auto y1 = _mm256_permute_ps(y0, 0b00000000);
+    const auto y2 = _mm256_add_ps(y0, y1);
+    // Blend the results together to give a complete set of XYZ in the correct respective positions
+    // of both top and bottom 128-bit lanes.
+    const auto laneResult = fvec8(_mm256_blend_ps(xz2, y2, 0b00100010));
+    return laneResult.lowerVec() + laneResult.upperVec();
 }
 #endif /*OPENMM_VECTORIZE8_H_*/
--- a/openmmapi/include/openmm/internal/vectorize_neon.h
+++ b/openmmapi/include/openmm/internal/vectorize_neon.h
@@ -85,6 +85,16 @@ public:
    operator float32x4_t() const {
        return val;
    }
+    /**
+      * Create a vector by gathering individual indexes of data from a table. Element i of the vector will
+      * be loaded from table[idx[i]].
+      * @param table The table from which to do a lookup.
+      * @param indexes The indexes to gather.
+      */
+    fvec4(const float* table, const int idx[4])
+        : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
    float operator[](int i) const {
        switch (i) {
            case 0:
@@ -101,6 +111,16 @@ public:
    void store(float* v) const {
        vst1q_f32(v, val);
    }
+    /**
+     * Store only the lower three elements of the vector.
+     */
+    void storeVec3(float* v) const {
+        v[0] = vgetq_lane_f32(val, 0);
+        v[1] = vgetq_lane_f32(val, 1);
+        v[2] = vgetq_lane_f32(val, 2);
+    }
    fvec4 operator+(const fvec4& other) const {
        return vaddq_f32(val, other);
    }
@@ -159,6 +179,13 @@ public:
        return vcvtq_f32_s32(vreinterpretq_s32_u32(vcleq_f32(val, other)));
    }
    operator ivec4() const;
+    /**
+     * Convert an integer bitmask into a full vector of elements which can be used
+     * by the blend function.
+     */
+    static ivec4 expandBitsToMask(int bitmask);
 };
 /**
@@ -254,6 +281,12 @@ inline ivec4::operator fvec4() const {
    return fvec4(vcvtq_f32_s32(val));
 }
+inline ivec4 fvec4::expandBitsToMask(int bitmask) {
+    return ivec4(bitmask & 1 ? -1 : 0,
+                 bitmask & 2 ? -1 : 0,
+                 bitmask & 4 ? -1 : 0,
+                 bitmask & 8 ? -1 : 0);
+}
 // Functions that operate on fvec4s.
 static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
@@ -297,6 +330,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
    return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
 }
+static inline float reduceAdd(const fvec4 v) {
+    return dot4(v, fvec4(1.0f));
+}
 static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
    return fvec4(v1[1]*v2[2] - v1[2]*v2[1],
                 v1[2]*v2[0] - v1[0]*v2[2],
@@ -314,6 +351,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
    v4 = t4.val[1];
 }
+/**
+ * Out-of-place transpose from an array into named variables.
+ */
+static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
+    v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
+    transpose(v0, v1, v2, v3);
+}
+/**
+ * Out-of-place transpose from named variables into an array.
+ */
+static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
+    transpose(out[0], out[1], out[2], out[3]);
+}
 // Functions that operate on ivec4s.
 static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
@@ -356,6 +409,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
    return vbslq_f32(vreinterpretq_u32_s32(mask), v2, v1);
 }
+static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
+    return blend(0.0f, v, mask);
+}
 // These are at the end since they involve other functions defined above.
 static inline fvec4 round(const fvec4& v) {
@@ -374,4 +431,38 @@ static inline fvec4 ceil(const fvec4& v) {
    return rounded + blend(0.0f, 1.0f, rounded<v);
 }
+/* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
+ * of vectors. The first result vector contains the values at the given indexes, and the second
+ * result vector contains the values from each respective index+1.
+ */
+static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+    fvec4 t0(table + index[0]);
+    fvec4 t1(table + index[1]);
+    fvec4 t2(table + index[2]);
+    fvec4 t3(table + index[3]);
+    transpose(t0, t1, t2, t3);
+    out0 = t0;
+    out1 = t1;
+}
+/**
+ * Given 3 vectors of floating-point data, reduce them to a single 3-element position
+ * value by adding all the elements in each vector. Given inputs of:
+ *   X0 X1 X2 X3
+ *   Y0 Y1 Y2 Y3
+ *   Z0 Z1 Z2 Z3
+ * Each vector of values needs to be summed into a single value, and then stored into
+ * the output vector:
+ *   output[0] = (X0 + X1 + X2 + X3)
+ *   output[1] = (Y0 + Y1 + Y2 + Y3)
+ *   output[2] = (Z0 + Z1 + Z2 + Z3)
+ *   output[3] = undefined
+ */
+static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+    const auto nx = reduceAdd(x);
+    const auto ny = reduceAdd(y);
+    const auto nz = reduceAdd(z);
+    return fvec4(nx, ny, nz, 0.0);
+}
 #endif /*OPENMM_VECTORIZE_NEON_H_*/
--- a/openmmapi/include/openmm/internal/vectorize_pnacl.h
+++ b/openmmapi/include/openmm/internal/vectorize_pnacl.h
@@ -67,6 +67,16 @@ public:
    fvec4(const float* v) {
        val = *((__m128*) v);
    }
+    /**
+     * Create a vector by gathering individual indexes of data from a table. Element i of the vector will
+     * be loaded from table[idx[i]].
+     * @param table The table from which to do a lookup.
+     * @param indexes The indexes to gather.
+     */
+    fvec4(const float* table, const int idx[4])
+        : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
    operator __m128() const {
        return val;
    }
@@ -76,6 +86,15 @@ public:
    void store(float* v) const {
        *((__m128*) v) = val;
    }
+    /**
+     * Store only the lower three elements of the vector.
+     */
+    void storeVec3(float* v) const {
+        v[0] = val[0];
+        v[1] = val[1];
+        v[2] = val[2];
+    }
    fvec4 operator+(const fvec4& other) const {
        return val+other;
    }
@@ -116,6 +135,13 @@ public:
    ivec4 operator>=(const fvec4& other) const;
    ivec4 operator<=(const fvec4& other) const;
    operator ivec4() const;
+    /**
+     * Convert an integer bitmask into a full vector of elements which can be used
+     * by the blend function.
+     */
+    static ivec4 expandBitsToMask(int bitmask);
 };
 /**
@@ -227,6 +253,13 @@ inline ivec4::operator fvec4() const {
    return __builtin_convertvector(val, __m128);
 }
+inline ivec4 fvec4::expandBitsToMask(int bitmask) {
+    return ivec4(bitmask & 1 ? -1 : 0,
+                 bitmask & 2 ? -1 : 0,
+                 bitmask & 4 ? -1 : 0,
+                 bitmask & 8 ? -1 : 0);
+}
 // Functions that operate on fvec4s.
 static inline fvec4 abs(const fvec4& v) {
@@ -252,6 +285,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
    return temp[0]+temp[1];
 }
+static inline float reduceAdd(const fvec4 v) {
+    return dot4(v, fvec4(1.0f));
+}
 static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
    __m128 temp = v2.val*__builtin_shufflevector(v1.val, v1.val, 2, 0, 1, 3) -
                  v1.val*__builtin_shufflevector(v2.val, v2.val, 2, 0, 1, 3);
@@ -269,6 +306,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
    v4 = __builtin_shufflevector(a2, a4, 2, 3, 6, 7);
 }
+/**
+ * Out-of-place transpose from an array into named variables.
+ */
+static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
+    v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
+    transpose(v0, v1, v2, v3);
+}
+/**
+ * Out-of-place transpose from named variables into an array.
+ */
+static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
+    transpose(out[0], out[1], out[2], out[3]);
+}
 // Functions that operate on ivec4s.
 static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
@@ -312,6 +365,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask)
    return (__m128) ((mask&(__m128i)v2) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1));
 }
+static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
+    return blend(0.0f, v, mask);
+}
 // These are at the end since they involve other functions defined above.
 static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
@@ -358,5 +415,40 @@ static inline fvec4 sqrt(const fvec4& v) {
    return rsqrt(v)*v;
 }
+/**
+ * Given a table of floating-point values and a set of indexes, perform a gather read into a pair
+ * of vectors. The first result vector contains the values at the given indexes, and the second
+ * result vector contains the values from each respective index+1.
+ */
+static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+    fvec4 t0(table + index[0]);
+    fvec4 t1(table + index[1]);
+    fvec4 t2(table + index[2]);
+    fvec4 t3(table + index[3]);
+    transpose(t0, t1, t2, t3);
+    out0 = t0;
+    out1 = t1;
+}
+/**
+ * Given 3 vectors of floating-point data, reduce them to a single 3-element position
+ * value by adding all the elements in each vector. Given inputs of:
+ *   X0 X1 X2 X3
+ *   Y0 Y1 Y2 Y3
+ *   Z0 Z1 Z2 Z3
+ * Each vector of values needs to be summed into a single value, and then stored into
+ * the output vector:
+ *   output[0] = (X0 + X1 + X2 + X3)
+ *   output[1] = (Y0 + Y1 + Y2 + Y3)
+ *   output[2] = (Z0 + Z1 + Z2 + Z3)
+ *   output[3] = undefined
+ */
+static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+    const auto nx = reduceAdd(x);
+    const auto ny = reduceAdd(y);
+    const auto nz = reduceAdd(z);
+    return fvec4(nx, ny, nz, 0.0);
+}
 #endif /*OPENMM_VECTORIZE_PNACL_H_*/
--- a/openmmapi/include/openmm/internal/vectorize_ppc.h
+++ b/openmmapi/include/openmm/internal/vectorize_ppc.h
@@ -68,6 +68,16 @@ public:
    fvec4(const float* v) {
        val = *((__m128*) v);
    }
+    /**
+      * Create a vector by gathering individual indexes of data from a table. Element i of the vector will
+      * be loaded from table[idx[i]].
+      * @param table The table from which to do a lookup.
+      * @param indexes The indexes to gather.
+      */
+    fvec4(const float* table, const int idx[4])
+        : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
    operator __m128() const {
        return val;
    }
@@ -77,6 +87,16 @@ public:
    void store(float* v) const {
        *((__m128*) v) = val;
    }
+    /**
+      * Store only the lower three elements of the vector.
+      */
+    void storeVec3(float* v) const {
+        v[0] = val[0];
+        v[1] = val[1];
+        v[2] = val[2];
+    }
    fvec4 operator+(const fvec4& other) const {
        return vec_add(val, other.val);
    }
@@ -117,6 +137,13 @@ public:
    ivec4 operator>=(const fvec4& other) const;
    ivec4 operator<=(const fvec4& other) const;
    operator ivec4() const;
+    /***
+      * Convert an integer bitmask into a full vector of elements which can be used
+      * by the blend function.
+      */
+    static ivec4 expandBitsToMask(int bitmask);
 };
 /**
@@ -228,6 +255,13 @@ inline ivec4::operator fvec4() const {
    return (__m128) {(float)val[0], (float)val[1], (float)val[2], (float)val[3]};
 }
+inline ivec4 fvec4::expandBitsToMask(int bitmask) {
+    return ivec4(bitmask & 1 ? -1 : 0,
+                 bitmask & 2 ? -1 : 0,
+                 bitmask & 4 ? -1 : 0,
+                 bitmask & 8 ? -1 : 0);
+}
 // Functions that operate on fvec4s.
 static inline fvec4 abs(const fvec4& v) {
@@ -253,6 +287,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
    return temp[0]+temp[1];
 }
+static inline float reduceAdd(const fvec4 v) {
+    return dot4(v, fvec4(1.0f));
+}
 static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
    vector unsigned char perm = (vector unsigned char) {8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15};
    __m128 temp = v2.val*vec_perm(v1.val, v1.val, perm) -
@@ -275,6 +313,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
    v4 = vec_perm(a2, a4, perm4);
 }
+/**
+ * Out-of-place transpose from an array into named variables.
+ */
+static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
+    v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
+    transpose(v0, v1, v2, v3);
+}
+/**
+ * Out-of-place transpose from named variables into an array.
+ */
+static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
+    transpose(out[0], out[1], out[2], out[3]);
+}
 // Functions that operate on ivec4s.
 static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
@@ -289,8 +343,8 @@ static inline ivec4 abs(const ivec4& v) {
    return vec_abs(v.val);
 }
-static inline bool any(const __m128i& v) {
+static inline bool any(const ivec4 v) {
-    return !vec_all_eq(v, ivec4(0).val);
+    return !vec_all_eq(v.val, ivec4(0).val);
 }
 // Mathematical operators involving a scalar and a vector.
@@ -317,6 +371,10 @@ static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask)
    return (__m128) ((mask&(__m128i)v2.val) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1.val).val);
 }
+static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
+    return blend(0.0f, v, mask);
+}
 // These are at the end since they involve other functions defined above.
 static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
@@ -355,5 +413,39 @@ static inline fvec4 sqrt(const fvec4& v) {
    return vec_sqrt(v.val);
 }
+/* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
+ * of vectors. The first result vector contains the values at the given indexes, and the second
+ * result vector contains the values from each respective index+1.
+ */
+static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+    fvec4 t0(table + index[0]);
+    fvec4 t1(table + index[1]);
+    fvec4 t2(table + index[2]);
+    fvec4 t3(table + index[3]);
+    transpose(t0, t1, t2, t3);
+    out0 = t0;
+    out1 = t1;
+}
+/**
+ * Given 3 vectors of floating-point data, reduce them to a single 3-element position
+ * value by adding all the elements in each vector. Given inputs of:
+ *   X0 X1 X2 X3
+ *   Y0 Y1 Y2 Y3
+ *   Z0 Z1 Z2 Z3
+ * Each vector of values needs to be summed into a single value, and then stored into
+ * the output vector:
+ *   output[0] = (X0 + X1 + X2 + X3)
+ *   output[1] = (Y0 + Y1 + Y2 + Y3)
+ *   output[2] = (Z0 + Z1 + Z2 + Z3)
+ *   output[3] = undefined
+ */
+static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+    const auto nx = reduceAdd(x);
+    const auto ny = reduceAdd(y);
+    const auto nz = reduceAdd(z);
+    return fvec4(nx, ny, nz, 0.0);
+}
 #endif /*OPENMM_VECTORIZE_PPC_H_*/
--- a/openmmapi/include/openmm/internal/vectorize_sse.h
+++ b/openmmapi/include/openmm/internal/vectorize_sse.h
@@ -32,7 +32,12 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */
+#ifdef __AVX__
+#include <immintrin.h>
+#else
 #include <smmintrin.h>
+#endif
 #include "hardware.h"
 // This file defines classes and functions to simplify vectorizing code with SSE.
@@ -68,6 +73,16 @@ public:
    fvec4(float v1, float v2, float v3, float v4) : val(_mm_set_ps(v4, v3, v2, v1)) {}
    fvec4(__m128 v) : val(v) {}
    fvec4(const float* v) : val(_mm_loadu_ps(v)) {}
+    /**
+     * Create a vector by gathering individual indexes of data from a table. Element i of the vector will
+     * be loaded from table[idx[i]].
+     * @param table The table from which to do a lookup.
+     * @param indexes The indexes to gather.
+     */
+    fvec4(const float* table, const int idx[4])
+        : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
    operator __m128() const {
        return val;
    }
@@ -79,6 +94,20 @@ public:
    void store(float* v) const {
        _mm_storeu_ps(v, val);
    }
+    /**
+     * Store only the lower three elements of the vector.
+     */
+    void storeVec3(float* v) const {
+        // This code could be called from objects compiled for better SIMD domains (e.g., AVX) so conditionally
+        // compile in the most efficient variant of the instruction.
+#ifdef  __AVX__
+        _mm_maskstore_ps(v, _mm_setr_epi32(-1, -1, -1, 0), val);
+#else
+        _mm_maskmoveu_si128 (_mm_castps_si128(val), _mm_setr_epi32(-1, -1, -1, 0), (char*)v);
+#endif
+    }
    fvec4 operator+(const fvec4& other) const {
        return _mm_add_ps(val, other);
    }
@@ -131,6 +160,12 @@ public:
        return _mm_cmple_ps(val, other);
    }
    operator ivec4() const;
+    /**
+     * Convert an integer bitmask into a full vector of elements which can be used
+     * by the blend function.
+     */
+    static fvec4 expandBitsToMask(int bitmask);
 };
 /**
@@ -214,6 +249,13 @@ inline ivec4::operator fvec4() const {
    return _mm_cvtepi32_ps(val);
 }
+inline fvec4 fvec4::expandBitsToMask(int bitmask) {
+    // Not optimal for SSE (see AVX implementation for better version)
+    // but useful as an example for other SIMD architectures.
+    const auto values = fvec4(bitmask & 1, bitmask & 2, bitmask & 4, bitmask & 8);
+    return values != fvec4(0.0f);
+}
 // Functions that operate on fvec4s.
 static inline fvec4 floor(const fvec4& v) {
@@ -273,6 +315,10 @@ static inline float dot4(const fvec4& v1, const fvec4& v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
 }
+static inline float reduceAdd(const fvec4 v) {
+    return dot4(v, fvec4(1.0f));
+}
 static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
    fvec4 temp = fvec4(_mm_mul_ps(v1, _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 0, 2, 1)))) -
                 fvec4(_mm_mul_ps(v2, _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1))));
@@ -283,6 +329,22 @@ static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
    _MM_TRANSPOSE4_PS(v1, v2, v3, v4);
 }
+/**
+ * Out-of-place transpose from an array into named variables.
+ */
+static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, fvec4& v3) {
+    v0 = in[0]; v1 = in[1]; v2 = in[2]; v3 = in[3];
+    transpose(v0, v1, v2, v3);
+}
+/**
+ * Out-of-place transpose from named variables into an array.
+ */
+static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
+    transpose(out[0], out[1], out[2], out[3]);
+}
 // Functions that operate on ivec4s.
 static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
@@ -319,10 +381,48 @@ static inline fvec4 operator/(float v1, const fvec4& v2) {
    return fvec4(v1)/v2;
 }
-// Operations for blending fvec4s based on an ivec4.
+// Operations for blending fvec4
+static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const fvec4& mask) {
+    return fvec4(_mm_blendv_ps(v1.val, v2.val, mask.val));
+}
-static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
+static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) {
-    return fvec4(_mm_blendv_ps(v1.val, v2.val, _mm_castsi128_ps(mask.val)));
+    return blend(0.0f, v, mask);
+}
+/* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
+ * of vectors. The first result vector contains the values at the given indexes, and the second
+ * result vector contains the values from each respective index+1.
+ */
+static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+    fvec4 t0(table + index[0]);
+    fvec4 t1(table + index[1]);
+    fvec4 t2(table + index[2]);
+    fvec4 t3(table + index[3]);
+    transpose(t0, t1, t2, t3);
+    out0 = t0;
+    out1 = t1;
+}
+/**
+ * Given 3 vectors of floating-point data, reduce them to a single 3-element position
+ * value by adding all the elements in each vector. Given inputs of:
+ *   X0 X1 X2 X3
+ *   Y0 Y1 Y2 Y3
+ *   Z0 Z1 Z2 Z3
+ * Each vector of values needs to be summed into a single value, and then stored into
+ * the output vector:
+ *   output[0] = (X0 + X1 + X2 + X3)
+ *   output[1] = (Y0 + Y1 + Y2 + Y3)
+ *   output[2] = (Z0 + Z1 + Z2 + Z3)
+ *   output[3] = undefined
+ */
+static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+    // :TODO: Could be made more efficient.
+    const auto nx = reduceAdd(x);
+    const auto ny = reduceAdd(y);
+    const auto nz = reduceAdd(z);
+    return fvec4(nx, ny, nz, 0.0);
 }
 #endif /*OPENMM_VECTORIZE_SSE_H_*/

--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -205,7 +205,7 @@ protected:
        static const float TWO_OVER_SQRT_PI;
        static const int NUM_TABLE_POINTS;
      /**---------------------------------------------------------------------------------------
         Calculate LJ Coulomb pair ixn between two atoms

--- a/platforms/cpu/include/CpuNonbondedForceFvec.h
+++ b/platforms/cpu/include/CpuNonbondedForceFvec.h
+/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+ * Contributors: Daniel Towner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef OPENMM_CPU_NONBONDED_FORCE_FVEC_H__
+#define OPENMM_CPU_NONBONDED_FORCE_FVEC_H__
+#include "CpuNonbondedForce.h"
+#include "openmm/internal/vectorize.h"
+#include "SimTKOpenMMUtilities.h"
+#include <algorithm>
+#include <vector>
+namespace OpenMM {
+enum BlockType {EWALD, NON_EWALD}; // :TODO: Better name for non-ewald.
+enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};
+/**
+ * Generic SIMD implementation of CpuNonbondedForce. The templating allows the same
+ * basic code to be reused for any sort of SIMD type, including SSE, AVX, AVX2, or
+ * AVX-512.
+ */
+template<typename FVEC>
+class CpuNonbondedForceFvec : public CpuNonbondedForce {
+public:
+    /**
+     * Store how many elements are contained in each block of atoms.
+     */
+    static constexpr int blockSize = sizeof(FVEC) / sizeof(float);
+protected:
+    /**---------------------------------------------------------------------------------------
+      Calculate all the interactions for one atom block. These are part of the virtual function interface
+      and consequently have names which explicitly call Ewald variant or not.
+      They internally call into the generic handler function below.
+      @param blockIndex       the index of the atom block
+      @param forces           force array (forces added)
+      @param totalEnergy      total energy
+      --------------------------------------------------------------------------------------- 
+      @{
+      */
+    void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+    void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+    /** @} */
+    /**---------------------------------------------------------------------------------------
+      Calculate all the interactions for one atom block. Identical to function prototypes above but
+      with an extra template parameter to choose whether to use Ewald processing or not.
+      --------------------------------------------------------------------------------------- */
+    template<BlockType BLOCK_TYPE>
+    void calculateBlockIxnHandler(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+    /**
+    * Templatized implementation of calculateBlockIxn. It can handle both Ewald and non-ewald interactions
+    * through a template parameter since the code is so similar for the two cases. Note also that the
+    * floating-point SIMD type is also templated to allow any suitable type to be used.
+    */
+    template <int PERIODIC_TYPE, BlockType BLOCK_TYPE>
+    void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
+    /**
+     * Compute the displacement and squared distance between a collection of points, optionally using
+     * periodic boundary conditions.
+     */
+    template <int PERIODIC_TYPE>
+    void getDeltaR(const fvec4& posI, const FVEC& x, const FVEC& y, const FVEC& z, FVEC& dx, FVEC& dy, FVEC& dz, FVEC& r2, const fvec4& boxSize, const fvec4& invBoxSize) const;
+    /**
+     * Compute an approximation of a function using a table lookup.
+     **/
+      FVEC approximateFunctionFromTable(const std::vector<float>& table, FVEC x, FVEC inverse) const;
+};
+/**
+ * Use a table lookup to approximate a function specific function.
+ */
+template<typename FVEC>
+FVEC
+CpuNonbondedForceFvec<FVEC>::approximateFunctionFromTable(const std::vector<float>& table,
+                                                          const FVEC x, const FVEC inverse) const {
+    // Compute the set of 8 index positions from which to gather the table data.
+    const auto x1 = x * inverse;
+    const auto index = min(floor(x1), float(NUM_TABLE_POINTS));
+    FVEC s1, s2;
+    gatherVecPair(table.data(), index, s1, s2);
+    const auto coeff2 = x1-FVEC(index);
+    const auto coeff1 = 1.0f-coeff2;
+    return coeff1*s1 + coeff2*s2;
+}
+template<typename FVEC>
+void CpuNonbondedForceFvec<FVEC>::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    calculateBlockIxnHandler<BlockType::NON_EWALD>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
+}
+template<typename FVEC>
+void CpuNonbondedForceFvec<FVEC>::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    calculateBlockIxnHandler<BlockType::EWALD>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
+}
+template<typename FVEC>
+template<BlockType BLOCK_TYPE>
+void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnHandler(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Determine whether we need to apply periodic boundary conditions.
+    PeriodicType periodicType;
+    fvec4 blockCenter;
+    if (!periodic) {
+        periodicType = NoPeriodic;
+        blockCenter = 0.0f;
+    }
+    else {
+        using std::min;
+        using std::max;
+        const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
+        float minx, maxx, miny, maxy, minz, maxz;
+        minx = maxx = posq[4*blockAtom[0]];
+        miny = maxy = posq[4*blockAtom[0]+1];
+        minz = maxz = posq[4*blockAtom[0]+2];
+        for (int i = 1; i < blockSize; i++) {
+            minx = min(minx, posq[4*blockAtom[i]]);
+            maxx = max(maxx, posq[4*blockAtom[i]]);
+            miny = min(miny, posq[4*blockAtom[i]+1]);
+            maxy = max(maxy, posq[4*blockAtom[i]+1]);
+            minz = min(minz, posq[4*blockAtom[i]+2]);
+            maxz = max(maxz, posq[4*blockAtom[i]+2]);
+        }
+        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
+        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
+                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
+            periodicType = NoPeriodic;
+        else if (triclinic)
+            periodicType = PeriodicTriclinic;
+        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
+                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
+                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
+            periodicType = PeriodicPerAtom;
+        else
+            periodicType = PeriodicPerInteraction;
+    }
+    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
+    if (periodicType == NoPeriodic)
+        calculateBlockIxnImpl<NoPeriodic, BLOCK_TYPE>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerAtom)
+        calculateBlockIxnImpl<PeriodicPerAtom, BLOCK_TYPE>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerInteraction)
+        calculateBlockIxnImpl<PeriodicPerInteraction, BLOCK_TYPE>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicTriclinic)
+        calculateBlockIxnImpl<PeriodicTriclinic, BLOCK_TYPE>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+}
+template<typename FVEC>
+template <int PERIODIC_TYPE, BlockType BLOCK_TYPE>
+void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
+    // Load the positions and parameters of the atoms in the block.
+    const int* blockAtom = &neighborList->getSortedAtoms()[blockSize * blockIndex];
+    fvec4 blockAtomPosq[blockSize];
+    FVEC blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
+    FVEC blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
+    for (int i = 0; i < blockSize; i++) {
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize; // :TODO: Apply one to blockAtom?
+    }
+    transpose(blockAtomPosq, blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
+    blockAtomCharge *= ONE_4PI_EPS0;
+    // Not the most efficient way to do this, but it works across all types we care about, and this isn't where
+    // the cycles are spent anyway.
+    FVEC blockAtomSigma = {};
+    FVEC blockAtomEpsilon = {};
+    for (int i=0; i<blockSize; ++i)
+    {
+        ((float*)&blockAtomSigma)[i] = atomParameters[blockAtom[i]].first;
+        ((float*)&blockAtomEpsilon)[i] = atomParameters[blockAtom[i]].second;
+    }
+    // Ewald needs C6 data gathered from a table. Unused variable for non-ewald.
+    const FVEC C6s = (BLOCK_TYPE == BlockType::EWALD) ? FVEC(C6params, blockAtom) : FVEC();
+    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
+    const FVEC cutoffDistanceSquared = cutoffDistance * cutoffDistance;
+    // Loop over neighbors for this block.
+    const auto& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        int atom = neighbors[i];
+        // Compute the distances to the block atoms.
+        FVEC dx, dy, dz, r2;
+        fvec4 atomPos(posq+4*atom);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
+        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, boxSize, invBoxSize);
+        const auto exclNotMask = FVEC::expandBitsToMask(~exclusions[i]);
+        const auto include = blendZero(r2 < cutoffDistance*cutoffDistance, exclNotMask);
+        if (!any(include))
+            continue; // No interactions to compute.
+        // Compute the interactions.
+        const auto inverseR = rsqrt(r2);
+        const auto r = r2*inverseR;
+        FVEC energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) {
+            const auto sig = blockAtomSigma+atomParameters[atom].first;
+            const auto sig2 = (inverseR*sig)*(inverseR*sig);
+            const auto sig6 = sig2*sig2*sig2;
+            const auto eps = blockAtomEpsilon*atomEpsilon;
+            const auto epsSig6 = eps*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                const auto t = blendZero((r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
+                const auto switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                const auto switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+            if (BLOCK_TYPE == BlockType::EWALD && ljpme) {
+                const auto C6ij = C6s*C6params[atom];
+                const auto inverseR2 = inverseR*inverseR;
+                const auto mysig2 = sig*sig;
+                const auto mysig6 = mysig2*mysig2*mysig2;
+                const auto emult = C6ij*inverseR2*inverseR2*inverseR2*approximateFunctionFromTable(exptermsTable, r, FVEC(exptermsDXInv));
+                const auto potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
+                dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*approximateFunctionFromTable(dExptermsTable, r, FVEC(exptermsDXInv));
+                energy += emult + potentialShift;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        const auto chargeProd = blockAtomCharge*posq[4*atom+3];
+        if (BLOCK_TYPE == BlockType::EWALD)
+        {
+            dEdR += chargeProd*inverseR*approximateFunctionFromTable(ewaldScaleTable, r, FVEC(ewaldDXInv));
+        }
+        else
+        {
+            if (cutoff)
+                dEdR += chargeProd*(inverseR-2.0f*krf*r2);
+            else
+                dEdR += chargeProd*inverseR;
+        }
+        dEdR *= inverseR*inverseR;
+        // Accumulate energies.
+        if (totalEnergy) {
+            if (BLOCK_TYPE == BlockType::EWALD)
+                energy += chargeProd*inverseR*approximateFunctionFromTable(erfcTable, alphaEwald*r, FVEC(erfcDXInv));
+            else // Non-ewald.
+            {
+                if (cutoff)
+                    energy += chargeProd*(inverseR+krf*r2-crf);
+                else
+                    energy += chargeProd*inverseR;
+            }
+            energy = blendZero(energy, include);
+            *totalEnergy += reduceAdd(energy);
+        }
+        // Accumulate forces.
+        dEdR = blendZero(dEdR, include);
+        const auto fx = dx*dEdR;
+        const auto fy = dy*dEdR;
+        const auto fz = dz*dEdR;
+        blockAtomForceX += fx;
+        blockAtomForceY += fy;
+        blockAtomForceZ += fz;
+        float* const atomForce = forces+4*atom;
+        const fvec4 newAtomForce = fvec4(atomForce) - reduceToVec3(fx, fy, fz);
+        newAtomForce.store(atomForce);
+    }
+    // Record the forces on the block atoms.
+    fvec4 f[blockSize];
+    transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f);
+    for (int j = 0; j < blockSize; j++)
+        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
+}
+template<typename FVEC>
+template <int PERIODIC_TYPE>
+void CpuNonbondedForceFvec<FVEC>::getDeltaR(const fvec4& posI, const FVEC& x, const FVEC& y, const FVEC& z, FVEC& dx, FVEC& dy, FVEC& dz, FVEC& r2, const fvec4& boxSize, const fvec4& invBoxSize) const {
+    dx = x-posI[0];
+    dy = y-posI[1];
+    dz = z-posI[2];
+    if (PERIODIC_TYPE == PeriodicTriclinic) {
+        const auto scale3 = floor(dz*recipBoxSize[2]+0.5f);
+        dx -= scale3*periodicBoxVectors[2][0];
+        dy -= scale3*periodicBoxVectors[2][1];
+        dz -= scale3*periodicBoxVectors[2][2];
+        const auto scale2 = floor(dy*recipBoxSize[1]+0.5f);
+        dx -= scale2*periodicBoxVectors[1][0];
+        dy -= scale2*periodicBoxVectors[1][1];
+        const auto scale1 = floor(dx*recipBoxSize[0]+0.5f);
+        dx -= scale1*periodicBoxVectors[0][0];
+    }
+    else if (PERIODIC_TYPE == PeriodicPerInteraction) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
+    }
+    r2 = dx*dx + dy*dy + dz*dz;
+}
+} // namespace OpenMM
+#endif // OPENMM_CPU_NONBONDED_FORCE_FVEC_H__
--- a/platforms/cpu/include/CpuNonbondedForceVec4.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec4.h
-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
- * Contributors: Pande Group
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject
- * to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-#ifndef OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
-#define OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
-#include "CpuNonbondedForce.h"
-// ---------------------------------------------------------------------------------------
-namespace OpenMM {
-class CpuNonbondedForceVec4 : public CpuNonbondedForce {
-public:
-      /**---------------------------------------------------------------------------------------
-         Constructor
-         --------------------------------------------------------------------------------------- */
-       CpuNonbondedForceVec4();
-protected:
-      /**---------------------------------------------------------------------------------------
-         Calculate all the interactions for one atom block.
-         @param blockIndex       the index of the atom block
-         @param forces           force array (forces added)
-         @param totalEnergy      total energy
-         --------------------------------------------------------------------------------------- */
-      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
-      /**
-       * Templatized implementation of calculateBlockIxn.
-       */
-      template <int PERIODIC_TYPE>
-      void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
-      /**---------------------------------------------------------------------------------------
-         Calculate all the interactions for one atom block.
-         @param blockIndex       the index of the atom block
-         @param forces           force array (forces added)
-         @param totalEnergy      total energy
-         --------------------------------------------------------------------------------------- */
-      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
-      /**
-       * Templatized implementation of calculateBlockEwaldIxn.
-       */
-      template <int PERIODIC_TYPE>
-      void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
-      /**
-       * Compute the displacement and squared distance between a collection of points, optionally using
-       * periodic boundary conditions.
-       */
-      template <int PERIODIC_TYPE>
-      void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
-      /**
-       * Compute a fast approximation to erfc(x).
-       */
-      fvec4 erfcApprox(const fvec4& x);
-      /**
-       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
-       */
-      fvec4 ewaldScaleFunction(const fvec4& x);
-      /**
-       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
-       * where dar = (dispersionAlpha * R)
-       * needed for LJPME energies.
-       */
-      fvec4 exptermsApprox(const fvec4& R);
-      /**
-       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
-       * where dar = (dispersionAlpha * R)
-       * needed for LJPME forces.
-       */
-      fvec4 dExptermsApprox(const fvec4& R);
-};
-} // namespace OpenMM
-// ---------------------------------------------------------------------------------------
-#endif // OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
--- a/platforms/cpu/include/CpuNonbondedForceVec8.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec8.h
-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
- * Contributors: Pande Group
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject
- * to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-#ifndef OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
-#define OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
-#include "CpuNonbondedForce.h"
-#ifdef __AVX__
-#include "openmm/internal/vectorize8.h"
-// ---------------------------------------------------------------------------------------
-namespace OpenMM {
-class CpuNonbondedForceVec8 : public CpuNonbondedForce {
-public:
-       CpuNonbondedForceVec8();
-protected:            
-      /**---------------------------------------------------------------------------------------
-         Calculate all the interactions for one atom block.
-         @param blockIndex       the index of the atom block
-         @param forces           force array (forces added)
-         @param totalEnergy      total energy
-         --------------------------------------------------------------------------------------- */
-      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
-      /**
-       * Templatized implementation of calculateBlockIxn.
-       */
-      template <int PERIODIC_TYPE>
-      void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
-      /**---------------------------------------------------------------------------------------
-         Calculate all the interactions for one atom block.
-         @param blockIndex       the index of the atom block
-         @param forces           force array (forces added)
-         @param totalEnergy      total energy
-         --------------------------------------------------------------------------------------- */
-      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
-      /**
-       * Templatized implementation of calculateBlockEwaldIxn.
-       */
-      template <int PERIODIC_TYPE>
-      void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
-      /**
-       * Compute the displacement and squared distance between a collection of points, optionally using
-       * periodic boundary conditions.
-       */
-      template <int PERIODIC_TYPE>
-      void getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
-      /**
-       * Compute a fast approximation to erfc(x).
-       */
-      fvec8 erfcApprox(const fvec8& x);
-      /**
-       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
-       */
-      fvec8 ewaldScaleFunction(const fvec8& x);
-      /**
-       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
-       * where dar = (dispersionAlpha * R)
-       * needed for LJPME energies.
-       */
-      fvec8 exptermsApprox(const fvec8& R);
-      /**
-       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
-       * where dar = (dispersionAlpha * R)
-       * needed for LJPME forces.
-       */
-      fvec8 dExptermsApprox(const fvec8& R);
-};
-} // namespace OpenMM
-// ---------------------------------------------------------------------------------------
-#endif // __AVX__
-#endif // OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -472,16 +472,11 @@ private:
    int numParticles;
 };
-bool isVec8Supported();
+CpuNonbondedForce* createCpuNonbondedForceVec();
-CpuNonbondedForce* createCpuNonbondedForceVec4();
-CpuNonbondedForce* createCpuNonbondedForceVec8();
 CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
        data(data), hasInitializedPme(false), hasInitializedDispersionPme(false), nonbonded(NULL) {
-    if (isVec8Supported())
+    nonbonded = createCpuNonbondedForceVec();
-        nonbonded = createCpuNonbondedForceVec8();
-    else
-        nonbonded = createCpuNonbondedForceVec4();
 }
 CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {

--- a/platforms/cpu/src/CpuNonbondedForceFvec.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceFvec.cpp
+/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+ * Contributors: Daniel Towner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "CpuNonbondedForceFvec.h"
+OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec4();
+OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec8();
+bool isVec8Supported();
+OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec() {
+    if (isVec8Supported())
+        return createCpuNonbondedForceVec8();
+    else
+        return createCpuNonbondedForceVec4();
+}
+int getVecBlockSize() {
+    if (isVec8Supported())
+        return 8;
+    else
+        return 4;
+}
--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
@@ -22,440 +22,11 @@
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
-#include "SimTKOpenMMUtilities.h"
+#include "CpuNonbondedForceFvec.h"
-#include "CpuNonbondedForceVec4.h"
-#include <algorithm>
-#include <iostream>
-using namespace std;
+// Very minimal file. It exists purely to be able to compile it in SIMD-4.
-using namespace OpenMM;
-/**
+OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec4()   {
- * Factory method to create a CpuNonbondedForceVec4.
+    return new OpenMM::CpuNonbondedForceFvec<fvec4>();
- */
-CpuNonbondedForce* createCpuNonbondedForceVec4() {
-    return new CpuNonbondedForceVec4();
-}
-/**---------------------------------------------------------------------------------------
-   CpuNonbondedForceVec4 constructor
-   --------------------------------------------------------------------------------------- */
-CpuNonbondedForceVec4::CpuNonbondedForceVec4() {
-}
-enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};
-void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    // Determine whether we need to apply periodic boundary conditions.
-    PeriodicType periodicType;
-    fvec4 blockCenter;
-    if (!periodic) {
-        periodicType = NoPeriodic;
-        blockCenter = 0.0f;
-    }
-    else {
-        const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
-        float minx, maxx, miny, maxy, minz, maxz;
-        minx = maxx = posq[4*blockAtom[0]];
-        miny = maxy = posq[4*blockAtom[0]+1];
-        minz = maxz = posq[4*blockAtom[0]+2];
-        for (int i = 1; i < 4; i++) {
-            minx = min(minx, posq[4*blockAtom[i]]);
-            maxx = max(maxx, posq[4*blockAtom[i]]);
-            miny = min(miny, posq[4*blockAtom[i]+1]);
-            maxy = max(maxy, posq[4*blockAtom[i]+1]);
-            minz = min(minz, posq[4*blockAtom[i]+2]);
-            maxz = max(maxz, posq[4*blockAtom[i]+2]);
-        }
-        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
-        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
-                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
-            periodicType = NoPeriodic;
-        else if (triclinic)
-            periodicType = PeriodicTriclinic;
-        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
-                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
-                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
-            periodicType = PeriodicPerAtom;
-        else
-            periodicType = PeriodicPerInteraction;
-    }
-    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
-    if (periodicType == NoPeriodic)
-        calculateBlockIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-    else if (periodicType == PeriodicPerAtom)
-        calculateBlockIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-    else if (periodicType == PeriodicPerInteraction)
-        calculateBlockIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-    else if (periodicType == PeriodicTriclinic)
-        calculateBlockIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-}
-template <int PERIODIC_TYPE>
-void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
-    // Load the positions and parameters of the atoms in the block.
-    const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
-    fvec4 blockAtomPosq[4];
-    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
-    for (int i = 0; i < 4; i++) {
-        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
-        if (PERIODIC_TYPE == PeriodicPerAtom)
-            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
-    }
-    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
-    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
-    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
-    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
-    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
-    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
-    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
-    // Loop over neighbors for this block.
-    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    for (int i = 0; i < (int) neighbors.size(); i++) {
-        // Load the next neighbor.
-        int atom = neighbors[i];
-        // Compute the distances to the block atoms.
-        fvec4 dx, dy, dz, r2;
-        fvec4 atomPos(posq+4*atom);
-        if (PERIODIC_TYPE == PeriodicPerAtom)
-            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
-        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
-        ivec4 include;
-        char excl = exclusions[i];
-        if (excl == 0)
-            include = -1;
-        else
-            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
-        include = include & (r2 < cutoffDistance*cutoffDistance);
-        if (!any(include))
-            continue; // No interactions to compute.
-        // Compute the interactions.
-        fvec4 inverseR = rsqrt(r2);
-        fvec4 energy, dEdR;
-        float atomEpsilon = atomParameters[atom].second;
-        if (atomEpsilon != 0.0f) {
-            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
-            fvec4 sig2 = inverseR*sig;
-            sig2 *= sig2;
-            fvec4 sig6 = sig2*sig2*sig2;
-            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
-            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
-            energy = epsSig6*(sig6-1.0f);
-            if (useSwitch) {
-                fvec4 r = r2*inverseR;
-                fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
-                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
-                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
-                dEdR = switchValue*dEdR - energy*switchDeriv*r;
-                energy *= switchValue;
-            }
-        }
-        else {
-            energy = 0.0f;
-            dEdR = 0.0f;
-        }
-        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
-        if (cutoff)
-            dEdR += chargeProd*(inverseR-2.0f*krf*r2);
-        else
-            dEdR += chargeProd*inverseR;
-        dEdR *= inverseR*inverseR;
-        // Accumulate energies.
-        fvec4 one(1.0f);
-        if (totalEnergy) {
-            if (cutoff)
-                energy += chargeProd*(inverseR+krf*r2-crf);
-            else
-                energy += chargeProd*inverseR;
-            energy = blend(0.0f, energy, include);
-            *totalEnergy += dot4(energy, one);
-        }
-        // Accumulate forces.
-        dEdR = blend(0.0f, dEdR, include);
-        fvec4 fx = dx*dEdR;
-        fvec4 fy = dy*dEdR;
-        fvec4 fz = dz*dEdR;
-        blockAtomForceX += fx;
-        blockAtomForceY += fy;
-        blockAtomForceZ += fz;
-        float* atomForce = forces+4*atom;
-        atomForce[0] -= dot4(fx, one);
-        atomForce[1] -= dot4(fy, one);
-        atomForce[2] -= dot4(fz, one);
-    }
-    // Record the forces on the block atoms.
-    fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
-    transpose(f[0], f[1], f[2], f[3]);
-    for (int j = 0; j < 4; j++)
-        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
-  }
-void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    // Determine whether we need to apply periodic boundary conditions.
-    PeriodicType periodicType;
-    fvec4 blockCenter;
-    if (!periodic) {
-        periodicType = NoPeriodic;
-        blockCenter = 0.0f;
-    }
-    else {
-        const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
-        float minx, maxx, miny, maxy, minz, maxz;
-        minx = maxx = posq[4*blockAtom[0]];
-        miny = maxy = posq[4*blockAtom[0]+1];
-        minz = maxz = posq[4*blockAtom[0]+2];
-        for (int i = 1; i < 4; i++) {
-            minx = min(minx, posq[4*blockAtom[i]]);
-            maxx = max(maxx, posq[4*blockAtom[i]]);
-            miny = min(miny, posq[4*blockAtom[i]+1]);
-            maxy = max(maxy, posq[4*blockAtom[i]+1]);
-            minz = min(minz, posq[4*blockAtom[i]+2]);
-            maxz = max(maxz, posq[4*blockAtom[i]+2]);
-        }
-        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
-        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
-                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
-            periodicType = NoPeriodic;
-        else if (triclinic)
-            periodicType = PeriodicTriclinic;
-        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
-                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
-                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
-            periodicType = PeriodicPerAtom;
-        else
-            periodicType = PeriodicPerInteraction;
-    }
-    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
-    if (periodicType == NoPeriodic)
-        calculateBlockEwaldIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-    else if (periodicType == PeriodicPerAtom)
-        calculateBlockEwaldIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-    else if (periodicType == PeriodicPerInteraction)
-        calculateBlockEwaldIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-    else if (periodicType == PeriodicTriclinic)
-        calculateBlockEwaldIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-}
-template <int PERIODIC_TYPE>
-void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
-    // Load the positions and parameters of the atoms in the block.
-    const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
-    fvec4 blockAtomPosq[4];
-    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
-    for (int i = 0; i < 4; i++) {
-        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
-        if (PERIODIC_TYPE == PeriodicPerAtom)
-            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
-    }
-    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
-    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
-    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
-    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
-    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
-    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-    fvec4 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]]);
-    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
-    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
-    // Loop over neighbors for this block.
-    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    for (int i = 0; i < (int) neighbors.size(); i++) {
-        // Load the next neighbor.
-        int atom = neighbors[i];
-        // Compute the distances to the block atoms.
-        fvec4 dx, dy, dz, r2;
-        fvec4 atomPos(posq+4*atom);
-        if (PERIODIC_TYPE == PeriodicPerAtom)
-            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
-        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
-        ivec4 include;
-        char excl = exclusions[i];
-        if (excl == 0)
-            include = -1;
-        else
-            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
-        include = include & (r2 < cutoffDistance*cutoffDistance);
-        if (!any(include))
-            continue; // No interactions to compute.
-        // Compute the interactions.
-        fvec4 inverseR = rsqrt(r2);
-        fvec4 r = r2*inverseR;
-        fvec4 energy, dEdR;
-        float atomEpsilon = atomParameters[atom].second;
-        if (atomEpsilon != 0.0f) {
-            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
-            fvec4 sig2 = inverseR*sig;
-            sig2 *= sig2;
-            fvec4 sig6 = sig2*sig2*sig2;
-            fvec4 eps = blockAtomEpsilon*atomEpsilon;
-            fvec4 epsSig6 = eps*sig6;
-            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
-            energy = epsSig6*(sig6-1.0f);
-            if (useSwitch) {
-                fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
-                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
-                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
-                dEdR = switchValue*dEdR - energy*switchDeriv*r;
-                energy *= switchValue;
-            }
-            if (ljpme) {
-                fvec4 C6ij = C6s*C6params[atom];
-                fvec4 inverseR2 = inverseR*inverseR;
-                fvec4 mysig2 = sig*sig;
-                fvec4 mysig6 = mysig2*mysig2*mysig2;
-                fvec4 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
-                fvec4 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
-                dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
-                energy += emult + potentialShift;
-            }
-        }
-        else {
-            energy = 0.0f;
-            dEdR = 0.0f;
-        }
-        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
-        dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
-        dEdR *= inverseR*inverseR;        
-        // Accumulate energies.
-        fvec4 one(1.0f);
-        if (totalEnergy) {
-            energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
-            energy = blend(0.0f, energy, include);
-            *totalEnergy += dot4(energy, one);
-        }
-        // Accumulate forces.
-        dEdR = blend(0.0f, dEdR, include);
-        fvec4 fx = dx*dEdR;
-        fvec4 fy = dy*dEdR;
-        fvec4 fz = dz*dEdR;
-        blockAtomForceX += fx;
-        blockAtomForceY += fy;
-        blockAtomForceZ += fz;
-        float* atomForce = forces+4*atom;
-        atomForce[0] -= dot4(fx, one);
-        atomForce[1] -= dot4(fy, one);
-        atomForce[2] -= dot4(fz, one);
-    }
-    // Record the forces on the block atoms.
-    fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
-    transpose(f[0], f[1], f[2], f[3]);
-    for (int j = 0; j < 4; j++)
-        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
-}
-template <int PERIODIC_TYPE>
-void CpuNonbondedForceVec4::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
-    dx = x-posI[0];
-    dy = y-posI[1];
-    dz = z-posI[2];
-    if (PERIODIC_TYPE == PeriodicTriclinic) {
-        fvec4 scale3 = floor(dz*recipBoxSize[2]+0.5f);
-        dx -= scale3*periodicBoxVectors[2][0];
-        dy -= scale3*periodicBoxVectors[2][1];
-        dz -= scale3*periodicBoxVectors[2][2];
-        fvec4 scale2 = floor(dy*recipBoxSize[1]+0.5f);
-        dx -= scale2*periodicBoxVectors[1][0];
-        dy -= scale2*periodicBoxVectors[1][1];
-        fvec4 scale1 = floor(dx*recipBoxSize[0]+0.5f);
-        dx -= scale1*periodicBoxVectors[0][0];
-    }
-    else if (PERIODIC_TYPE == PeriodicPerInteraction) {
-        dx -= round(dx*invBoxSize[0])*boxSize[0];
-        dy -= round(dy*invBoxSize[1])*boxSize[1];
-        dz -= round(dz*invBoxSize[2])*boxSize[2];
-    }
-    r2 = dx*dx + dy*dy + dz*dz;
-}
-fvec4 CpuNonbondedForceVec4::erfcApprox(const fvec4& x) {
-    fvec4 x1 = x*erfcDXInv;
-    ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
-    fvec4 coeff2 = x1-index;
-    fvec4 coeff1 = 1.0f-coeff2;
-    fvec4 t1(&erfcTable[index[0]]);
-    fvec4 t2(&erfcTable[index[1]]);
-    fvec4 t3(&erfcTable[index[2]]);
-    fvec4 t4(&erfcTable[index[3]]);
-    transpose(t1, t2, t3, t4);
-    return coeff1*t1 + coeff2*t2;
-}
-fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(const fvec4& x) {
-    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
-    fvec4 x1 = x*ewaldDXInv;
-    ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
-    fvec4 coeff2 = x1-index;
-    fvec4 coeff1 = 1.0f-coeff2;
-    fvec4 t1(&ewaldScaleTable[index[0]]);
-    fvec4 t2(&ewaldScaleTable[index[1]]);
-    fvec4 t3(&ewaldScaleTable[index[2]]);
-    fvec4 t4(&ewaldScaleTable[index[3]]);
-    transpose(t1, t2, t3, t4);
-    return coeff1*t1 + coeff2*t2;
-}
-fvec4 CpuNonbondedForceVec4::exptermsApprox(const fvec4& r) {
-    fvec4 r1 = r*exptermsDXInv;
-    ivec4 index = min(floor(r1), NUM_TABLE_POINTS);
-    fvec4 coeff2 = r1-index;
-    fvec4 coeff1 = 1.0f-coeff2;
-    fvec4 t1(&exptermsTable[index[0]]);
-    fvec4 t2(&exptermsTable[index[1]]);
-    fvec4 t3(&exptermsTable[index[2]]);
-    fvec4 t4(&exptermsTable[index[3]]);
-    transpose(t1, t2, t3, t4);
-    return coeff1*t1 + coeff2*t2;
-}
-fvec4 CpuNonbondedForceVec4::dExptermsApprox(const fvec4& r) {
-    fvec4 r1 = r*exptermsDXInv;
-    ivec4 index = min(floor(r1), NUM_TABLE_POINTS);
-    fvec4 coeff2 = r1-index;
-    fvec4 coeff1 = 1.0f-coeff2;
-    fvec4 t1(&dExptermsTable[index[0]]);
-    fvec4 t2(&dExptermsTable[index[1]]);
-    fvec4 t3(&dExptermsTable[index[2]]);
-    fvec4 t4(&dExptermsTable[index[3]]);
-    transpose(t1, t2, t3, t4);
-    return coeff1*t1 + coeff2*t2;
 }
--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -165,11 +165,14 @@ CpuPlatform::PlatformData::~PlatformData() {
        delete neighborList;
 }
-bool isVec8Supported();
+/**
+ * Return how much vectorisation is supported for host platform.
+ */
+int getVecBlockSize();
 void CpuPlatform::PlatformData::requestNeighborList(double cutoffDistance, double padding, bool useExclusions, const vector<set<int> >& exclusionList) {
    if (neighborList == NULL)
-        neighborList = new CpuNeighborList(isVec8Supported() ? 8 : 4);
+        neighborList = new CpuNeighborList(getVecBlockSize());
    if (cutoffDistance > cutoff)
        cutoff = cutoffDistance;
    if (cutoffDistance+padding > paddedCutoff)

--- a/tests/TestVectorize.cpp
+++ b/tests/TestVectorize.cpp
@@ -68,6 +68,14 @@ void testLoadStore() {
    ASSERT_EQUAL(i3[1], 3);
    ASSERT_EQUAL(i3[2], 4);
    ASSERT_EQUAL(i3[3], 5);
+    // Partial store of vec3 should not overwrite beyond the 3 elements.
+    float overwriteTest[4] = {9, 9, 9, 9};
+    f2.storeVec3(overwriteTest);
+    ASSERT_EQUAL(overwriteTest[0], f2[0]);
+    ASSERT_EQUAL(overwriteTest[1], f2[1]);
+    ASSERT_EQUAL(overwriteTest[2], f2[2]);
+    ASSERT_EQUAL(overwriteTest[3], 9);
 }
 void testArithmetic() {
@@ -160,15 +168,78 @@ void testMathFunctions() {
 }
 void testTranspose() {
-    fvec4 f1(1.0, 2.0, 3.0, 4.0);
+    fvec4 f[4] = {
-    fvec4 f2(5.0, 6.0, 7.0, 8.0);
+        {1.0, 2.0, 3.0, 4.0},
-    fvec4 f3(9.0, 10.0, 11.0, 12.0);
+        {5.0, 6.0, 7.0, 8.0},
-    fvec4 f4(13.0, 14.0, 15.0, 16.0);
+        {9.0, 10.0, 11.0, 12.0},
-    transpose(f1, f2, f3, f4);
+        {13.0, 14.0, 15.0, 16.0}
-    ASSERT_VEC4_EQUAL(f1, 1.0, 5.0, 9.0, 13.0);
+    };
-    ASSERT_VEC4_EQUAL(f2, 2.0, 6.0, 10.0, 14.0);
-    ASSERT_VEC4_EQUAL(f3, 3.0, 7.0, 11.0, 15.0);
+    // Out-of-place tranpose into specific variables. Done before in-place transpose test.
-    ASSERT_VEC4_EQUAL(f4, 4.0, 8.0, 12.0, 16.0);
+    fvec4 out0, out1, out2, out3;
+    transpose(f, out0, out1, out2, out3);
+    ASSERT_VEC4_EQUAL(out0, 1.0, 5.0, 9.0, 13.0);
+    ASSERT_VEC4_EQUAL(out1, 2.0, 6.0, 10.0, 14.0);
+    ASSERT_VEC4_EQUAL(out2, 3.0, 7.0, 11.0, 15.0);
+    ASSERT_VEC4_EQUAL(out3, 4.0, 8.0, 12.0, 16.0);
+    // In-place transpose. Done after the out-of-place transpose so avoid breaking that.
+    transpose(f[0], f[1], f[2], f[3]);
+    ASSERT_VEC4_EQUAL(f[0], 1.0, 5.0, 9.0, 13.0);
+    ASSERT_VEC4_EQUAL(f[1], 2.0, 6.0, 10.0, 14.0);
+    ASSERT_VEC4_EQUAL(f[2], 3.0, 7.0, 11.0, 15.0);
+    ASSERT_VEC4_EQUAL(f[3], 4.0, 8.0, 12.0, 16.0);
+    // Out-of-place transpose from named variables into an array.
+    fvec4 h[4];
+    fvec4 p0(0.1, 0.2, 0.3, 0.4);
+    fvec4 p1(0.5, 0.6, 0.7, 0.8);
+    fvec4 p2(0.9, 1.0, 1.1, 1.2);
+    fvec4 p3(1.3, 1.4, 1.5, 1.6);
+    transpose(p0, p1, p2, p3, h);
+    ASSERT_VEC4_EQUAL(h[0], 0.1, 0.5, 0.9, 1.3);
+    ASSERT_VEC4_EQUAL(h[1], 0.2, 0.6, 1.0, 1.4);
+    ASSERT_VEC4_EQUAL(h[2], 0.3, 0.7, 1.1, 1.5);
+    ASSERT_VEC4_EQUAL(h[3], 0.4, 0.8, 1.2, 1.6);
+}
+void testUtility() {
+    fvec4 f1(7, 2, -5, 13);
+    fvec4 f2(1, 2, 4, 7);
+    fvec4 f3(0.5, 1.0, 1.5, 2.0);
+    // Reduce-add across three vectors into a single vec3.
+    const auto computedVec3 = reduceToVec3(f1, f2, f3);
+    ASSERT_EQUAL(17, computedVec3[0]);
+    ASSERT_EQUAL(14, computedVec3[1]);
+    ASSERT_EQUAL(5,  computedVec3[2]);
+    // Gather values from a table. Variants for both one vector and two vector gathers are provided.
+    float table[2048];
+    for (int i=0; i<2048;++i)
+        table[i] = -i; // Same index to make it easy to debug, but negative to avoid copying idx.
+    // Single vector gather.
+    const int vidx[4] = {156, 1987, 33, 1003};
+    fvec4 g(table, vidx);
+    ASSERT_VEC4_EQUAL(g, -156, -1987, -33, -1003);
+    // Pair-wise vector gather.
+    fvec4 p0, p1;
+    gatherVecPair(table, ivec4(57, 105, 1976, 91), p0, p1);
+    ASSERT_VEC4_EQUAL(p0, -57, -105, -1976, -91);
+    ASSERT_VEC4_EQUAL(p1, -58, -106, -1977, -92);
+    // Verify building blend mask from integer. The mask isn't checked directly, as different platforms
+    // use different types of mask. Instead, check the side effect of using the mask in a blend.
+    const auto elements = fvec4(1, 2, 3, 4);
+    const auto maskZero = fvec4::expandBitsToMask(0);
+    ASSERT_VEC4_EQUAL_INT(blendZero(elements, maskZero), 0, 0, 0, 0);
+    const auto maskOne = fvec4::expandBitsToMask(0b1111);
+    ASSERT_VEC4_EQUAL_INT(blendZero(elements, maskOne), 1, 2, 3, 4);
+    const auto maskMix = fvec4::expandBitsToMask(0b1001);
+    ASSERT_VEC4_EQUAL_INT(blendZero(elements, maskMix), 1, 0, 0, 4);
 }
 int main(int argc, char* argv[]) {
@@ -183,6 +254,7 @@ int main(int argc, char* argv[]) {
        testComparisons();
        testMathFunctions();
        testTranspose();
+        testUtility();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/tests/TestVectorize8.cpp
+++ b/tests/TestVectorize8.cpp
@@ -100,6 +100,16 @@ void testLoadStore() {
    ASSERT_EQUAL(i3.upperVec()[1], 7);
    ASSERT_EQUAL(i3.upperVec()[2], 8);
    ASSERT_EQUAL(i3.upperVec()[3], 9);
+    // Partial store of vec3 should not overwrite beyond the 3 elements.
+    // Note that this is a fvec4 method, but is conditionally compiled for AVX so needs to be
+    // tested here too.
+    float overwriteTest[4] = {9, 9, 9, 9};
+    fvec4(1, 2, 3, 7777).storeVec3(overwriteTest);
+    ASSERT_EQUAL(overwriteTest[0], 1);
+    ASSERT_EQUAL(overwriteTest[1], 2);
+    ASSERT_EQUAL(overwriteTest[2], 3);
+    ASSERT_EQUAL(overwriteTest[3], 9);
 }
 void testArithmetic() {
@@ -185,32 +195,90 @@ void testMathFunctions() {
 }
 void testTranspose() {
-    fvec4 f1(0.0,   1.0,  2.0,  3.0);
+    fvec4 f[8] = {
-    fvec4 f2(10.0, 11.0, 12.0, 13.0);
+        {0.0,   1.0,  2.0,  3.0},
-    fvec4 f3(20.0, 21.0, 22.0, 23.0);
+        {10.0, 11.0, 12.0, 13.0},
-    fvec4 f4(30.0, 31.0, 32.0, 33.0);
+        {20.0, 21.0, 22.0, 23.0},
-    fvec4 f5(40.0, 41.0, 42.0, 43.0);
+        {30.0, 31.0, 32.0, 33.0},
-    fvec4 f6(50.0, 51.0, 52.0, 53.0);
+        {40.0, 41.0, 42.0, 43.0},
-    fvec4 f7(60.0, 61.0, 62.0, 63.0);
+        {50.0, 51.0, 52.0, 53.0},
-    fvec4 f8(70.0, 71.0, 72.0, 73.0);
+        {60.0, 61.0, 62.0, 63.0},
-    fvec8 o1, o2, o3, o4;
+        {70.0, 71.0, 72.0, 73.0}
+    };
-    transpose(f1, f2, f3, f4, f5, f6, f7, f8, o1, o2, o3, o4);
+    fvec8 o1, o2, o3, o4;
+    transpose(f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7], o1, o2, o3, o4);
    ASSERT_VEC8_EQUAL(o1, 0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0);
    ASSERT_VEC8_EQUAL(o2, 1.0, 11.0, 21.0, 31.0, 41.0, 51.0, 61.0, 71.0);
    ASSERT_VEC8_EQUAL(o3, 2.0, 12.0, 22.0, 32.0, 42.0, 52.0, 62.0, 72.0);
    ASSERT_VEC8_EQUAL(o4, 3.0, 13.0, 23.0, 33.0, 43.0, 53.0, 63.0, 73.0);
-    fvec4 g1, g2, g3, g4, g5, g6, g7, g8;
+    fvec8 q1, q2, q3, q4;
-    transpose(o1, o2, o3, o4, g1, g2, g3, g4, g5, g6, g7, g8);
+    transpose(f, q1, q2, q3, q4);
-    ASSERT_VEC4_EQUAL(g1, 0.0,   1.0,  2.0,  3.0);
+    ASSERT_VEC8_EQUAL(q1, 0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0);
-    ASSERT_VEC4_EQUAL(g2, 10.0, 11.0, 12.0, 13.0);
+    ASSERT_VEC8_EQUAL(q2, 1.0, 11.0, 21.0, 31.0, 41.0, 51.0, 61.0, 71.0);
-    ASSERT_VEC4_EQUAL(g3, 20.0, 21.0, 22.0, 23.0);
+    ASSERT_VEC8_EQUAL(q3, 2.0, 12.0, 22.0, 32.0, 42.0, 52.0, 62.0, 72.0);
-    ASSERT_VEC4_EQUAL(g4, 30.0, 31.0, 32.0, 33.0);
+    ASSERT_VEC8_EQUAL(q4, 3.0, 13.0, 23.0, 33.0, 43.0, 53.0, 63.0, 73.0);
-    ASSERT_VEC4_EQUAL(g5, 40.0, 41.0, 42.0, 43.0);
-    ASSERT_VEC4_EQUAL(g6, 50.0, 51.0, 52.0, 53.0);
+    fvec4 g[8];
-    ASSERT_VEC4_EQUAL(g7, 60.0, 61.0, 62.0, 63.0);
+    transpose(o1, o2, o3, o4, g[0], g[1], g[2], g[3], g[4], g[5], g[6], g[7]);
-    ASSERT_VEC4_EQUAL(g8, 70.0, 71.0, 72.0, 73.0);
+    ASSERT_VEC4_EQUAL(g[0], 0.0,   1.0,  2.0,  3.0);
+    ASSERT_VEC4_EQUAL(g[1], 10.0, 11.0, 12.0, 13.0);
+    ASSERT_VEC4_EQUAL(g[2], 20.0, 21.0, 22.0, 23.0);
+    ASSERT_VEC4_EQUAL(g[3], 30.0, 31.0, 32.0, 33.0);
+    ASSERT_VEC4_EQUAL(g[4], 40.0, 41.0, 42.0, 43.0);
+    ASSERT_VEC4_EQUAL(g[5], 50.0, 51.0, 52.0, 53.0);
+    ASSERT_VEC4_EQUAL(g[6], 60.0, 61.0, 62.0, 63.0);
+    ASSERT_VEC4_EQUAL(g[7], 70.0, 71.0, 72.0, 73.0);
+    fvec4 h[8];
+    transpose(o1, o2, o3, o4, h);
+    ASSERT_VEC4_EQUAL(h[0], 0.0,   1.0,  2.0,  3.0);
+    ASSERT_VEC4_EQUAL(h[1], 10.0, 11.0, 12.0, 13.0);
+    ASSERT_VEC4_EQUAL(h[2], 20.0, 21.0, 22.0, 23.0);
+    ASSERT_VEC4_EQUAL(h[3], 30.0, 31.0, 32.0, 33.0);
+    ASSERT_VEC4_EQUAL(h[4], 40.0, 41.0, 42.0, 43.0);
+    ASSERT_VEC4_EQUAL(h[5], 50.0, 51.0, 52.0, 53.0);
+    ASSERT_VEC4_EQUAL(h[6], 60.0, 61.0, 62.0, 63.0);
+    ASSERT_VEC4_EQUAL(h[7], 70.0, 71.0, 72.0, 73.0);
+}
+void testUtility() {
+    fvec8 f1(0.4, 1.9, -1.2, -3.8, 0.4, 1.9, -6.8, -3.8);
+    fvec8 f2(1, 2, 4, 7, 19, 31, 64, 5);
+    fvec8 f3(0.5, 1.0, 1.5, 2.0,   2.5, 3.0, 3.5, 4.0);
+    // Reduce-add across three vectors into a single vec3.
+    const auto computedVec3 = reduceToVec3(f1, f2, f3);
+    ASSERT_EQUAL(-11, computedVec3[0]);
+    ASSERT_EQUAL(133, computedVec3[1]);
+    ASSERT_EQUAL(18,  computedVec3[2]);
+    // Gather values from a table. Variants for both one vector and two vector gathers are provided.
+    float table[2048];
+    for (int i=0; i<2048;++i)
+        table[i] = -i; // Same index to make it easy to debug, but negative to avoid copying idx.
+    // Single vector gather.
+    const int vidx[8] = {4, 8, 156, 1987, 23, 65, 33, 1003};
+    fvec8 g(table, vidx);
+    ASSERT_VEC8_EQUAL(g, -4, -8, -156, -1987, -23, -65, -33, -1003);
+    // Pair-wise vector gather.
+    fvec8 p0, p1;
+    gatherVecPair(table, ivec8(57, 105, 1976, 91, 636, 1952, 345, 12), p0, p1);
+    ASSERT_VEC8_EQUAL(p0, -57, -105, -1976, -91, -636, -1952, -345, -12);
+    ASSERT_VEC8_EQUAL(p1, -58, -106, -1977, -92, -637, -1953, -346, -13);
+    // Verify building blend mask from integer. The mask isn't checked directly, as different platforms
+    // use different types of mask. Instead, check the side effect of using the mask in a blend.
+    const auto elements = fvec8(1, 2, 3, 4, 5, 6, 7, 8);
+    const auto maskZero = fvec8::expandBitsToMask(0);
+    ASSERT_VEC8_EQUAL_INT(blendZero(elements, maskZero), 0, 0, 0, 0, 0, 0, 0, 0);
+    const auto maskOne = fvec8::expandBitsToMask(0b11111111);
+    ASSERT_VEC8_EQUAL_INT(blendZero(elements, maskOne), 1, 2, 3, 4, 5, 6, 7, 8);
+    const auto maskMix = fvec8::expandBitsToMask(0b01101001);
+    ASSERT_VEC8_EQUAL_INT(blendZero(elements, maskMix), 1, 0, 0, 4, 0, 6, 7, 0);
 }
@@ -226,6 +294,7 @@ int main(int argc, char* argv[]) {
        testComparisons();
        testMathFunctions();
        testTranspose();
+        testUtility();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;