Conflict resolution in TestSplineFilter.cpp

0b5d58d7 · Charlles Abreu · 9026dbe7 · b0d13582 · 0b5d58d7 · 0b5d58d7
Commit 0b5d58d7 authored May 27, 2020 by Charlles Abreu
20 changed files
--- a/platforms/common/src/kernels/velocityVerlet.cc
+++ b/platforms/common/src/kernels/velocityVerlet.cc
 /**
- * Perform the first step of Velocity Verlet integration.
+ * Perform the first part of integration: velocity step.
 */
-
-KERNEL void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedNumAtoms, GLOBAL const mixed2* RESTRICT dt, GLOBAL const real4* RESTRICT posq,
-                                         GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL mixed4* RESTRICT posDelta,
-                                         GLOBAL const int* RESTRICT atomList, GLOBAL const int2* RESTRICT pairList
-#ifdef USE_MIXED_PRECISION
-                                        ,GLOBAL const real4* RESTRICT posqCorrection
-#endif
-    ){
-    const mixed2 stepSize = dt[0];
-    const mixed dtPos = stepSize.y;
-    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
-    const mixed scale = 0.5f * dtVel/(mixed) 0x100000000;
+KERNEL void integrateNoseHooverMiddlePart1(int numAtoms, int numPairs, int paddedNumAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force,
+        GLOBAL const mixed2* RESTRICT dt, GLOBAL const int* RESTRICT atomList, GLOBAL const int2* RESTRICT pairList) {
+    mixed fscale = dt[0].y/(mixed) 0x100000000;
    int index = GLOBAL_ID;
    while (index < numAtoms) {
        int atom = atomList[index];
        mixed4 velocity = velm[atom];
        if (velocity.w != 0.0) {
-#ifdef USE_MIXED_PRECISION
-            real4 pos1 = posq[atom];
-            real4 pos2 = posqCorrection[atom];
-            mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-            real4 pos = posq[atom];
-#endif
-            velocity.x += scale*force[atom]*velocity.w;
-            velocity.y += scale*force[atom+paddedNumAtoms]*velocity.w;
-            velocity.z += scale*force[atom+paddedNumAtoms*2]*velocity.w;
-            pos.x = velocity.x*dtPos;
-            pos.y = velocity.y*dtPos;
-            pos.z = velocity.z*dtPos;
-            posDelta[atom] = pos;
+            velocity.x += fscale*force[atom]*velocity.w;
+            velocity.y += fscale*force[atom+paddedNumAtoms]*velocity.w;
+            velocity.z += fscale*force[atom+paddedNumAtoms*2]*velocity.w;
            velm[atom] = velocity;
        }
        index += GLOBAL_SIZE;
@@ -58,12 +38,12 @@ KERNEL void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedN
        relVel.z= v2.z - v1.z;

        mixed3 comFrc;
-        mixed F1x = scale*force[atom1];
-        mixed F1y = scale*force[atom1+paddedNumAtoms];
-        mixed F1z = scale*force[atom1+paddedNumAtoms*2];
-        mixed F2x = scale*force[atom2];
-        mixed F2y = scale*force[atom2+paddedNumAtoms];
-        mixed F2z = scale*force[atom2+paddedNumAtoms*2];
+        mixed F1x = fscale*force[atom1];
+        mixed F1y = fscale*force[atom1+paddedNumAtoms];
+        mixed F1z = fscale*force[atom1+paddedNumAtoms*2];
+        mixed F2x = fscale*force[atom2];
+        mixed F2y = fscale*force[atom2+paddedNumAtoms];
+        mixed F2z = fscale*force[atom2+paddedNumAtoms*2];
        comFrc.x = F1x + F2x;
        comFrc.y = F1y + F2y;
        comFrc.z = F1z + F2z;
@@ -77,35 +57,16 @@ KERNEL void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedN
        relVel.x += relFrc.x * invRedMass;
        relVel.y += relFrc.y * invRedMass;
        relVel.z += relFrc.z * invRedMass;
-#ifdef USE_MIXED_PRECISION
-        real4 posv1 = posq[atom1];
-        real4 posv2 = posq[atom2];
-        real4 posc1 = posqCorrection[atom1];
-        real4 posc2 = posqCorrection[atom2];
-        mixed4 pos1 = make_mixed4(posv1.x+(mixed)posc1.x, posv1.y+(mixed)posc1.y, posv1.z+(mixed)posc1.z, posv1.w);
-        mixed4 pos2 = make_mixed4(posv2.x+(mixed)posc2.x, posv2.y+(mixed)posc2.y, posv2.z+(mixed)posc2.z, posv2.w);
-#else
-        real4 pos1 = posq[atom1];
-        real4 pos2 = posq[atom2];
-#endif
        if (v1.w != 0.0f) {
            v1.x = comVel.x - relVel.x*mass2fract;
            v1.y = comVel.y - relVel.y*mass2fract;
            v1.z = comVel.z - relVel.z*mass2fract;
-            pos1.x = v1.x*dtPos;
-            pos1.y = v1.y*dtPos;
-            pos1.z = v1.z*dtPos;
-            posDelta[atom1] = pos1;
            velm[atom1] = v1;
        }
        if (v2.w != 0.0f) {
            v2.x = comVel.x + relVel.x*mass1fract;
            v2.y = comVel.y + relVel.y*mass1fract;
            v2.z = comVel.z + relVel.z*mass1fract;
-            pos2.x = v2.x*dtPos;
-            pos2.y = v2.y*dtPos;
-            pos2.z = v2.z*dtPos;
-            posDelta[atom2] = pos2;
            velm[atom2] = v2;
        }
        index += GLOBAL_SIZE;
@@ -113,22 +74,60 @@ KERNEL void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedN
 }

 /**
- * Perform the second step of Velocity Verlet integration.
+ * Perform the second part of integration: position half step
+ */
+KERNEL void integrateNoseHooverMiddlePart2(int numAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL mixed4* RESTRICT posDelta,
+        GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed2* RESTRICT dt) {
+    mixed halfdt = 0.5f*dt[0].y; // half of dt[0].y; presumably .y is the current step size -- confirm against the host code
+    int index = GLOBAL_ID; // each work item starts at its own global id
+    while (index < numAtoms) {
+        mixed4 velocity = velm[index];
+        if (velocity.w != 0.0) { // entries with w == 0 are left untouched; NOTE(review): w looks like inverse mass -- confirm
+            mixed4 delta = make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0); // displacement over half a step, w set to 0
+            posDelta[index] = delta; // overwrites (does not accumulate) the pending displacement
+            oldDelta[index] = delta; // identical copy kept in oldDelta
+        }
+        index += GLOBAL_SIZE; // advance by the total number of work items until all atoms are covered
+    }
+}
+
+/**
+ * Perform the third part of integration: another position half step
 */
+KERNEL void integrateNoseHooverMiddlePart3(int numAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL mixed4* RESTRICT posDelta,
+        GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed2* RESTRICT dt) {
+    mixed halfdt = 0.5f*dt[0].y; // half of dt[0].y; presumably .y is the current step size -- confirm against the host code
+    int index = GLOBAL_ID; // each work item starts at its own global id
+    while (index < numAtoms) {
+        mixed4 velocity = velm[index]; // velocities are re-read here, so any update made since Part2 is picked up
+        if (velocity.w != 0.0) { // entries with w == 0 are left untouched; NOTE(review): w looks like inverse mass -- confirm
+            mixed4 delta = make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0); // displacement over half a step, w set to 0
+            posDelta[index] += delta; // accumulates onto the existing displacement (Part2 assigns instead)
+            oldDelta[index] += delta; // the same increment is applied to oldDelta
+        }
+        index += GLOBAL_SIZE; // advance by the total number of work items until all atoms are covered
+    }
+}

-KERNEL void integrateVelocityVerletPart2(int numAtoms, GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq, GLOBAL mixed4* RESTRICT velm,
-                                         GLOBAL const mixed4* RESTRICT posDelta
+/**
+ * Perform the fourth part of integration: apply constraint forces to velocities, then record
+ * the constrained positions.
+ */
+KERNEL void integrateNoseHooverMiddlePart4(int numAtoms, GLOBAL real4* RESTRICT posq, GLOBAL mixed4* RESTRICT velm,
+         GLOBAL mixed4* RESTRICT posDelta, GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed2* RESTRICT dt
 #ifdef USE_MIXED_PRECISION
-                                        ,GLOBAL real4* RESTRICT posqCorrection
+        , GLOBAL real4* RESTRICT posqCorrection
 #endif
-    ){
-    mixed2 stepSize = dt[0];
-    int index = GLOBAL_ID;
-    if (index == 0)
-        dt[0].x = stepSize.y;
-    while(index < numAtoms) {
+        ) {
+    mixed invDt = 1/dt[0].y;
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
+            mixed4 delta = posDelta[index];
+            velocity.x += (delta.x-oldDelta[index].x)*invDt;
+            velocity.y += (delta.y-oldDelta[index].y)*invDt;
+            velocity.z += (delta.z-oldDelta[index].z)*invDt;
+            velm[index] = velocity;
 #ifdef USE_MIXED_PRECISION
            real4 pos1 = posq[index];
            real4 pos2 = posqCorrection[index];
@@ -136,7 +135,6 @@ KERNEL void integrateVelocityVerletPart2(int numAtoms, GLOBAL mixed2* RESTRICT d
 #else
            real4 pos = posq[index];
 #endif
-            mixed4 delta = posDelta[index];
            pos.x += delta.x;
            pos.y += delta.y;
            pos.z += delta.z;
@@ -147,120 +145,10 @@ KERNEL void integrateVelocityVerletPart2(int numAtoms, GLOBAL mixed2* RESTRICT d
            posq[index] = pos;
 #endif
        }
-        index += GLOBAL_SIZE;
    }
 }

-/**
- * Perform the third step of Velocity Verlet integration.
- */
-
-KERNEL void integrateVelocityVerletPart3(int numAtoms, int numPairs, int paddedNumAtoms, GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq,
-                                         GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL const mixed4* RESTRICT posDelta,
-                                         GLOBAL const int* RESTRICT atomList, GLOBAL const int2* RESTRICT pairList
-#ifdef USE_MIXED_PRECISION
-                                        ,GLOBAL const real4* RESTRICT posqCorrection
-#endif
-    ){
-    mixed2 stepSize = dt[0];
-#ifdef SUPPORTS_DOUBLE_PRECISION
-    double oneOverDt = 1.0/stepSize.y;
-#else
-    float oneOverDt = 1.0f/stepSize.y;
-    float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
-#endif
-    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
-    const mixed scale = 0.5f*dtVel/(mixed) 0x100000000;
-    int index = GLOBAL_ID;
-    if (index == 0)
-        dt[0].x = stepSize.y;
-    while(index < numAtoms) {
-        int atom = atomList[index];
-        mixed4 velocity = velm[atom];
-        if (velocity.w != 0.0) {
-            mixed4 deltaXconstrained = posDelta[atom];
-            velocity.x += scale*force[atom]*velocity.w + (deltaXconstrained.x - velocity.x*stepSize.y)*oneOverDt;
-            velocity.y += scale*force[atom+paddedNumAtoms]*velocity.w + (deltaXconstrained.y - velocity.y*stepSize.y)*oneOverDt;
-            velocity.z += scale*force[atom+paddedNumAtoms*2]*velocity.w + (deltaXconstrained.z - velocity.z*stepSize.y)*oneOverDt;
-#ifndef SUPPORTS_DOUBLE_PRECISION
-            velocity.x += (deltaXconstrained.x - velocity.x*stepSize.y)*correction;
-            velocity.y += (deltaXconstrained.y - velocity.y*stepSize.y)*correction;
-            velocity.z += (deltaXconstrained.z - velocity.z*stepSize.y)*correction;
-#endif
-            velm[atom] = velocity;
-        }
-        index += GLOBAL_SIZE;
-    }
-    index = GLOBAL_ID;
-    while(index < numPairs) {
-        int atom1 = pairList[index].x;
-        int atom2 = pairList[index].y;
-        mixed4 v1 = velm[atom1];
-        mixed4 v2 = velm[atom2];
-        mixed m1 = v1.w == 0.0f ? 0.0f : 1.0f / v1.w;
-        mixed m2 = v2.w == 0.0f ? 0.0f : 1.0f / v2.w;
-        mixed mass1fract = m1 / (m1 + m2);
-        mixed mass2fract = m2 / (m1 + m2);
-        mixed invRedMass = (m1 * m2 != 0.0f) ? (m1 + m2)/(m1 * m2) : 0.0f;
-        mixed invTotMass = (m1 + m2 != 0.0f) ? 1.0f /(m1 + m2) : 0.0f;
-        mixed3 comVel;
-        comVel.x= v1.x*mass1fract + v2.x*mass2fract;
-        comVel.y= v1.y*mass1fract + v2.y*mass2fract;
-        comVel.z= v1.z*mass1fract + v2.z*mass2fract;
-        mixed3 relVel;
-        relVel.x= v2.x - v1.x;
-        relVel.y= v2.y - v1.y;
-        relVel.z= v2.z - v1.z;
-
-        mixed3 comFrc;
-        mixed F1x = scale*force[atom1];
-        mixed F1y = scale*force[atom1+paddedNumAtoms];
-        mixed F1z = scale*force[atom1+paddedNumAtoms*2];
-        mixed F2x = scale*force[atom2];
-        mixed F2y = scale*force[atom2+paddedNumAtoms];
-        mixed F2z = scale*force[atom2+paddedNumAtoms*2];
-        comFrc.x = F1x + F2x;
-        comFrc.y = F1y + F2y;
-        comFrc.z = F1z + F2z;
-        mixed3 relFrc;
-        relFrc.x = mass1fract*F2x - mass2fract*F1x;
-        relFrc.y = mass1fract*F2y - mass2fract*F1y;
-        relFrc.z = mass1fract*F2z - mass2fract*F1z;
-        comVel.x += comFrc.x * invTotMass;
-        comVel.y += comFrc.y * invTotMass;
-        comVel.z += comFrc.z * invTotMass;
-        relVel.x += relFrc.x * invRedMass;
-        relVel.y += relFrc.y * invRedMass;
-        relVel.z += relFrc.z * invRedMass;
-        if (v1.w != 0.0f) {
-            mixed4 deltaXconstrained = posDelta[atom1];
-            v1.x = comVel.x - relVel.x*mass2fract + (deltaXconstrained.x - v1.x*stepSize.y)*oneOverDt;
-            v1.y = comVel.y - relVel.y*mass2fract + (deltaXconstrained.y - v1.y*stepSize.y)*oneOverDt;
-            v1.z = comVel.z - relVel.z*mass2fract + (deltaXconstrained.z - v1.z*stepSize.y)*oneOverDt;
-#ifndef SUPPORTS_DOUBLE_PRECISION
-            v1.x += (deltaXconstrained.x - v1.x*stepSize.y)*correction;
-            v1.y += (deltaXconstrained.y - v1.y*stepSize.y)*correction;
-            v1.z += (deltaXconstrained.z - v1.z*stepSize.y)*correction;
-#endif
-            velm[atom1] = v1;
-        }
-        if (v2.w != 0.0f) {
-            mixed4 deltaXconstrained = posDelta[atom2];
-            v2.x = comVel.x + relVel.x*mass1fract + (deltaXconstrained.x - v2.x*stepSize.y)*oneOverDt;
-            v2.y = comVel.y + relVel.y*mass1fract + (deltaXconstrained.y - v2.y*stepSize.y)*oneOverDt;
-            v2.z = comVel.z + relVel.z*mass1fract + (deltaXconstrained.z - v2.z*stepSize.y)*oneOverDt;
-#ifndef SUPPORTS_DOUBLE_PRECISION
-            v2.x += (deltaXconstrained.x - v2.x*stepSize.y)*correction;
-            v2.y += (deltaXconstrained.y - v2.y*stepSize.y)*correction;
-            v2.z += (deltaXconstrained.z - v2.z*stepSize.y)*correction;
-#endif
-            velm[atom2] = v2;
-        }
-        index += GLOBAL_SIZE;
-    }
-}
-
-KERNEL void integrateVelocityVerletHardWall(int numPairs, GLOBAL const float* RESTRICT maxPairDistance, 
+KERNEL void integrateNoseHooverHardWall(int numPairs, GLOBAL const float* RESTRICT maxPairDistance, 
                                        GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq,
                                        GLOBAL mixed4* RESTRICT velm, GLOBAL const int2* RESTRICT pairList,
                                        GLOBAL const float* RESTRICT pairTemperature
@@ -370,4 +258,3 @@ KERNEL void integrateVelocityVerletHardWall(int numPairs, GLOBAL const float* RE
        }
    }
 }
-
--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
@@ -53,7 +53,15 @@ public:
    int getBlockSize() const;
    const std::vector<int>& getSortedAtoms() const;
    const std::vector<int>& getBlockNeighbors(int blockIndex) const;
-    const std::vector<char>& getBlockExclusions(int blockIndex) const;
+
+    /**
+     * Bitset for a single block, marking which indexes should be excluded. This data type needs to be big
+     * enough to store all the bits for any possible block size.
+     */
+    using BlockExclusionMask = int16_t;
+
+    const std::vector<BlockExclusionMask>& getBlockExclusions(int blockIndex) const;
+
    /**
     * This routine contains the code executed by each thread.
     */
@@ -64,7 +72,7 @@ private:
    std::vector<int> sortedAtoms;
    std::vector<float> sortedPositions;
    std::vector<std::vector<int> > blockNeighbors;
-    std::vector<std::vector<char> > blockExclusions;
+    std::vector<std::vector<BlockExclusionMask> > blockExclusions;
    // The following variables are used to make information accessible to the individual threads.
    float minx, maxx, miny, maxy, minz, maxz;
    std::vector<std::pair<int, int> > atomBins;

--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
--- a/platforms/cpu/include/CpuNonbondedForceFvec.h
+++ b/platforms/cpu/include/CpuNonbondedForceFvec.h
+
+/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+ * Contributors: Daniel Towner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef OPENMM_CPU_NONBONDED_FORCE_FVEC_H__
+#define OPENMM_CPU_NONBONDED_FORCE_FVEC_H__
+
+#include "CpuNonbondedForce.h"
+#include "openmm/internal/vectorize.h"
+
+#include "SimTKOpenMMUtilities.h"
+
+#include <algorithm>
+#include <vector>
+
+namespace OpenMM {
+
+enum BlockType {EWALD, NON_EWALD}; // Template selector for the Ewald vs. non-Ewald code path. :TODO: Better name for non-ewald.
+enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic}; // How periodic boundaries are handled for a block; chosen per block in calculateBlockIxnHandler.
+
+/**
+ * Generic SIMD implementation of CpuNonbondedForce. The templating allows the same
+ * basic code to be reused for any sort of SIMD type, including SSE, AVX, AVX2, or
+ * AVX-512.
+ *
+ * FVEC is the floating-point SIMD vector type. The number of atoms processed together
+ * in one block is derived from its size (see blockSize).
+ */
+template<typename FVEC>
+class CpuNonbondedForceFvec : public CpuNonbondedForce {
+public:
+    /**
+     * Number of atoms contained in each block: the number of floats that fit in one FVEC.
+     */
+    static constexpr int blockSize = sizeof(FVEC) / sizeof(float);
+
+protected:
+    /**---------------------------------------------------------------------------------------
+      Calculate all the interactions for one atom block. These are part of the virtual function interface
+      and consequently have names which state explicitly whether they are the Ewald variant or not.
+      They internally call into the generic handler function below.
+      @param blockIndex       the index of the atom block
+      @param forces           force array (forces added)
+      @param totalEnergy      total energy (energy is only accumulated when this pointer is non-null)
+      @param boxSize          the periodic box size
+      @param invBoxSize       the inverse of the periodic box size
+      --------------------------------------------------------------------------------------- 
+      @{
+      */
+    void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+    void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+    /** @} */
+
+    /**---------------------------------------------------------------------------------------
+      Calculate all the interactions for one atom block. Identical to function prototypes above but
+      with an extra template parameter to choose whether to use Ewald processing or not.
+      This function decides which kind of periodic boundary handling the block needs and then
+      dispatches to the matching instantiation of calculateBlockIxnImpl.
+      --------------------------------------------------------------------------------------- */
+    template<BlockType BLOCK_TYPE>
+    void calculateBlockIxnHandler(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+
+    /**
+    * Templatized implementation of calculateBlockIxn. It handles both Ewald and non-Ewald interactions
+    * through the BLOCK_TYPE template parameter, since the code is so similar for the two cases, and
+    * selects the periodic boundary treatment through PERIODIC_TYPE (a PeriodicType value). The
+    * floating-point SIMD type is the class template parameter, so any suitable type can be used.
+    *
+    * @param blockCenter   reference point used to wrap positions when PERIODIC_TYPE is PeriodicPerAtom
+    */
+    template <int PERIODIC_TYPE, BlockType BLOCK_TYPE>
+    void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
+
+    /**
+     * Compute the displacement and squared distance between a collection of points, optionally using
+     * periodic boundary conditions.
+     *
+     * @param posI          the single position that is subtracted from every point
+     * @param x, y, z       coordinates of the collection of points, one point per vector element
+     * @param dx, dy, dz    on exit, the components of the displacement (point minus posI)
+     * @param r2            on exit, the squared length of each displacement
+     */
+    template <int PERIODIC_TYPE>
+    void getDeltaR(const fvec4& posI, const FVEC& x, const FVEC& y, const FVEC& z, FVEC& dx, FVEC& dy, FVEC& dz, FVEC& r2, const fvec4& boxSize, const fvec4& invBoxSize) const;
+
+    /**
+     * Compute an approximation of a function using a table lookup.
+     *
+     * @param table     the tabulated function values
+     * @param x         the points at which to evaluate the function
+     * @param inverse   factor multiplied with x to convert it into a table coordinate
+     **/
+      FVEC approximateFunctionFromTable(const std::vector<float>& table, FVEC x, FVEC inverse) const;
+
+};
+
+/**
+ * Use a table lookup to approximate a tabulated function, interpolating linearly between
+ * two table entries for every vector element.
+ *
+ * @param table     the tabulated function values
+ * @param x         the points at which to evaluate the function
+ * @param inverse   factor multiplied with x to obtain the (fractional) table coordinate;
+ *                  presumably the reciprocal of the table spacing -- confirm against the callers
+ * @return the weighted sum of the two gathered table values for each element
+ */
+template<typename FVEC>
+FVEC
+CpuNonbondedForceFvec<FVEC>::approximateFunctionFromTable(const std::vector<float>& table,
+                                                          const FVEC x, const FVEC inverse) const {
+    // Compute the per-element index positions (capped at NUM_TABLE_POINTS) from which to gather the table data.
+    const auto x1 = x * inverse;
+    const auto index = min(floor(x1), float(NUM_TABLE_POINTS));
+
+    FVEC s1, s2;
+    gatherVecPair(table.data(), index, s1, s2); // presumably loads table[index] into s1 and table[index+1] into s2 -- see vectorize.h
+
+    const auto coeff2 = x1-FVEC(index); // distance of x1 above the chosen index: weight of the second entry
+    const auto coeff1 = 1.0f-coeff2; // complementary weight of the first entry
+
+    return coeff1*s1 + coeff2*s2;
+}
+
+template<typename FVEC>
+void CpuNonbondedForceFvec<FVEC>::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    calculateBlockIxnHandler<BlockType::NON_EWALD>(blockIndex, forces, totalEnergy, boxSize, invBoxSize); // non-Ewald entry point: forwards all arguments unchanged to the shared handler
+}
+
+template<typename FVEC>
+void CpuNonbondedForceFvec<FVEC>::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    calculateBlockIxnHandler<BlockType::EWALD>(blockIndex, forces, totalEnergy, boxSize, invBoxSize); // Ewald entry point: forwards all arguments unchanged to the shared handler
+}
+
+template<typename FVEC>
+template<BlockType BLOCK_TYPE>
+void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnHandler(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Determine whether we need to apply periodic boundary conditions, and if so which kind.
+
+    PeriodicType periodicType;
+    fvec4 blockCenter;
+    if (!periodic) {
+        periodicType = NoPeriodic;
+        blockCenter = 0.0f; // not used for wrapping in this mode; just gives the variable a defined value
+    }
+    else {
+        using std::min;
+        using std::max;
+
+        const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex]; // first of the blockSize atom indices belonging to this block
+        float minx, maxx, miny, maxy, minz, maxz; // axis-aligned bounding box of the block's atoms
+        minx = maxx = posq[4*blockAtom[0]]; // posq is read with a stride of 4 floats per atom: x, y, z at offsets 0, 1, 2
+        miny = maxy = posq[4*blockAtom[0]+1];
+        minz = maxz = posq[4*blockAtom[0]+2];
+        for (int i = 1; i < blockSize; i++) {
+            minx = min(minx, posq[4*blockAtom[i]]);
+            maxx = max(maxx, posq[4*blockAtom[i]]);
+            miny = min(miny, posq[4*blockAtom[i]+1]);
+            maxy = max(maxy, posq[4*blockAtom[i]+1]);
+            minz = min(minz, posq[4*blockAtom[i]+2]);
+            maxz = max(maxz, posq[4*blockAtom[i]+2]);
+        }
+        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f); // midpoint of the bounding box
+        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
+                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
+            periodicType = NoPeriodic; // the whole bounding box lies at least cutoffDistance inside [0, boxSize] on every axis
+        else if (triclinic)
+            periodicType = PeriodicTriclinic;
+        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
+                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
+                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
+            periodicType = PeriodicPerAtom; // half of the box left over beside the block is at least cutoffDistance on every axis
+        else
+            periodicType = PeriodicPerInteraction; // fallback: wrap every individual displacement
+    }
+    
+    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
+    if (periodicType == NoPeriodic)
+        calculateBlockIxnImpl<NoPeriodic, BLOCK_TYPE>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerAtom)
+        calculateBlockIxnImpl<PeriodicPerAtom, BLOCK_TYPE>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerInteraction)
+        calculateBlockIxnImpl<PeriodicPerInteraction, BLOCK_TYPE>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicTriclinic)
+        calculateBlockIxnImpl<PeriodicTriclinic, BLOCK_TYPE>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+}
+
+template<typename FVEC>
+template <int PERIODIC_TYPE, BlockType BLOCK_TYPE>
+void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
+    // Load the positions and parameters of the atoms in the block.
+
+    const int* blockAtom = &neighborList->getSortedAtoms()[blockSize * blockIndex]; // first of the blockSize atom indices belonging to this block
+    fvec4 blockAtomPosq[blockSize];
+    FVEC blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f); // per-block-atom force accumulators, written back after the neighbor loop
+    FVEC blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
+    for (int i = 0; i < blockSize; i++) {
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]); // 4 floats per atom in posq
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize; // :TODO: Apply one to blockAtom?
+    }
+
+    transpose(blockAtomPosq, blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge); // presumably splits the per-atom fvec4 values into one vector per component -- see vectorize.h
+    blockAtomCharge *= ONE_4PI_EPS0; // fold the constant into the block charges once, so chargeProd below already contains it
+
+    // Not the most efficient way to do this, but it works across all types we care about, and this isn't where
+    // the cycles are spent anyway.
+    FVEC blockAtomSigma = {};
+    FVEC blockAtomEpsilon = {};
+    for (int i=0; i<blockSize; ++i)
+    {
+        ((float*)&blockAtomSigma)[i] = atomParameters[blockAtom[i]].first; // writes element i of the vector through a float pointer
+        ((float*)&blockAtomEpsilon)[i] = atomParameters[blockAtom[i]].second;
+    }
+
+    // Ewald needs C6 data gathered from a table. Unused variable for non-ewald.
+    const FVEC C6s = (BLOCK_TYPE == BlockType::EWALD) ? FVEC(C6params, blockAtom) : FVEC();
+
+    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic); // NOTE(review): not referenced again in this function
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
+    const FVEC cutoffDistanceSquared = cutoffDistance * cutoffDistance; // NOTE(review): not referenced again; the cutoff test below recomputes the product
+
+    // Loop over neighbors for this block.
+    const auto& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const auto& exclusions = neighborList->getBlockExclusions(blockIndex); // one exclusion bitmask per neighbor, indexed like neighbors
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        
+        int atom = neighbors[i];
+        
+        // Compute the distances to the block atoms.
+        
+        FVEC dx, dy, dz, r2;
+        fvec4 atomPos(posq+4*atom);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize; // same wrapping relative to blockCenter as applied to the block atoms above
+        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, boxSize, invBoxSize);
+
+        const auto exclNotMask = FVEC::expandBitsToMask(~exclusions[i]); // bits are inverted first, so the mask presumably marks the elements that are NOT excluded
+        const auto include = blendZero(r2 < cutoffDistance*cutoffDistance, exclNotMask); // elements that are inside the cutoff and not excluded
+        if (!any(include))
+            continue; // No interactions to compute.
+
+        // Compute the interactions.
+        const auto inverseR = rsqrt(r2);
+        const auto r = r2*inverseR; // r obtained as r2 * (1/r), reusing the reciprocal square root
+        FVEC energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) { // the Lennard-Jones part is skipped when the neighbor's epsilon is zero
+            const auto sig = blockAtomSigma+atomParameters[atom].first; // combined sigma: sum of the two stored values
+            const auto sig2 = (inverseR*sig)*(inverseR*sig);
+            const auto sig6 = sig2*sig2*sig2; // (sig/r)^6
+            const auto eps = blockAtomEpsilon*atomEpsilon; // combined epsilon: product of the two stored values
+            const auto epsSig6 = eps*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                const auto t = blendZero((r-switchingDistance)*invSwitchingInterval, r>switchingDistance); // presumably zero for r <= switchingDistance
+                const auto switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f)); // 1 - 10t^3 + 15t^4 - 6t^5
+                const auto switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval; // derivative of switchValue with respect to r
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+            if (BLOCK_TYPE == BlockType::EWALD && ljpme) {
+                const auto C6ij = C6s*C6params[atom];
+                const auto inverseR2 = inverseR*inverseR;
+                const auto mysig2 = sig*sig;
+                const auto mysig6 = mysig2*mysig2*mysig2; // sig^6 (not divided by r, unlike sig6 above)
+                const auto emult = C6ij*inverseR2*inverseR2*inverseR2*approximateFunctionFromTable(exptermsTable, r, FVEC(exptermsDXInv));
+                const auto potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
+                dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*approximateFunctionFromTable(dExptermsTable, r, FVEC(exptermsDXInv));
+                energy += emult + potentialShift;
+            }
+
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        const auto chargeProd = blockAtomCharge*posq[4*atom+3]; // offset 3 of an atom's posq entry is used as its charge
+        if (BLOCK_TYPE == BlockType::EWALD)
+        {
+            dEdR += chargeProd*inverseR*approximateFunctionFromTable(ewaldScaleTable, r, FVEC(ewaldDXInv));
+        }
+        else
+        {
+            if (cutoff)
+                dEdR += chargeProd*(inverseR-2.0f*krf*r2);
+            else
+                dEdR += chargeProd*inverseR;
+        }
+        dEdR *= inverseR*inverseR; // divide by r^2 so that multiplying by (dx, dy, dz) below yields force components
+
+        // Accumulate energies.
+        if (totalEnergy) { // energy terms are only evaluated when the caller passed a non-null pointer
+            if (BLOCK_TYPE == BlockType::EWALD)
+                energy += chargeProd*inverseR*approximateFunctionFromTable(erfcTable, alphaEwald*r, FVEC(erfcDXInv));
+            else // Non-ewald.
+            {
+                if (cutoff)
+                    energy += chargeProd*(inverseR+krf*r2-crf);
+                else
+                    energy += chargeProd*inverseR;
+            }
+            energy = blendZero(energy, include); // drop contributions from excluded or out-of-range elements
+            *totalEnergy += reduceAdd(energy);
+        }
+
+        // Accumulate forces.
+        dEdR = blendZero(dEdR, include); // drop contributions from excluded or out-of-range elements
+        const auto fx = dx*dEdR;
+        const auto fy = dy*dEdR;
+        const auto fz = dz*dEdR;
+        blockAtomForceX += fx;
+        blockAtomForceY += fy;
+        blockAtomForceZ += fz;
+
+        float* const atomForce = forces+4*atom; // forces uses the same 4-floats-per-atom layout as posq
+        const fvec4 newAtomForce = fvec4(atomForce) - reduceToVec3(fx, fy, fz); // the neighbor receives the opposite of the summed block forces
+        newAtomForce.store(atomForce);
+    }
+    
+    // Record the forces on the block atoms.
+    fvec4 f[blockSize];
+    transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f); // presumably converts the per-component vectors back into one fvec4 per atom
+    for (int j = 0; j < blockSize; j++)
+        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]); // read-modify-write: adds to the existing force entry
+}
+
+template<typename FVEC>
+template <int PERIODIC_TYPE>
+void CpuNonbondedForceFvec<FVEC>::getDeltaR(const fvec4& posI, const FVEC& x, const FVEC& y, const FVEC& z, FVEC& dx, FVEC& dy, FVEC& dz, FVEC& r2, const fvec4& boxSize, const fvec4& invBoxSize) const {
+    dx = x-posI[0]; // displacement from posI to each point, one point per vector element
+    dy = y-posI[1];
+    dz = z-posI[2];
+    if (PERIODIC_TYPE == PeriodicTriclinic) { // uses the members recipBoxSize/periodicBoxVectors rather than the boxSize/invBoxSize arguments
+        const auto scale3 = floor(dz*recipBoxSize[2]+0.5f); // nearest whole number of box vector 2 along z
+        dx -= scale3*periodicBoxVectors[2][0];
+        dy -= scale3*periodicBoxVectors[2][1];
+        dz -= scale3*periodicBoxVectors[2][2];
+        const auto scale2 = floor(dy*recipBoxSize[1]+0.5f); // then box vector 1, using the already-corrected dy
+        dx -= scale2*periodicBoxVectors[1][0];
+        dy -= scale2*periodicBoxVectors[1][1];
+        const auto scale1 = floor(dx*recipBoxSize[0]+0.5f); // finally box vector 0, using the already-corrected dx
+        dx -= scale1*periodicBoxVectors[0][0];
+    }
+    else if (PERIODIC_TYPE == PeriodicPerInteraction) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0]; // each axis is wrapped independently by whole box lengths
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
+    }
+    r2 = dx*dx + dy*dy + dz*dz; // for the other PERIODIC_TYPE values the raw displacement is used unchanged
+}
+
+} // namespace OpenMM
+
+#endif // OPENMM_CPU_NONBONDED_FORCE_FVEC_H__
--- a/platforms/cpu/include/CpuNonbondedForceVec4.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec4.h
-
-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
- * Contributors: Pande Group
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject
- * to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
-#define OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
-
-#include "CpuNonbondedForce.h"
-// ---------------------------------------------------------------------------------------
-
-namespace OpenMM {
-
-class CpuNonbondedForceVec4 : public CpuNonbondedForce {
-public:
-      /**---------------------------------------------------------------------------------------
-      
-         Constructor
-      
-         --------------------------------------------------------------------------------------- */
-
-       CpuNonbondedForceVec4();
-
-protected:
-      /**---------------------------------------------------------------------------------------
-      
-         Calculate all the interactions for one atom block.
-      
-         @param blockIndex       the index of the atom block
-         @param forces           force array (forces added)
-         @param totalEnergy      total energy
-            
-         --------------------------------------------------------------------------------------- */
-          
-      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
-
-      /**
-       * Templatized implementation of calculateBlockIxn.
-       */
-      template <int PERIODIC_TYPE>
-      void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
-            
-      /**---------------------------------------------------------------------------------------
-      
-         Calculate all the interactions for one atom block.
-      
-         @param blockIndex       the index of the atom block
-         @param forces           force array (forces added)
-         @param totalEnergy      total energy
-            
-         --------------------------------------------------------------------------------------- */
-          
-      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
-
-      /**
-       * Templatized implementation of calculateBlockEwaldIxn.
-       */
-      template <int PERIODIC_TYPE>
-      void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
-
-      /**
-       * Compute the displacement and squared distance between a collection of points, optionally using
-       * periodic boundary conditions.
-       */
-      template <int PERIODIC_TYPE>
-      void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
-
-      /**
-       * Compute a fast approximation to erfc(x).
-       */
-      fvec4 erfcApprox(const fvec4& x);
-
-      /**
-       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
-       */
-      fvec4 ewaldScaleFunction(const fvec4& x);
-
-      /**
-       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
-       * where dar = (dispersionAlpha * R)
-       * needed for LJPME energies.
-       */
-      fvec4 exptermsApprox(const fvec4& R);
-
-      /**
-       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
-       * where dar = (dispersionAlpha * R)
-       * needed for LJPME forces.
-       */
-      fvec4 dExptermsApprox(const fvec4& R);
-};
-
-} // namespace OpenMM
-
-// ---------------------------------------------------------------------------------------
-
-#endif // OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
--- a/platforms/cpu/include/CpuNonbondedForceVec8.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec8.h
-
-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
- * Contributors: Pande Group
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject
- * to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
-#define OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
-
-#include "CpuNonbondedForce.h"
-
-#ifdef __AVX__
-
-#include "openmm/internal/vectorize8.h"
-
-// ---------------------------------------------------------------------------------------
-
-namespace OpenMM {
-
-class CpuNonbondedForceVec8 : public CpuNonbondedForce {
-public:
-       CpuNonbondedForceVec8();
-
-protected:            
-      /**---------------------------------------------------------------------------------------
-      
-         Calculate all the interactions for one atom block.
-      
-         @param blockIndex       the index of the atom block
-         @param forces           force array (forces added)
-         @param totalEnergy      total energy
-            
-         --------------------------------------------------------------------------------------- */
-          
-      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
-      
-      /**
-       * Templatized implementation of calculateBlockIxn.
-       */
-      template <int PERIODIC_TYPE>
-      void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
-            
-      /**---------------------------------------------------------------------------------------
-      
-         Calculate all the interactions for one atom block.
-      
-         @param blockIndex       the index of the atom block
-         @param forces           force array (forces added)
-         @param totalEnergy      total energy
-            
-         --------------------------------------------------------------------------------------- */
-          
-      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
-
-      /**
-       * Templatized implementation of calculateBlockEwaldIxn.
-       */
-      template <int PERIODIC_TYPE>
-      void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
-
-      /**
-       * Compute the displacement and squared distance between a collection of points, optionally using
-       * periodic boundary conditions.
-       */
-      template <int PERIODIC_TYPE>
-      void getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
-
-      /**
-       * Compute a fast approximation to erfc(x).
-       */
-      fvec8 erfcApprox(const fvec8& x);
-      
-      /**
-       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
-       */
-      fvec8 ewaldScaleFunction(const fvec8& x);
-
-      /**
-       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
-       * where dar = (dispersionAlpha * R)
-       * needed for LJPME energies.
-       */
-      fvec8 exptermsApprox(const fvec8& R);
-
-      /**
-       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
-       * where dar = (dispersionAlpha * R)
-       * needed for LJPME forces.
-       */
-      fvec8 dExptermsApprox(const fvec8& R);
-
-};
-
-} // namespace OpenMM
-
-// ---------------------------------------------------------------------------------------
-
-#endif // __AVX__
-
-#endif // OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
--- a/platforms/cpu/src/CpuCustomGBForce.cpp
+++ b/platforms/cpu/src/CpuCustomGBForce.cpp
@@ -363,7 +363,7 @@ void CpuCustomGBForce::calculateParticlePairValue(int index, ThreadData& data, i
            const int blockSize = neighborList->getBlockSize();
            const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
            const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-            const vector<char>& blockExclusions = neighborList->getBlockExclusions(blockIndex);
+            const auto& blockExclusions = neighborList->getBlockExclusions(blockIndex);
            for (int i = 0; i < (int) neighbors.size(); i++) {
                int first = neighbors[i];
                for (int k = 0; k < blockSize; k++) {
@@ -458,7 +458,7 @@ void CpuCustomGBForce::calculateParticlePairEnergyTerm(int index, ThreadData& da
            const int blockSize = neighborList->getBlockSize();
            const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
            const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-            const vector<char>& blockExclusions = neighborList->getBlockExclusions(blockIndex);
+            const auto& blockExclusions = neighborList->getBlockExclusions(blockIndex);
            for (int i = 0; i < (int) neighbors.size(); i++) {
                int first = neighbors[i];
                for (int k = 0; k < blockSize; k++) {
@@ -545,7 +545,7 @@ void CpuCustomGBForce::calculateChainRuleForces(ThreadData& data, int numAtoms,
            const int blockSize = neighborList->getBlockSize();
            const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
            const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-            const vector<char>& blockExclusions = neighborList->getBlockExclusions(blockIndex);
+            const auto& blockExclusions = neighborList->getBlockExclusions(blockIndex);
            for (int i = 0; i < (int) neighbors.size(); i++) {
                int first = neighbors[i];
                for (int k = 0; k < blockSize; k++) {

--- a/platforms/cpu/src/CpuCustomManyParticleForce.cpp
+++ b/platforms/cpu/src/CpuCustomManyParticleForce.cpp
@@ -110,7 +110,7 @@ void CpuCustomManyParticleForce::calculateIxn(AlignedArray<float>& posq, vector<
        neighborList->computeNeighborList(numParticles, posq, exclusions, periodicBoxVectors, usePeriodic, cutoffDistance, threads);
        for (int blockIndex = 0; blockIndex < neighborList->getNumBlocks(); blockIndex++) {
            const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-            const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+            const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
            int numNeighbors = neighbors.size();
            for (int i = 0; i < 4; i++) {
                int p1 = neighborList->getSortedAtoms()[4*blockIndex+i];

--- a/platforms/cpu/src/CpuCustomNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForce.cpp
@@ -195,7 +195,7 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
            const int blockSize = neighborList->getBlockSize();
            const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
            const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-            const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+            const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
            for (int i = 0; i < (int) neighbors.size(); i++) {
                int first = neighbors[i];
                for (int j = 0; j < (int) paramNames.size(); j++)

--- a/platforms/cpu/src/CpuGayBerneForce.cpp
+++ b/platforms/cpu/src/CpuGayBerneForce.cpp
@@ -183,7 +183,7 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex,
            const int blockSize = neighborList->getBlockSize();
            const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
            const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-            const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+            const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
            for (int i = 0; i < (int) neighbors.size(); i++) {
                int first = neighbors[i];
                if (particles[first].sqrtEpsilon == 0.0f)

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -472,16 +472,11 @@ private:
    int numParticles;
 };

-bool isVec8Supported();
-CpuNonbondedForce* createCpuNonbondedForceVec4();
-CpuNonbondedForce* createCpuNonbondedForceVec8();
+CpuNonbondedForce* createCpuNonbondedForceVec();

 CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
        data(data), hasInitializedPme(false), hasInitializedDispersionPme(false), nonbonded(NULL) {
-    if (isVec8Supported())
-        nonbonded = createCpuNonbondedForceVec8();
-    else
-        nonbonded = createCpuNonbondedForceVec4();
+    nonbonded = createCpuNonbondedForceVec();
 }

 CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -164,7 +164,7 @@ public:
        return VoxelIndex(y, z);
    }
        
-    void getNeighbors(vector<int>& neighbors, int blockIndex, const fvec4& blockCenter, const fvec4& blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int>& blockAtoms, const vector<float>& blockAtomX, const vector<float>& blockAtomY, const vector<float>& blockAtomZ, const vector<float>& sortedPositions, const vector<VoxelIndex>& atomVoxelIndex) const {
+    void getNeighbors(vector<int>& neighbors, int blockIndex, const fvec4& blockCenter, const fvec4& blockWidth, const vector<int>& sortedAtoms, vector<CpuNeighborList::BlockExclusionMask>& exclusions, float maxDistance, const vector<int>& blockAtoms, const vector<float>& blockAtomX, const vector<float>& blockAtomY, const vector<float>& blockAtomZ, const vector<float>& sortedPositions, const vector<VoxelIndex>& atomVoxelIndex) const {
        neighbors.resize(0);
        exclusions.resize(0);
        fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
@@ -484,10 +484,10 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
    
    int numPadding = numBlocks*blockSize-numAtoms;
    if (numPadding > 0) {
-        char mask = ((0xFFFF-(1<<blockSize)+1) >> numPadding);
+        const BlockExclusionMask mask = (~0) << (blockSize - numPadding);
        for (int i = 0; i < numPadding; i++)
            sortedAtoms.push_back(0);
-        vector<char>& exc = blockExclusions[blockExclusions.size()-1];
+        auto& exc = blockExclusions[blockExclusions.size()-1];
        for (int i = 0; i < (int) exc.size(); i++)
            exc[i] |= mask;
    }
@@ -509,7 +509,7 @@ const std::vector<int>& CpuNeighborList::getBlockNeighbors(int blockIndex) const
    return blockNeighbors[blockIndex];
 }

-const std::vector<char>& CpuNeighborList::getBlockExclusions(int blockIndex) const {
+const std::vector<CpuNeighborList::BlockExclusionMask>& CpuNeighborList::getBlockExclusions(int blockIndex) const {
    return blockExclusions[blockIndex];
    
 }
@@ -573,12 +573,12 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI

        // Record the exclusions for this block.

-        map<int, char> atomFlags;
+        map<int, BlockExclusionMask> atomFlags;
        for (int j = 0; j < atomsInBlock; j++) {
            const set<int>& atomExclusions = (*exclusions)[sortedAtoms[firstIndex+j]];
-            char mask = 1<<j;
+            const BlockExclusionMask mask = 1<<j;
            for (int exclusion : atomExclusions) {
-                map<int, char>::iterator thisAtomFlags = atomFlags.find(exclusion);
+                const auto thisAtomFlags = atomFlags.find(exclusion);
                if (thisAtomFlags == atomFlags.end())
                    atomFlags[exclusion] = mask;
                else
@@ -588,7 +588,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
        int numNeighbors = blockNeighbors[i].size();
        for (int k = 0; k < numNeighbors; k++) {
            int atomIndex = blockNeighbors[i][k];
-            map<int, char>::iterator thisAtomFlags = atomFlags.find(atomIndex);
+            auto thisAtomFlags = atomFlags.find(atomIndex);
            if (thisAtomFlags != atomFlags.end())
                blockExclusions[i][k] |= thisAtomFlags->second;
        }

--- a/platforms/cpu/src/CpuNonbondedForceFvec.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceFvec.cpp
+
+/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+ * Contributors: Daniel Towner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "CpuNonbondedForceFvec.h"
+
+OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec4();
+OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec8();
+
+bool isVec8Supported();
+
+/**
+ * Factory method: returns the result of createCpuNonbondedForceVec8() when
+ * isVec8Supported() reports true, and of createCpuNonbondedForceVec4() otherwise.
+ */
+OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec() {
+    return isVec8Supported() ? createCpuNonbondedForceVec8()
+                             : createCpuNonbondedForceVec4();
+}
+
+/**
+ * Returns 8 when isVec8Supported() reports true, and 4 otherwise.
+ */
+int getVecBlockSize() {
+    const bool useVec8 = isVec8Supported();
+    return useVec8 ? 8 : 4;
+}
--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
@@ -22,440 +22,11 @@
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

-#include "SimTKOpenMMUtilities.h"
-#include "CpuNonbondedForceVec4.h"
-#include <algorithm>
-#include <iostream>
+#include "CpuNonbondedForceFvec.h"

-using namespace std;
-using namespace OpenMM;
+// Deliberately minimal file: it exists purely so that it can be compiled for SIMD-4.

-/**
- * Factory method to create a CpuNonbondedForceVec4.
- */
-CpuNonbondedForce* createCpuNonbondedForceVec4() {
-    return new CpuNonbondedForceVec4();
-}
-
-/**---------------------------------------------------------------------------------------
-
-   CpuNonbondedForceVec4 constructor
-
-   --------------------------------------------------------------------------------------- */
-
-CpuNonbondedForceVec4::CpuNonbondedForceVec4() {
-}
-
-enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};
-
-void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    // Determine whether we need to apply periodic boundary conditions.
-    
-    PeriodicType periodicType;
-    fvec4 blockCenter;
-    if (!periodic) {
-        periodicType = NoPeriodic;
-        blockCenter = 0.0f;
-    }
-    else {
-        const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
-        float minx, maxx, miny, maxy, minz, maxz;
-        minx = maxx = posq[4*blockAtom[0]];
-        miny = maxy = posq[4*blockAtom[0]+1];
-        minz = maxz = posq[4*blockAtom[0]+2];
-        for (int i = 1; i < 4; i++) {
-            minx = min(minx, posq[4*blockAtom[i]]);
-            maxx = max(maxx, posq[4*blockAtom[i]]);
-            miny = min(miny, posq[4*blockAtom[i]+1]);
-            maxy = max(maxy, posq[4*blockAtom[i]+1]);
-            minz = min(minz, posq[4*blockAtom[i]+2]);
-            maxz = max(maxz, posq[4*blockAtom[i]+2]);
-        }
-        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
-        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
-                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
-            periodicType = NoPeriodic;
-        else if (triclinic)
-            periodicType = PeriodicTriclinic;
-        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
-                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
-                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
-            periodicType = PeriodicPerAtom;
-        else
-            periodicType = PeriodicPerInteraction;
-    }
-    
-    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
-    
-    if (periodicType == NoPeriodic)
-        calculateBlockIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-    else if (periodicType == PeriodicPerAtom)
-        calculateBlockIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-    else if (periodicType == PeriodicPerInteraction)
-        calculateBlockIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-    else if (periodicType == PeriodicTriclinic)
-        calculateBlockIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-}
-
-template <int PERIODIC_TYPE>
-void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
-    // Load the positions and parameters of the atoms in the block.
-    
-    const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
-    fvec4 blockAtomPosq[4];
-    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
-    for (int i = 0; i < 4; i++) {
-        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
-        if (PERIODIC_TYPE == PeriodicPerAtom)
-            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
-    }
-    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
-    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
-    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
-    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
-    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
-    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
-    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
-    
-    // Loop over neighbors for this block.
-    
-    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    for (int i = 0; i < (int) neighbors.size(); i++) {
-        // Load the next neighbor.
-        
-        int atom = neighbors[i];
-        
-        // Compute the distances to the block atoms.
-        
-        fvec4 dx, dy, dz, r2;
-        fvec4 atomPos(posq+4*atom);
-        if (PERIODIC_TYPE == PeriodicPerAtom)
-            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
-        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
-        ivec4 include;
-        char excl = exclusions[i];
-        if (excl == 0)
-            include = -1;
-        else
-            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
-        include = include & (r2 < cutoffDistance*cutoffDistance);
-        if (!any(include))
-            continue; // No interactions to compute.
-        
-        // Compute the interactions.
-        
-        fvec4 inverseR = rsqrt(r2);
-        fvec4 energy, dEdR;
-        float atomEpsilon = atomParameters[atom].second;
-        if (atomEpsilon != 0.0f) {
-            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
-            fvec4 sig2 = inverseR*sig;
-            sig2 *= sig2;
-            fvec4 sig6 = sig2*sig2*sig2;
-            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
-            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
-            energy = epsSig6*(sig6-1.0f);
-            if (useSwitch) {
-                fvec4 r = r2*inverseR;
-                fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
-                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
-                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
-                dEdR = switchValue*dEdR - energy*switchDeriv*r;
-                energy *= switchValue;
-            }
-        }
-        else {
-            energy = 0.0f;
-            dEdR = 0.0f;
-        }
-        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
-        if (cutoff)
-            dEdR += chargeProd*(inverseR-2.0f*krf*r2);
-        else
-            dEdR += chargeProd*inverseR;
-        dEdR *= inverseR*inverseR;
-
-        // Accumulate energies.
-
-        fvec4 one(1.0f);
-        if (totalEnergy) {
-            if (cutoff)
-                energy += chargeProd*(inverseR+krf*r2-crf);
-            else
-                energy += chargeProd*inverseR;
-            energy = blend(0.0f, energy, include);
-            *totalEnergy += dot4(energy, one);
-        }
-
-        // Accumulate forces.
-
-        dEdR = blend(0.0f, dEdR, include);
-        fvec4 fx = dx*dEdR;
-        fvec4 fy = dy*dEdR;
-        fvec4 fz = dz*dEdR;
-        blockAtomForceX += fx;
-        blockAtomForceY += fy;
-        blockAtomForceZ += fz;
-        float* atomForce = forces+4*atom;
-        atomForce[0] -= dot4(fx, one);
-        atomForce[1] -= dot4(fy, one);
-        atomForce[2] -= dot4(fz, one);
-    }
-    
-    // Record the forces on the block atoms.
-
-    fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
-    transpose(f[0], f[1], f[2], f[3]);
-    for (int j = 0; j < 4; j++)
-        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
-  }
-
-void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    // Determine whether we need to apply periodic boundary conditions.
-    PeriodicType periodicType;
-    fvec4 blockCenter;
-    if (!periodic) {
-        periodicType = NoPeriodic;
-        blockCenter = 0.0f;
-    }
-    else {
-        const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
-        float minx, maxx, miny, maxy, minz, maxz;
-        minx = maxx = posq[4*blockAtom[0]];
-        miny = maxy = posq[4*blockAtom[0]+1];
-        minz = maxz = posq[4*blockAtom[0]+2];
-        for (int i = 1; i < 4; i++) {
-            minx = min(minx, posq[4*blockAtom[i]]);
-            maxx = max(maxx, posq[4*blockAtom[i]]);
-            miny = min(miny, posq[4*blockAtom[i]+1]);
-            maxy = max(maxy, posq[4*blockAtom[i]+1]);
-            minz = min(minz, posq[4*blockAtom[i]+2]);
-            maxz = max(maxz, posq[4*blockAtom[i]+2]);
-        }
-        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
-        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
-                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
-            periodicType = NoPeriodic;
-        else if (triclinic)
-            periodicType = PeriodicTriclinic;
-        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
-                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
-                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
-            periodicType = PeriodicPerAtom;
-        else
-            periodicType = PeriodicPerInteraction;
-    }
-    
-    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
-    
-    if (periodicType == NoPeriodic)
-        calculateBlockEwaldIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-    else if (periodicType == PeriodicPerAtom)
-        calculateBlockEwaldIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-    else if (periodicType == PeriodicPerInteraction)
-        calculateBlockEwaldIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-    else if (periodicType == PeriodicTriclinic)
-        calculateBlockEwaldIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
-}
-
-template <int PERIODIC_TYPE>
-void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
-    // Load the positions and parameters of the atoms in the block.
-    const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
-    fvec4 blockAtomPosq[4];
-    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
-    for (int i = 0; i < 4; i++) {
-        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
-        if (PERIODIC_TYPE == PeriodicPerAtom)
-            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
-    }
-    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
-    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
-    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
-    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
-    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
-    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-    fvec4 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]]);
-    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
-    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
-
-    // Loop over neighbors for this block.
-    
-    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    for (int i = 0; i < (int) neighbors.size(); i++) {
-        // Load the next neighbor.
-        
-        int atom = neighbors[i];
-        
-        // Compute the distances to the block atoms.
-        
-        fvec4 dx, dy, dz, r2;
-        fvec4 atomPos(posq+4*atom);
-        if (PERIODIC_TYPE == PeriodicPerAtom)
-            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
-        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
-        ivec4 include;
-        char excl = exclusions[i];
-        if (excl == 0)
-            include = -1;
-        else
-            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
-        include = include & (r2 < cutoffDistance*cutoffDistance);
-        if (!any(include))
-            continue; // No interactions to compute.
-        
-        // Compute the interactions.
-        
-        fvec4 inverseR = rsqrt(r2);
-        fvec4 r = r2*inverseR;
-        fvec4 energy, dEdR;
-        float atomEpsilon = atomParameters[atom].second;
-        if (atomEpsilon != 0.0f) {
-            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
-            fvec4 sig2 = inverseR*sig;
-            sig2 *= sig2;
-            fvec4 sig6 = sig2*sig2*sig2;
-            fvec4 eps = blockAtomEpsilon*atomEpsilon;
-            fvec4 epsSig6 = eps*sig6;
-            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
-            energy = epsSig6*(sig6-1.0f);
-            if (useSwitch) {
-                fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
-                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
-                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
-                dEdR = switchValue*dEdR - energy*switchDeriv*r;
-                energy *= switchValue;
-            }
-
-            if (ljpme) {
-                fvec4 C6ij = C6s*C6params[atom];
-                fvec4 inverseR2 = inverseR*inverseR;
-                fvec4 mysig2 = sig*sig;
-                fvec4 mysig6 = mysig2*mysig2*mysig2;
-                fvec4 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
-                fvec4 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
-                dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
-                energy += emult + potentialShift;
-            }
-        }
-        else {
-            energy = 0.0f;
-            dEdR = 0.0f;
-        }
-        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
-        dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
-        dEdR *= inverseR*inverseR;        
-
-        // Accumulate energies.
-
-        fvec4 one(1.0f);
-        if (totalEnergy) {
-            energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
-            energy = blend(0.0f, energy, include);
-            *totalEnergy += dot4(energy, one);
-        }
-
-        // Accumulate forces.
-
-        dEdR = blend(0.0f, dEdR, include);
-        fvec4 fx = dx*dEdR;
-        fvec4 fy = dy*dEdR;
-        fvec4 fz = dz*dEdR;
-        blockAtomForceX += fx;
-        blockAtomForceY += fy;
-        blockAtomForceZ += fz;
-        float* atomForce = forces+4*atom;
-        atomForce[0] -= dot4(fx, one);
-        atomForce[1] -= dot4(fy, one);
-        atomForce[2] -= dot4(fz, one);
-    }
-    
-    // Record the forces on the block atoms.
-
-    fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
-    transpose(f[0], f[1], f[2], f[3]);
-    for (int j = 0; j < 4; j++)
-        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
-}
-
-template <int PERIODIC_TYPE>
-void CpuNonbondedForceVec4::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
-    dx = x-posI[0];
-    dy = y-posI[1];
-    dz = z-posI[2];
-    if (PERIODIC_TYPE == PeriodicTriclinic) {
-        fvec4 scale3 = floor(dz*recipBoxSize[2]+0.5f);
-        dx -= scale3*periodicBoxVectors[2][0];
-        dy -= scale3*periodicBoxVectors[2][1];
-        dz -= scale3*periodicBoxVectors[2][2];
-        fvec4 scale2 = floor(dy*recipBoxSize[1]+0.5f);
-        dx -= scale2*periodicBoxVectors[1][0];
-        dy -= scale2*periodicBoxVectors[1][1];
-        fvec4 scale1 = floor(dx*recipBoxSize[0]+0.5f);
-        dx -= scale1*periodicBoxVectors[0][0];
-    }
-    else if (PERIODIC_TYPE == PeriodicPerInteraction) {
-        dx -= round(dx*invBoxSize[0])*boxSize[0];
-        dy -= round(dy*invBoxSize[1])*boxSize[1];
-        dz -= round(dz*invBoxSize[2])*boxSize[2];
-    }
-    r2 = dx*dx + dy*dy + dz*dz;
-}
-
-fvec4 CpuNonbondedForceVec4::erfcApprox(const fvec4& x) {
-    fvec4 x1 = x*erfcDXInv;
-    ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
-    fvec4 coeff2 = x1-index;
-    fvec4 coeff1 = 1.0f-coeff2;
-    fvec4 t1(&erfcTable[index[0]]);
-    fvec4 t2(&erfcTable[index[1]]);
-    fvec4 t3(&erfcTable[index[2]]);
-    fvec4 t4(&erfcTable[index[3]]);
-    transpose(t1, t2, t3, t4);
-    return coeff1*t1 + coeff2*t2;
-}
-
-fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(const fvec4& x) {
-    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
-
-    fvec4 x1 = x*ewaldDXInv;
-    ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
-    fvec4 coeff2 = x1-index;
-    fvec4 coeff1 = 1.0f-coeff2;
-    fvec4 t1(&ewaldScaleTable[index[0]]);
-    fvec4 t2(&ewaldScaleTable[index[1]]);
-    fvec4 t3(&ewaldScaleTable[index[2]]);
-    fvec4 t4(&ewaldScaleTable[index[3]]);
-    transpose(t1, t2, t3, t4);
-    return coeff1*t1 + coeff2*t2;
-}
-
-fvec4 CpuNonbondedForceVec4::exptermsApprox(const fvec4& r) {
-    fvec4 r1 = r*exptermsDXInv;
-    ivec4 index = min(floor(r1), NUM_TABLE_POINTS);
-    fvec4 coeff2 = r1-index;
-    fvec4 coeff1 = 1.0f-coeff2;
-    fvec4 t1(&exptermsTable[index[0]]);
-    fvec4 t2(&exptermsTable[index[1]]);
-    fvec4 t3(&exptermsTable[index[2]]);
-    fvec4 t4(&exptermsTable[index[3]]);
-    transpose(t1, t2, t3, t4);
-    return coeff1*t1 + coeff2*t2;
-}
-
-fvec4 CpuNonbondedForceVec4::dExptermsApprox(const fvec4& r) {
-    fvec4 r1 = r*exptermsDXInv;
-    ivec4 index = min(floor(r1), NUM_TABLE_POINTS);
-    fvec4 coeff2 = r1-index;
-    fvec4 coeff1 = 1.0f-coeff2;
-    fvec4 t1(&dExptermsTable[index[0]]);
-    fvec4 t2(&dExptermsTable[index[1]]);
-    fvec4 t3(&dExptermsTable[index[2]]);
-    fvec4 t4(&dExptermsTable[index[3]]);
-    transpose(t1, t2, t3, t4);
-    return coeff1*t1 + coeff2*t2;
+OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec4()   {
+    return new OpenMM::CpuNonbondedForceFvec<fvec4>();
 }

--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -165,11 +165,14 @@ CpuPlatform::PlatformData::~PlatformData() {
        delete neighborList;
 }

-bool isVec8Supported();
+/**
+ * Return the vectorisation block size supported by the host platform.
+ */
+int getVecBlockSize();

 void CpuPlatform::PlatformData::requestNeighborList(double cutoffDistance, double padding, bool useExclusions, const vector<set<int> >& exclusionList) {
    if (neighborList == NULL)
-        neighborList = new CpuNeighborList(isVec8Supported() ? 8 : 4);
+        neighborList = new CpuNeighborList(getVecBlockSize());
    if (cutoffDistance > cutoff)
        cutoff = cutoffDistance;
    if (cutoffDistance+padding > paddedCutoff)

--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -133,10 +133,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CommonIntegrateCustomStepKernel(name, platform, cu);
    if (name == ApplyAndersenThermostatKernel::Name())
        return new CommonApplyAndersenThermostatKernel(name, platform, cu);
-    if (name == NoseHooverChainKernel::Name())
-        return new CommonNoseHooverChainKernel(name, platform, cu);
-    if (name == IntegrateVelocityVerletStepKernel::Name())
-        return new CommonIntegrateVelocityVerletStepKernel(name, platform, cu);
+    if (name == IntegrateNoseHooverStepKernel::Name())
+        return new CommonIntegrateNoseHooverStepKernel(name, platform, cu);
    if (name == ApplyMonteCarloBarostatKernel::Name())
        return new CudaApplyMonteCarloBarostatKernel(name, platform, cu);
    if (name == RemoveCMMotionKernel::Name())

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -96,7 +96,7 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
-    registerKernelFactory(IntegrateVelocityVerletStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateNoseHooverStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinMiddleStepKernel::Name(), factory);
    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
@@ -104,7 +104,6 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
    registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
-    registerKernelFactory(NoseHooverChainKernel::Name(), factory);
    registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
    registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
    platformProperties.push_back(CudaDeviceIndex());

--- a/platforms/cuda/tests/TestCudaNoseHooverThermostat.cpp
+++ b/platforms/cuda/tests/TestCudaNoseHooverThermostat.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2019 Stanford University and the Authors.           *
- * Authors: Andreas Krämer and Andrew C. Simmonett                            *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#include "CudaTests.h"
-#include "TestNoseHooverThermostat.h"
-
-void runPlatformTests() {
-}
--- a/platforms/opencl/src/OpenCLKernelFactory.cpp
+++ b/platforms/opencl/src/OpenCLKernelFactory.cpp
@@ -131,10 +131,8 @@ KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platfo
        return new CommonIntegrateCustomStepKernel(name, platform, cl);
    if (name == ApplyAndersenThermostatKernel::Name())
        return new CommonApplyAndersenThermostatKernel(name, platform, cl);
-    if (name == NoseHooverChainKernel::Name())
-        return new CommonNoseHooverChainKernel(name, platform, cl);
-    if (name == IntegrateVelocityVerletStepKernel::Name())
-        return new CommonIntegrateVelocityVerletStepKernel(name, platform, cl);
+    if (name == IntegrateNoseHooverStepKernel::Name())
+        return new CommonIntegrateNoseHooverStepKernel(name, platform, cl);
    if (name == ApplyMonteCarloBarostatKernel::Name())
        return new OpenCLApplyMonteCarloBarostatKernel(name, platform, cl);
    if (name == RemoveCMMotionKernel::Name())