Vectorized CpuCustomNonbondedForce (#3568)

* Began vectorizing CustomNonbondedForce * Refactored CpuCustomNonbondedForce to support multiple vector sizes * AVX implementation of CpuCustomNonbondedForce * Fixed compilation errors

Vectorized CpuCustomNonbondedForce (#3568)
* Began vectorizing CustomNonbondedForce * Refactored CpuCustomNonbondedForce to support multiple vector sizes * AVX implementation of CpuCustomNonbondedForce * Fixed compilation errors
c8981916 · Peter Eastman · GitHub · fe21d5ee · c8981916 · c8981916
Unverified Commit c8981916 authored Apr 14, 2022 by Peter Eastman Committed by GitHub Apr 14, 2022
14 changed files
--- a/libraries/lepton/src/CompiledVectorExpression.cpp
+++ b/libraries/lepton/src/CompiledVectorExpression.cpp
@@ -580,7 +580,7 @@ void CompiledVectorExpression::generateJitCode() {
    CodeHolder code;
    code.init(runtime.environment());
    x86::Compiler c(&code);
-    FuncNode* funcNode = c.addFunc(FuncSignatureT<double>());
+    FuncNode* funcNode = c.addFunc(FuncSignatureT<void>());
    funcNode->frame().setAvxEnabled();
    vector<x86::Ymm> workspaceVar(workspace.size()/width);
    for (int i = 0; i < (int) workspaceVar.size(); i++)

--- a/openmmapi/include/openmm/internal/hardware.h
+++ b/openmmapi/include/openmm/internal/hardware.h
@@ -36,9 +36,6 @@
 * This file defines a collection of functions for querying the specific hardware being used.
 */

-/**
- * Get the number of CPU cores available.
- */
 #ifdef __APPLE__
   #include <sys/sysctl.h>
   #include <dlfcn.h>
@@ -56,6 +53,9 @@
   #endif
 #endif

+/**
+ * Get the number of CPU cores available.
+ */
 static int getNumProcessors() {
 #ifdef __APPLE__
    int ncpu;
@@ -95,7 +95,7 @@ static int getNumProcessors() {
 #else
 #if !defined(__ANDROID__) && !defined(__PNACL__) && !defined(__PPC__) \
    && !defined(__ARM__) && !defined(__ARM64__)
-    static void cpuid(int cpuInfo[4], int infoType){
+    static void cpuid(int cpuInfo[4], int infoType) {
    #ifdef __LP64__
        __asm__ __volatile__ (
            "cpuid":
@@ -119,7 +119,33 @@ static int getNumProcessors() {
        );
    #endif
    }
-    #endif
+#else
+    static void cpuid(int cpuInfo[4], int infoType) {
+        cpuInfo[0] = cpuInfo[1] = cpuInfo[2] = 0;
+    }
+#endif
 #endif

+/**
+ * Get whether this is an x86 CPU that supports AVX.
+ */
+static bool isAvxSupported() {
+    int cpuInfo[4];
+    cpuid(cpuInfo, 0);
+    if (cpuInfo[0] >= 1) {
+        cpuid(cpuInfo, 1);
+        return ((cpuInfo[2] & ((int) 1 << 28)) != 0);
+    }
+    return false;
+}
+
+/**
+ * Get the maximum supported size for vectors in multiples of four bytes.  This
+ * is the number of int or float values that can be contained in a vector.
+ */
+static int getVectorWidth() {
+    if (isAvxSupported())
+        return 8;
+    return 4;
+}
 #endif // OPENMM_HARDWARE_H_
--- a/platforms/cpu/include/CpuCustomNonbondedForce.h
+++ b/platforms/cpu/include/CpuCustomNonbondedForce.h
@@ -30,6 +30,8 @@
 #include "openmm/internal/CompiledExpressionSet.h"
 #include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
+#include "lepton/CompiledVectorExpression.h"
+#include "lepton/ParsedExpression.h"
 #include <atomic>
 #include <map>
 #include <set>
@@ -37,21 +39,21 @@
 #include <vector>

 namespace OpenMM {
-    
-class CpuCustomNonbondedForce {
-   public:

+class CpuCustomNonbondedForce {
+public:
      /**---------------------------------------------------------------------------------------

         Constructor

         --------------------------------------------------------------------------------------- */

-       CpuCustomNonbondedForce(const Lepton::CompiledExpression& energyExpression, const Lepton::CompiledExpression& forceExpression,
-                               const std::vector<std::string>& parameterNames, const std::vector<std::set<int> >& exclusions,
-                               const std::vector<Lepton::CompiledExpression> energyParamDerivExpressions,
-                               const std::vector<std::string>& computedValueNames, const std::vector<Lepton::CompiledExpression> computedValueExpressions,
-                               ThreadPool& threads);
+       CpuCustomNonbondedForce(ThreadPool& threads);
+
+       void initialize(const Lepton::ParsedExpression& energyExpression, const Lepton::ParsedExpression& forceExpression,
+                       const std::vector<std::string>& parameterNames, const std::vector<std::set<int> >& exclusions,
+                       const std::vector<Lepton::ParsedExpression> energyParamDerivExpressions,
+                       const std::vector<std::string>& computedValueNames, const std::vector<Lepton::ParsedExpression> computedValueExpressions);

      /**---------------------------------------------------------------------------------------

@@ -59,7 +61,7 @@ class CpuCustomNonbondedForce {

         --------------------------------------------------------------------------------------- */

-       ~CpuCustomNonbondedForce();
+       virtual ~CpuCustomNonbondedForce();

      /**---------------------------------------------------------------------------------------

@@ -123,7 +125,7 @@ class CpuCustomNonbondedForce {
    void calculatePairIxn(int numberOfAtoms, float* posq, std::vector<OpenMM::Vec3>& atomCoordinates, std::vector<std::vector<double> >& atomParameters,
                          const std::map<std::string, double>& globalParameters, std::vector<AlignedArray<float> >& threadForce,
                          bool includeForce, bool includeEnergy, double& totalEnergy, double* energyParamDerivs);
-private:
+protected:
    class ThreadData;

    bool cutoff;
@@ -137,7 +139,7 @@ private:
    AlignedArray<fvec4> periodicBoxVec4;
    double cutoffDistance, switchingDistance;
    ThreadPool& threads;
-    const std::vector<std::set<int> > exclusions;
+    std::vector<std::set<int> > exclusions;
    std::vector<ThreadData*> threadData;
    std::vector<std::string> paramNames, computedValueNames;
    std::vector<std::pair<int, int> > groupInteractions;
@@ -167,10 +169,22 @@ private:
     * @param forces           force array (forces added)
     * @param totalEnergy      total energy
     * @param boxSize          the size of the periodic box
-     * @param boxSize          the inverse size of the periodic box
+     * @param invBoxSize       the inverse size of the periodic box
     */
    void calculateOneIxn(int atom1, int atom2, ThreadData& data, float* forces, double& totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);

+    /**
+     * Calculate all the interactions for one block of atoms.
+     * 
+     * @param data            workspace for the current thread
+     * @param blockIndex      the index of the atom block
+     * @param forces          force array (forces added)
+     * @param totalEnergy     total energy
+     * @param boxSize         the size of the periodic box
+     * @param invBoxSize       the inverse size of the periodic box
+     */
+    virtual void calculateBlockIxn(ThreadData& data, int blockIndex, float* forces, double& totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) = 0;
+
    /**
     * Compute the displacement and squared distance between two points, optionally using
     * periodic boundary conditions.
@@ -180,19 +194,26 @@ private:

 class CpuCustomNonbondedForce::ThreadData {
 public:
-    ThreadData(const Lepton::CompiledExpression& energyExpression, const Lepton::CompiledExpression& forceExpression, const std::vector<std::string>& parameterNames,
+    ThreadData(const Lepton::CompiledExpression& energyExpression, const Lepton::CompiledVectorExpression& energyVecExpression,
+            const Lepton::CompiledExpression& forceExpression, const Lepton::CompiledVectorExpression& forceVecExpression, const std::vector<std::string>& parameterNames,
            const std::vector<Lepton::CompiledExpression> energyParamDerivExpressions, const std::vector<std::string>& computedValueNames,
            const std::vector<Lepton::CompiledExpression> computedValueExpressions, std::vector<std::vector<double> >& atomComputedValues);
-    Lepton::CompiledExpression energyExpression;
-    Lepton::CompiledExpression forceExpression;
+    Lepton::CompiledExpression energyExpression, forceExpression;
+    Lepton::CompiledVectorExpression energyVecExpression, forceVecExpression;
    std::vector<Lepton::CompiledExpression> computedValueExpressions, energyParamDerivExpressions;
    CompiledExpressionSet expressionSet;
    std::vector<double> particleParam, computedValues;
+    std::vector<float> rvec, vecParticle1Params, vecParticle2Params, vecParticle1Values, vecParticle2Values;
    double r;
    std::vector<double> energyParamDerivs; 
    std::vector<std::vector<double> >& atomComputedValues;
 };

+/**
+ * This function is called to create an instance of an appropriate subclass for the current CPU.
+ */
+CpuCustomNonbondedForce* createCpuCustomNonbondedForce(ThreadPool& threads);
+
 } // namespace OpenMM

 #endif // OPENMM_CPU_CUSTOM_NONBONDED_FORCE_H__
--- a/platforms/cpu/include/CpuCustomNonbondedForceFvec.h
+++ b/platforms/cpu/include/CpuCustomNonbondedForceFvec.h
+
+/* Portions copyright (c) 2009-2022 Stanford University and Simbios.
+ * Contributors: Peter Eastman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef OPENMM_CPU_CUSTOM_NONBONDED_FORCE_FVEC_H__
+#define OPENMM_CPU_CUSTOM_NONBONDED_FORCE_FVEC_H__
+
+#include "CpuCustomNonbondedForce.h"
+
+namespace OpenMM {
+
+enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};
+
+template <typename FVEC, int BLOCK_SIZE>
+class CpuCustomNonbondedForceFvec : public CpuCustomNonbondedForce {
+public:
+    CpuCustomNonbondedForceFvec(ThreadPool& threads) : CpuCustomNonbondedForce(threads) {
+    }
+
+protected:
+    /**
+     * Calculate all the interactions for one block of atoms.
+     * 
+     * @param data            workspace for the current thread
+     * @param blockIndex      the index of the atom block
+     * @param forces          force array (forces added)
+     * @param totalEnergy     total energy
+     * @param boxSize         the size of the periodic box
+     * @param invBoxSize       the inverse size of the periodic box
+     */
+    void calculateBlockIxn(ThreadData& data, int blockIndex, float* forces, double& totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+
+    template <int PERIODIC_TYPE>
+    void calculateBlockIxnImpl(ThreadData& data, int blockIndex, float* forces, double& totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
+
+    /**
+     * Compute the displacement and squared distance between a collection of points, optionally using
+     * periodic boundary conditions.
+     */
+    template <int PERIODIC_TYPE>
+    void getDeltaR(const fvec4& posI, const FVEC& x, const FVEC& y, const FVEC& z, FVEC& dx, FVEC& dy, FVEC& dz, FVEC& r2, const fvec4& boxSize, const fvec4& invBoxSize) const;
+};
+
+template<typename FVEC, int BLOCK_SIZE>
+void CpuCustomNonbondedForceFvec<FVEC, BLOCK_SIZE>::calculateBlockIxn(ThreadData& data, int blockIndex, float* forces, double& totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Determine whether we need to apply periodic boundary conditions.
+
+    PeriodicType periodicType;
+    fvec4 blockCenter;
+    if (!periodic) {
+        periodicType = NoPeriodic;
+        blockCenter = 0.0f;
+    }
+    else {
+        const int32_t* blockAtom = &neighborList->getSortedAtoms()[BLOCK_SIZE*blockIndex];
+        float minx, maxx, miny, maxy, minz, maxz;
+        minx = maxx = posq[4*blockAtom[0]];
+        miny = maxy = posq[4*blockAtom[0]+1];
+        minz = maxz = posq[4*blockAtom[0]+2];
+        for (int i = 1; i < BLOCK_SIZE; i++) {
+            minx = std::min(minx, posq[4*blockAtom[i]]);
+            maxx = std::max(maxx, posq[4*blockAtom[i]]);
+            miny = std::min(miny, posq[4*blockAtom[i]+1]);
+            maxy = std::max(maxy, posq[4*blockAtom[i]+1]);
+            minz = std::min(minz, posq[4*blockAtom[i]+2]);
+            maxz = std::max(maxz, posq[4*blockAtom[i]+2]);
+        }
+        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
+        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
+                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
+            periodicType = NoPeriodic;
+        else if (triclinic)
+            periodicType = PeriodicTriclinic;
+        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
+                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
+                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
+            periodicType = PeriodicPerAtom;
+        else
+            periodicType = PeriodicPerInteraction;
+    }
+
+    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
+
+    if (periodicType == NoPeriodic)
+        calculateBlockIxnImpl<NoPeriodic>(data, blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerAtom)
+        calculateBlockIxnImpl<PeriodicPerAtom>(data, blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerInteraction)
+        calculateBlockIxnImpl<PeriodicPerInteraction>(data, blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicTriclinic)
+        calculateBlockIxnImpl<PeriodicTriclinic>(data, blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+}
+
+template<typename FVEC, int BLOCK_SIZE>
+template <int PERIODIC_TYPE>
+void CpuCustomNonbondedForceFvec<FVEC, BLOCK_SIZE>::calculateBlockIxnImpl(ThreadData& data, int blockIndex, float* forces, double& totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
+    // Load the positions and parameters of the atoms in the block.
+
+    const int32_t* blockAtom = &neighborList->getSortedAtoms()[BLOCK_SIZE*blockIndex];
+    fvec4 blockAtomPosq[BLOCK_SIZE];
+    FVEC blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
+    FVEC blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
+    int numParams = paramNames.size();
+    int numComputed = computedValueNames.size();
+    for (int i = 0; i < BLOCK_SIZE; i++) {
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
+        for (int j = 0; j < numParams; j++)
+            data.vecParticle1Params[j*BLOCK_SIZE+i] = atomParameters[blockAtom[i]][j];
+        for (int j = 0; j < numComputed; j++)
+            data.vecParticle1Values[j*BLOCK_SIZE+i] = atomComputedValues[j][blockAtom[i]];
+    }
+    transpose(blockAtomPosq, blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
+    const FVEC cutoffDistanceSquared = cutoffDistance * cutoffDistance;
+
+    // Loop over neighbors for this block.
+
+    const auto& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
+    FVEC partialEnergy = {};
+    for (int i = 0; i < neighbors.size(); i++) {
+        // Load the next neighbor.
+
+        int atom = neighbors[i];
+        for (int j = 0; j < numParams; j++)
+            for (int k = 0; k < BLOCK_SIZE; k++)
+                data.vecParticle2Params[j*BLOCK_SIZE+k] = atomParameters[atom][j];
+        for (int j = 0; j < numComputed; j++)
+            for (int k = 0; k < BLOCK_SIZE; k++)
+                data.vecParticle2Values[j*BLOCK_SIZE+k] = atomComputedValues[j][atom];
+
+        // Compute the distances to the block atoms.
+
+        FVEC dx, dy, dz, r2;
+        fvec4 atomPos(posq+4*atom);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
+        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, boxSize, invBoxSize);
+        const auto exclNotMask = FVEC::expandBitsToMask(~exclusions[i]);
+        const auto include = blendZero(r2 < cutoffDistanceSquared, exclNotMask);
+        if (!any(include))
+            continue; // No interactions to compute.
+
+        // Compute the interactions.
+
+        const auto inverseR = rsqrt(r2);
+        const auto r = r2*inverseR;
+        r.store(data.rvec.data());
+        FVEC dEdR(data.forceVecExpression.evaluate());
+        FVEC energy;
+        if (includeEnergy)
+            energy = FVEC(data.energyVecExpression.evaluate());
+        if (useSwitch) {
+            const auto t = blendZero((r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
+            const auto switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+            const auto switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+            dEdR = switchValue*dEdR + energy*switchDeriv;
+            energy *= switchValue;
+        }
+        dEdR *= inverseR;
+
+        // Accumulate forces and energies.
+
+        if (includeEnergy) {
+            energy = blendZero(energy, include);
+            partialEnergy += energy;
+        }
+        dEdR = blendZero(dEdR, include);
+        const auto fx = dx*dEdR;
+        const auto fy = dy*dEdR;
+        const auto fz = dz*dEdR;
+        blockAtomForceX -= fx;
+        blockAtomForceY -= fy;
+        blockAtomForceZ -= fz;
+        float* const atomForce = forces+4*atom;
+        const fvec4 newAtomForce = fvec4(atomForce) + reduceToVec3(fx, fy, fz);
+        newAtomForce.store(atomForce);
+    }
+    if (includeEnergy)
+        totalEnergy += reduceAdd(partialEnergy);
+
+    // Record the forces on the block atoms.
+
+    fvec4 f[BLOCK_SIZE];
+    transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f);
+    for (int j = 0; j < BLOCK_SIZE; j++)
+        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
+}
+
+template<typename FVEC, int BLOCK_SIZE>
+template <int PERIODIC_TYPE>
+void CpuCustomNonbondedForceFvec<FVEC, BLOCK_SIZE>::getDeltaR(const fvec4& posI, const FVEC& x, const FVEC& y, const FVEC& z, FVEC& dx, FVEC& dy, FVEC& dz, FVEC& r2, const fvec4& boxSize, const fvec4& invBoxSize) const {
+    dx = x-posI[0];
+    dy = y-posI[1];
+    dz = z-posI[2];
+    if (PERIODIC_TYPE == PeriodicTriclinic) {
+        const auto scale3 = floor(dz*recipBoxSize[2]+0.5f);
+        dx -= scale3*periodicBoxVectors[2][0];
+        dy -= scale3*periodicBoxVectors[2][1];
+        dz -= scale3*periodicBoxVectors[2][2];
+        const auto scale2 = floor(dy*recipBoxSize[1]+0.5f);
+        dx -= scale2*periodicBoxVectors[1][0];
+        dy -= scale2*periodicBoxVectors[1][1];
+        const auto scale1 = floor(dx*recipBoxSize[0]+0.5f);
+        dx -= scale1*periodicBoxVectors[0][0];
+    }
+    else if (PERIODIC_TYPE == PeriodicPerInteraction) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
+    }
+    r2 = dx*dx + dy*dy + dz*dz;
+}
+
+} // namespace OpenMM
+
+#endif // OPENMM_CPU_CUSTOM_NONBONDED_FORCE_FVEC_H__
--- a/platforms/cpu/include/CpuNonbondedForceFvec.h
+++ b/platforms/cpu/include/CpuNonbondedForceFvec.h
@@ -207,7 +207,6 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
    // Ewald needs C6 data gathered from a table. Unused variable for non-ewald.
    const FVEC C6s = (BLOCK_TYPE == BlockType::EWALD) ? FVEC(C6params, blockAtom) : FVEC();

-    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    const FVEC cutoffDistanceSquared = cutoffDistance * cutoffDistance;


--- a/platforms/cpu/sharedTarget/CMakeLists.txt
+++ b/platforms/cpu/sharedTarget/CMakeLists.txt
@@ -8,9 +8,11 @@ ENDFOREACH(file)
 IF(MSVC)
    SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX /D__AVX__")
    SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx2.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX2 /D__AVX2__")
+    SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuCustomNonbondedForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX /D__AVX__")
 ELSEIF(X86)
    SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx")
    SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx2.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx2 -mfma")
+    SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuCustomNonbondedForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx")
 ENDIF()

 ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})

--- a/platforms/cpu/src/CpuCustomNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForce.cpp
@@ -22,21 +22,21 @@
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

-#include <string.h>
-#include <sstream>
-
-#include "SimTKOpenMMUtilities.h"
-#include "ReferenceForce.h"
 #include "CpuCustomNonbondedForce.h"
+#include "openmm/internal/hardware.h"
+#include <cmath>

 using namespace OpenMM;
+using namespace Lepton;
 using namespace std;

-CpuCustomNonbondedForce::ThreadData::ThreadData(const Lepton::CompiledExpression& energyExpression, const Lepton::CompiledExpression& forceExpression,
-            const vector<string>& parameterNames, const std::vector<Lepton::CompiledExpression> energyParamDerivExpressions,
-            const vector<string>& computedValueNames, const vector<Lepton::CompiledExpression> computedValueExpressions,
+CpuCustomNonbondedForce::ThreadData::ThreadData(const CompiledExpression& energyExpression, const CompiledVectorExpression& energyVecExpression,
+            const CompiledExpression& forceExpression, const CompiledVectorExpression& forceVecExpression,
+            const vector<string>& parameterNames, const std::vector<CompiledExpression> energyParamDerivExpressions,
+            const vector<string>& computedValueNames, const vector<CompiledExpression> computedValueExpressions,
            vector<vector<double> >& atomComputedValues) :
-            energyExpression(energyExpression), forceExpression(forceExpression), energyParamDerivExpressions(energyParamDerivExpressions),
+            energyExpression(energyExpression), energyVecExpression(energyVecExpression), forceExpression(forceExpression),
+            forceVecExpression(forceVecExpression), energyParamDerivExpressions(energyParamDerivExpressions),
            computedValueExpressions(computedValueExpressions), atomComputedValues(atomComputedValues) {
    // Prepare for passing variables to expressions.

@@ -44,19 +44,13 @@ CpuCustomNonbondedForce::ThreadData::ThreadData(const Lepton::CompiledExpression
    variableLocations["r"] = &r;
    particleParam.resize(2*parameterNames.size());
    computedValues.resize(2*computedValueNames.size());
-    for (int i = 0; i < (int) parameterNames.size(); i++) {
-        for (int j = 0; j < 2; j++) {
-            stringstream name;
-            name << parameterNames[i] << (j+1);
-            variableLocations[name.str()] = &particleParam[i*2+j];
-        }
+    for (int i = 0; i < parameterNames.size(); i++) {
+        variableLocations[parameterNames[i]+"1"] = &particleParam[i*2];
+        variableLocations[parameterNames[i]+"2"] = &particleParam[i*2+1];
    }
-    for (int i = 0; i < (int) computedValueNames.size(); i++) {
-        for (int j = 0; j < 2; j++) {
-            stringstream name;
-            name << computedValueNames[i] << (j+1);
-            variableLocations[name.str()] = &computedValues[i*2+j];
-        }
+    for (int i = 0; i < computedValueNames.size(); i++) {
+        variableLocations[computedValueNames[i]+"1"] = &computedValues[i*2];
+        variableLocations[computedValueNames[i]+"2"] = &computedValues[i*2+1];
    }
    energyParamDerivs.resize(energyParamDerivExpressions.size());
    this->energyExpression.setVariableLocations(variableLocations);
@@ -68,6 +62,27 @@ CpuCustomNonbondedForce::ThreadData::ThreadData(const Lepton::CompiledExpression
        expressionSet.registerExpression(expression);
    }

+    // Prepare for passing variables to vectorized expressions.
+
+    map<string, float*> vecVariableLocations;
+    int blockSize = getVectorWidth();
+    rvec.resize(blockSize);
+    vecParticle1Params.resize(blockSize*parameterNames.size());
+    vecParticle2Params.resize(blockSize*parameterNames.size());
+    vecParticle1Values.resize(blockSize*computedValueNames.size());
+    vecParticle2Values.resize(blockSize*computedValueNames.size());
+    vecVariableLocations["r"] = rvec.data();
+    for (int i = 0; i < parameterNames.size(); i++) {
+        vecVariableLocations[parameterNames[i]+"1"] = &vecParticle1Params[i*blockSize];
+        vecVariableLocations[parameterNames[i]+"2"] = &vecParticle2Params[i*blockSize];
+    }
+    for (int i = 0; i < computedValueNames.size(); i++) {
+        vecVariableLocations[computedValueNames[i]+"1"] = &vecParticle1Values[i*blockSize];
+        vecVariableLocations[computedValueNames[i]+"2"] = &vecParticle2Values[i*blockSize];
+    }
+    this->energyVecExpression.setVariableLocations(vecVariableLocations);
+    this->forceVecExpression.setVariableLocations(vecVariableLocations);
+
    // Prepare for passing variables to the computed value expressions.

    map<string, double*> valueVariableLocations;
@@ -79,14 +94,28 @@ CpuCustomNonbondedForce::ThreadData::ThreadData(const Lepton::CompiledExpression
    }
 }

-CpuCustomNonbondedForce::CpuCustomNonbondedForce(const Lepton::CompiledExpression& energyExpression,
-            const Lepton::CompiledExpression& forceExpression, const vector<string>& parameterNames, const vector<set<int> >& exclusions,
-            const vector<Lepton::CompiledExpression> energyParamDerivExpressions, const vector<string>& computedValueNames,
-            const vector<Lepton::CompiledExpression> computedValueExpressions, ThreadPool& threads) :
-            cutoff(false), useSwitch(false), periodic(false), useInteractionGroups(false), paramNames(parameterNames), exclusions(exclusions),
-            computedValueNames(computedValueNames), threads(threads) {
+CpuCustomNonbondedForce::CpuCustomNonbondedForce(ThreadPool& threads) : cutoff(false), useSwitch(false), periodic(false), useInteractionGroups(false), threads(threads) {
+}
+
+void CpuCustomNonbondedForce::initialize(const ParsedExpression& energyExpression,
+            const ParsedExpression& forceExpression, const vector<string>& parameterNames, const vector<set<int> >& exclusions,
+            const vector<ParsedExpression> energyParamDerivExpressions, const vector<string>& computedValueNames,
+            const vector<ParsedExpression> computedValueExpressions) {
+    this->paramNames = parameterNames;
+    this->exclusions = exclusions;
+    this->computedValueNames = computedValueNames;
+    CompiledExpression compiledEnergyExpression = energyExpression.createCompiledExpression();
+    CompiledExpression compiledForceExpression = forceExpression.createCompiledExpression();
+    CompiledVectorExpression energyVecExpression = energyExpression.createCompiledVectorExpression(getVectorWidth());
+    CompiledVectorExpression forceVecExpression = forceExpression.createCompiledVectorExpression(getVectorWidth());
+    vector<CompiledExpression> compiledDerivExpressions, compiledValueExpressions;
+    for (auto& exp : energyParamDerivExpressions)
+        compiledDerivExpressions.push_back(exp.createCompiledExpression());
+    for (auto& exp : computedValueExpressions)
+        compiledValueExpressions.push_back(exp.createCompiledExpression());
    for (int i = 0; i < threads.getNumThreads(); i++)
-        threadData.push_back(new ThreadData(energyExpression, forceExpression, parameterNames, energyParamDerivExpressions, computedValueNames, computedValueExpressions, atomComputedValues));
+        threadData.push_back(new ThreadData(compiledEnergyExpression, energyVecExpression, compiledForceExpression, forceVecExpression, parameterNames,
+                compiledDerivExpressions, computedValueNames, compiledValueExpressions, atomComputedValues));
 }

 CpuCustomNonbondedForce::~CpuCustomNonbondedForce() {
@@ -186,9 +215,27 @@ void CpuCustomNonbondedForce::calculatePairIxn(int numberOfAtoms, float* posq, v

 void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
    int numThreads = threads.getNumThreads();
+    int blockSize = getVectorWidth();
    ThreadData& data = *threadData[threadIndex];
-    for (auto& param : *globalParameters)
+    for (auto& param : *globalParameters) {
        data.expressionSet.setVariable(data.expressionSet.getVariableIndex(param.first), param.second);
+        try {
+            float* p = data.energyVecExpression.getVariablePointer(param.first);
+            for (int i = 0; i < blockSize; i++)
+                p[i] = param.second;
+        }
+        catch (...) {
+            // The expression doesn't use this parameter.
+        }
+        try {
+            float* p = data.forceVecExpression.getVariablePointer(param.first);
+            for (int i = 0; i < blockSize; i++)
+                p[i] = param.second;
+        }
+        catch (...) {
+            // The expression doesn't use this parameter.
+        }
+    }

    // Process computed values for this thread's subset of interactions.

@@ -244,20 +291,24 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
            const int32_t* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
            const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
            const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
-            for (int i = 0; i < (int) neighbors.size(); i++) {
-                int first = neighbors[i];
-                for (int j = 0; j < paramNames.size(); j++)
-                    data.particleParam[j*2] = atomParameters[first][j];
-                for (int j = 0; j < computedValueNames.size(); j++)
-                    data.computedValues[j*2] = atomComputedValues[j][first];
-                for (int k = 0; k < blockSize; k++) {
-                    if ((exclusions[i] & (1<<k)) == 0) {
-                        int second = blockAtom[k];
-                        for (int j = 0; j < paramNames.size(); j++)
-                            data.particleParam[j*2+1] = atomParameters[second][j];
-                        for (int j = 0; j < computedValueNames.size(); j++)
-                            data.computedValues[j*2+1] = atomComputedValues[j][second];
-                        calculateOneIxn(first, second, data, forces, energy, boxSize, invBoxSize);
+            if (data.energyParamDerivs.size() == 0)
+                calculateBlockIxn(data, blockIndex, forces, energy, boxSize, invBoxSize);
+            else {
+                for (int i = 0; i < (int) neighbors.size(); i++) {
+                    int first = neighbors[i];
+                    for (int j = 0; j < paramNames.size(); j++)
+                        data.particleParam[j*2] = atomParameters[first][j];
+                    for (int j = 0; j < computedValueNames.size(); j++)
+                        data.computedValues[j*2] = atomComputedValues[j][first];
+                    for (int k = 0; k < blockSize; k++) {
+                        if ((exclusions[i] & (1<<k)) == 0) {
+                            int second = blockAtom[k];
+                            for (int j = 0; j < paramNames.size(); j++)
+                                data.particleParam[j*2+1] = atomParameters[second][j];
+                            for (int j = 0; j < computedValueNames.size(); j++)
+                                data.computedValues[j*2+1] = atomComputedValues[j][second];
+                            calculateOneIxn(first, second, data, forces, energy, boxSize, invBoxSize);
+                        }
                    }
                }
            }

--- a/platforms/cpu/src/CpuCustomNonbondedForceAvx.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForceAvx.cpp
+
+/* Portions copyright (c) 2022 Stanford University and Simbios.
+ * Contributors: Peter Eastman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "CpuCustomNonbondedForceFvec.h"
+#include "openmm/OpenMMException.h"
+
+#ifdef __AVX__
+#include "openmm/internal/vectorizeAvx.h"
+
+OpenMM::CpuCustomNonbondedForce* createCpuCustomNonbondedForceAvx(OpenMM::ThreadPool& threads) {
+    return new OpenMM::CpuCustomNonbondedForceFvec<fvec8, 8>(threads);
+}
+
+#else
+OpenMM::CpuCustomNonbondedForce* createCpuCustomNonbondedForceAvx(OpenMM::ThreadPool& threads) {
+   throw OpenMM::OpenMMException("Internal error: OpenMM was compiled without AVX support");
+}
+#endif
--- a/platforms/cpu/src/CpuCustomNonbondedForceFvec.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForceFvec.cpp
+
+/* Portions copyright (c) 2022 Stanford University and Simbios.
+ * Contributors: Peter Eastman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "CpuCustomNonbondedForceFvec.h"
+#include "openmm/internal/hardware.h"
+
+OpenMM::CpuCustomNonbondedForce* createCpuCustomNonbondedForceVec4(OpenMM::ThreadPool& threads);
+OpenMM::CpuCustomNonbondedForce* createCpuCustomNonbondedForceAvx(OpenMM::ThreadPool& threads);
+
+OpenMM::CpuCustomNonbondedForce* OpenMM::createCpuCustomNonbondedForce(OpenMM::ThreadPool& threads) {
+    if (isAvxSupported())
+        return createCpuCustomNonbondedForceAvx(threads);
+    else
+        return createCpuCustomNonbondedForceVec4(threads);
+}
--- a/platforms/cpu/src/CpuCustomNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForceVec4.cpp
+/* Portions copyright (c) 2022 Stanford University and Simbios.
+ * Contributors: Peter Eastman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "CpuCustomNonbondedForceFvec.h"
+
+OpenMM::CpuCustomNonbondedForce* createCpuCustomNonbondedForceVec4(OpenMM::ThreadPool& threads) {
+    return new OpenMM::CpuCustomNonbondedForceFvec<fvec4, 4>(threads);
+}
--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -921,9 +921,8 @@ void CpuCalcCustomNonbondedForceKernel::createInteraction(const CustomNonbondedF

    // Parse the various expressions used to calculate the force.

-    Lepton::ParsedExpression expression = Lepton::Parser::parse(force.getEnergyFunction(), functions).optimize();
-    Lepton::CompiledExpression energyExpression = expression.createCompiledExpression();
-    Lepton::CompiledExpression forceExpression = expression.differentiate("r").createCompiledExpression();
+    Lepton::ParsedExpression energyExpression = Lepton::Parser::parse(force.getEnergyFunction(), functions).optimize();
+    Lepton::ParsedExpression forceExpression = energyExpression.differentiate("r");
    for (int i = 0; i < force.getNumPerParticleParameters(); i++)
        parameterNames.push_back(force.getPerParticleParameterName(i));
    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
@@ -940,25 +939,25 @@ void CpuCalcCustomNonbondedForceKernel::createInteraction(const CustomNonbondedF
    }
    particleVariables.insert(globalParameterNames.begin(), globalParameterNames.end());
    pairVariables.insert(globalParameterNames.begin(), globalParameterNames.end());
-    vector<Lepton::CompiledExpression> computedValueExpressions, energyParamDerivExpressions;
+    vector<Lepton::ParsedExpression> computedValueExpressions, energyParamDerivExpressions;
    for (int i = 0; i < force.getNumComputedValues(); i++) {
        string name, exp;
        force.getComputedValueParameters(i, name, exp);
        Lepton::ParsedExpression parsed = Lepton::Parser::parse(exp, functions);
        validateVariables(parsed.getRootNode(), particleVariables);
        computedValueNames.push_back(name);
-        computedValueExpressions.push_back(parsed.createCompiledExpression());
+        computedValueExpressions.push_back(parsed);
    }
    for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
        string param = force.getEnergyParameterDerivativeName(i);
        energyParamDerivNames.push_back(param);
-        energyParamDerivExpressions.push_back(expression.differentiate(param).createCompiledExpression());
+        energyParamDerivExpressions.push_back(energyExpression.differentiate(param));
    }
    for (auto& name : computedValueNames) {
        pairVariables.insert(name+"1");
        pairVariables.insert(name+"2");
    }
-    validateVariables(expression.getRootNode(), pairVariables);
+    validateVariables(energyExpression.getRootNode(), pairVariables);

    // Delete the custom functions.

@@ -967,8 +966,9 @@ void CpuCalcCustomNonbondedForceKernel::createInteraction(const CustomNonbondedF

    // Create the object that computes the interaction.

-    nonbonded = new CpuCustomNonbondedForce(energyExpression, forceExpression, parameterNames, exclusions, energyParamDerivExpressions,
-            computedValueNames, computedValueExpressions, data.threads);
+    nonbonded = createCpuCustomNonbondedForce(data.threads);
+    nonbonded->initialize(energyExpression, forceExpression, parameterNames, exclusions, energyParamDerivExpressions,
+            computedValueNames, computedValueExpressions);
    if (interactionGroups.size() > 0)
        nonbonded->setInteractionGroups(interactionGroups);
 }

--- a/platforms/cpu/src/CpuNonbondedForceAvx.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceAvx.cpp
@@ -26,29 +26,13 @@
 #include "openmm/OpenMMException.h"

 #ifdef __AVX__
-
 #include "openmm/internal/vectorizeAvx.h"

-bool isAvxSupported() {
-    // Make sure the CPU supports AVX.
-    int cpuInfo[4];
-    cpuid(cpuInfo, 0);
-    if (cpuInfo[0] >= 1) {
-        cpuid(cpuInfo, 1);
-        return ((cpuInfo[2] & ((int) 1 << 28)) != 0);
-    }
-    return false;
-}
-
 OpenMM::CpuNonbondedForce* createCpuNonbondedForceAvx() {
    return new OpenMM::CpuNonbondedForceFvec<fvec8>();
 }

 #else
-bool isAvxSupported() {
-    return false;
-}
-
 OpenMM::CpuNonbondedForce* createCpuNonbondedForceAvx() {
   throw OpenMM::OpenMMException("Internal error: OpenMM was compiled without AVX support");
 }

--- a/platforms/cpu/src/CpuNonbondedForceFvec.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceFvec.cpp

-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
- * Contributors: Daniel Towner
+/* Portions copyright (c) 2006-2022 Stanford University and Simbios.
+ * Contributors: Daniel Towner, Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
@@ -23,12 +23,12 @@
 */

 #include "CpuNonbondedForceFvec.h"
+#include "openmm/internal/hardware.h"

 OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec4();
 OpenMM::CpuNonbondedForce* createCpuNonbondedForceAvx();
 OpenMM::CpuNonbondedForce* createCpuNonbondedForceAvx2();

-bool isAvxSupported();
 bool isAvx2Supported();

 #include <iostream>
@@ -41,10 +41,3 @@ OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec() {
    else
        return createCpuNonbondedForceVec4();
 }
-
-int getVecBlockSize() {
-    if (isAvx2Supported() || isAvxSupported())
-        return 8;
-    else
-        return 4;
-}
--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -165,14 +165,9 @@ CpuPlatform::PlatformData::~PlatformData() {
        delete neighborList;
 }

-/**
- * Return how much vectorisation is supported for host platform.
- */
-int getVecBlockSize();
-
 void CpuPlatform::PlatformData::requestNeighborList(double cutoffDistance, double padding, bool useExclusions, const vector<set<int> >& exclusionList) {
    if (neighborList == NULL)
-        neighborList = new CpuNeighborList(getVecBlockSize());
+        neighborList = new CpuNeighborList(getVectorWidth());
    if (cutoffDistance > cutoff)
        cutoff = cutoffDistance;
    if (cutoffDistance+padding > paddedCutoff)