Automatically select whether to use the SSE or AVX version of CpuNonbondedForce

d4af8747 · peastman · 5cd23acb · d4af8747 · d4af8747 · d4af8747
Commit d4af8747 authored Dec 18, 2013 by peastman
9 changed files
--- a/platforms/cpu/include/CpuKernels.h
+++ b/platforms/cpu/include/CpuKernels.h
@@ -88,9 +88,7 @@ private:
 */
 class CpuCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
-    CpuCalcNonbondedForceKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
-            data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false) {
-    }
+    CpuCalcNonbondedForceKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data);
    ~CpuCalcNonbondedForceKernel();
    /**
     * Initialize the kernel.
@@ -130,8 +128,8 @@ private:
    std::vector<std::pair<float, float> > particleParams;
    std::vector<RealVec> lastPositions;
    NonbondedMethod nonbondedMethod;
-    CpuNeighborList neighborList;
-    CpuNonbondedForce nonbonded;
+    CpuNeighborList* neighborList;
+    CpuNonbondedForce* nonbonded;
    Kernel optimizedPme;
 };


--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -49,6 +49,12 @@ class CpuNonbondedForce {

       CpuNonbondedForce();
       
+        /**
+         * Virtual destructor.
+         */
+
+        virtual ~CpuNonbondedForce();
+        
      /**---------------------------------------------------------------------------------------
      
         Set the force to use a cutoff.
@@ -151,7 +157,7 @@ class CpuNonbondedForce {
     */
    void threadComputeDirect(ThreadPool& threads, int threadIndex);

-private:
+protected:
        bool cutoff;
        bool useSwitch;
        bool periodic;
@@ -204,7 +210,7 @@ private:
            
         --------------------------------------------------------------------------------------- */
          
-      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      virtual void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) = 0;
            
      /**---------------------------------------------------------------------------------------
      
@@ -216,7 +222,7 @@ private:
            
         --------------------------------------------------------------------------------------- */
          
-      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      virtual void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) = 0;

      /**
       * Compute the displacement and squared distance between two points, optionally using
@@ -224,26 +230,15 @@ private:
       */
      void getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;

-      /**
-       * Compute the displacement and squared distance between a collection of points, optionally using
-       * periodic boundary conditions.
-       */
-      void getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
-
-      /**
-       * Compute a fast approximation to erfc(x).
-       */
-      static fvec4 erfcApprox(fvec4 x);
-
      /**
       * Create a lookup table for the scale factor used with Ewald and PME.
       */
      void tabulateEwaldScaleFactor();

      /**
-       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
+       * Compute a fast approximation to erfc(x).
       */
-      fvec4 ewaldScaleFunction(fvec4 x);
+      static float erfcApprox(float x);
 };

 } // namespace OpenMM

--- a/platforms/cpu/include/CpuNonbondedForceVec4.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec4.h
+
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
+#define OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
+
+#include "CpuNonbondedForce.h"
+// ---------------------------------------------------------------------------------------
+
+namespace OpenMM {
+
+class CpuNonbondedForceVec4 : public CpuNonbondedForce {
+public:
+      /**---------------------------------------------------------------------------------------
+      
+         Constructor.  Creates the CpuNonbondedForce implementation whose block routines are
+         declared in terms of fvec4 values.
+      
+         --------------------------------------------------------------------------------------- */
+
+       CpuNonbondedForceVec4();
+
+protected:
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate all the interactions for one atom block.  Implements the pure virtual
+         method of the same name declared by CpuNonbondedForce.
+      
+         @param blockIndex       the index of the atom block
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy (NOTE(review): presumably may be NULL when the
+                                 energy is not needed, as in the implementation this replaces;
+                                 confirm against the .cpp)
+         @param boxSize          the periodic box size (presumably the X, Y, and Z widths)
+         @param invBoxSize       presumably the reciprocal of boxSize; confirm against caller
+            
+         --------------------------------------------------------------------------------------- */
+          
+      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+            
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate all the interactions for one atom block.  Implements the pure virtual
+         method of the same name declared by CpuNonbondedForce.  Judging by its name, this is
+         the variant used with Ewald/PME electrostatics; confirm against the .cpp.
+      
+         @param blockIndex       the index of the atom block
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy (NOTE(review): presumably may be NULL when the
+                                 energy is not needed, as in the implementation this replaces;
+                                 confirm against the .cpp)
+         @param boxSize          the periodic box size (presumably the X, Y, and Z widths)
+         @param invBoxSize       presumably the reciprocal of boxSize; confirm against caller
+            
+         --------------------------------------------------------------------------------------- */
+          
+      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+
+      /**
+       * Compute the displacement and squared distance between a collection of points, optionally using
+       * periodic boundary conditions.
+       *
+       * NOTE(review): in the base-class version this declaration was moved from, posI pointed to
+       * the coordinates of a single point, x/y/z held the coordinates of the collection, and
+       * dx/dy/dz/r2 received the results; presumably unchanged here, but verify against the .cpp.
+       */
+      void getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+
+      /**
+       * Compute a fast approximation to erfc(x), taking and returning an fvec4.
+       */
+      static fvec4 erfcApprox(fvec4 x);
+      
+      /**
+       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
+       * The argument and the result are fvec4 values.
+       */
+      fvec4 ewaldScaleFunction(fvec4 x);
+};
+
+} // namespace OpenMM
+
+// ---------------------------------------------------------------------------------------
+
+#endif // OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
--- a/platforms/cpu/include/CpuNonbondedForceVec8.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec8.h
@@ -22,178 +22,23 @@
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

-#ifndef OPENMM_CPU_NONBONDED_FORCE_H__
-#define OPENMM_CPU_NONBONDED_FORCE_H__
+#ifndef OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
+#define OPENMM_CPU_NONBONDED_FORCE_VEC8_H__

-#include "AlignedArray.h"
-#include "CpuNeighborList.h"
-#include "ReferencePairIxn.h"
-#include "openmm/internal/ThreadPool.h"
+#ifdef __AVX__
+
+#include "CpuNonbondedForce.h"
 #include "openmm/internal/vectorize8.h"
-#include <set>
-#include <utility>
-#include <vector>
+
 // ---------------------------------------------------------------------------------------

 namespace OpenMM {

-class CpuNonbondedForceVec8 {
-    public:
-        class ComputeDirectTask;
-
-      /**---------------------------------------------------------------------------------------
-      
-         Constructor
-      
-         --------------------------------------------------------------------------------------- */
-
+class CpuNonbondedForceVec8 : public CpuNonbondedForce {
+public:
       CpuNonbondedForceVec8();

-      /**---------------------------------------------------------------------------------------
-      
-         Set the force to use a cutoff.
-      
-         @param distance            the cutoff distance
-         @param neighbors           the neighbor list to use
-         @param solventDielectric   the dielectric constant of the bulk solvent
-      
-         --------------------------------------------------------------------------------------- */
-      
-      void setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric);
-
-      /**---------------------------------------------------------------------------------------
-      
-         Set the force to use a switching function on the Lennard-Jones interaction.
-      
-         @param distance            the switching distance
-      
-         --------------------------------------------------------------------------------------- */
-      
-      void setUseSwitchingFunction(float distance);
-      
-      /**---------------------------------------------------------------------------------------
-      
-         Set the force to use periodic boundary conditions.  This requires that a cutoff has
-         already been set, and the smallest side of the periodic box is at least twice the cutoff
-         distance.
-      
-         @param boxSize             the X, Y, and Z widths of the periodic box
-      
-         --------------------------------------------------------------------------------------- */
-      
-      void setPeriodic(float* periodicBoxSize);
-       
-      /**---------------------------------------------------------------------------------------
-      
-         Set the force to use Ewald summation.
-      
-         @param alpha  the Ewald separation parameter
-         @param kmaxx  the largest wave vector in the x direction
-         @param kmaxy  the largest wave vector in the y direction
-         @param kmaxz  the largest wave vector in the z direction
-      
-         --------------------------------------------------------------------------------------- */
-      
-      void setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz);
-
-     
-      /**---------------------------------------------------------------------------------------
-      
-         Set the force to use Particle-Mesh Ewald (PME) summation.
-      
-         @param alpha    the Ewald separation parameter
-         @param gridSize the dimensions of the mesh
-      
-         --------------------------------------------------------------------------------------- */
-      
-      void setUsePME(float alpha, int meshSize[3]);
-
-      /**---------------------------------------------------------------------------------------
-      
-         Calculate Ewald ixn
-      
-         @param numberOfAtoms    number of atoms
-         @param posq             atom coordinates and charges
-         @param atomCoordinates  atom coordinates (in format needed by PME)
-         @param atomParameters   atom parameters (sigma/2, 2*sqrt(epsilon))
-         @param exclusions       atom exclusion indices
-                                 exclusions[atomIndex] contains the list of exclusions for that atom
-         @param forces           force array (forces added)
-         @param totalEnergy      total energy
-            
-         --------------------------------------------------------------------------------------- */
-          
-      void calculateReciprocalIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates,
-                            const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
-                            std::vector<RealVec>& forces, double* totalEnergy) const;
-      
-      /**---------------------------------------------------------------------------------------
-      
-         Calculate LJ Coulomb pair ixn
-      
-         @param numberOfAtoms    number of atoms
-         @param posq             atom coordinates and charges
-         @param atomCoordinates  atom coordinates (periodic boundary conditions not applied)
-         @param atomParameters   atom parameters (sigma/2, 2*sqrt(epsilon))
-         @param exclusions       atom exclusion indices
-                                 exclusions[atomIndex] contains the list of exclusions for that atom
-         @param forces           force array (forces added)
-         @param totalEnergy      total energy
-         @param threads          the thread pool to use
-      
-         --------------------------------------------------------------------------------------- */
-          
-      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
-            const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
-
-    /**
-     * This routine contains the code executed by each thread.
-     */
-    void threadComputeDirect(ThreadPool& threads, int threadIndex);
-
-private:
-        bool cutoff;
-        bool useSwitch;
-        bool periodic;
-        bool ewald;
-        bool pme;
-        bool tableIsValid;
-        const CpuNeighborList* neighborList;
-        float periodicBoxSize[3];
-        float cutoffDistance, switchingDistance;
-        float krf, crf;
-        float alphaEwald;
-        int numRx, numRy, numRz;
-        int meshDim[3];
-        std::vector<float> ewaldScaleTable;
-        float ewaldDX, ewaldDXInv;
-        std::vector<double> threadEnergy;
-        // The following variables are used to make information accessible to the individual threads.
-        int numberOfAtoms;
-        float* posq;
-        RealVec const* atomCoordinates;
-        std::pair<float, float> const* atomParameters;        
-        std::set<int> const* exclusions;
-        std::vector<AlignedArray<float> >* threadForce;
-        bool includeEnergy;
-        void* atomicCounter;
-
-        static const float TWO_OVER_SQRT_PI;
-        static const int NUM_TABLE_POINTS;
-            
-      /**---------------------------------------------------------------------------------------
-      
-         Calculate LJ Coulomb pair ixn between two atoms
-      
-         @param atom1            the index of the first atom
-         @param atom2            the index of the second atom
-         @param forces           force array (forces added)
-         @param totalEnergy      total energy
-            
-         --------------------------------------------------------------------------------------- */
-          
-      void calculateOneIxn(int atom1, int atom2, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
-            
+protected:            
      /**---------------------------------------------------------------------------------------
      
         Calculate all the interactions for one atom block.
@@ -218,12 +63,6 @@ private:
          
      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);

-      /**
-       * Compute the displacement and squared distance between two points, optionally using
-       * periodic boundary conditions.
-       */
-      void getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
-
      /**
       * Compute the displacement and squared distance between a collection of points, optionally using
       * periodic boundary conditions.
@@ -235,11 +74,6 @@ private:
       */
      static fvec8 erfcApprox(fvec8 x);
      
-      /**
-       * Create a lookup table for the scale factor used with Ewald and PME.
-       */
-      void tabulateEwaldScaleFactor();
-      
      /**
       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
       */
@@ -250,4 +84,6 @@ private:

 // ---------------------------------------------------------------------------------------

-#endif // OPENMM_CPU_NONBONDED_FORCE_H__
+#endif // __AVX__
+
+#endif // OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
--- a/platforms/cpu/sharedTarget/CMakeLists.txt
+++ b/platforms/cpu/sharedTarget/CMakeLists.txt
-SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
+FOREACH(file ${SOURCE_FILES})
+    IF (file MATCHES ".*Vec8.*")
+        SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1 -mavx")
+    ENDIF (file MATCHES ".*Vec8.*")
+ENDFOREACH(file)
 ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})

 IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
@@ -7,6 +11,6 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
 ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB})
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CPU_BUILDING_SHARED_LIBRARY")
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1 -DOPENMM_CPU_BUILDING_SHARED_LIBRARY")

 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -145,6 +145,22 @@ private:
    int numParticles;
 };

+bool isVec8Supported();
+CpuNonbondedForce* createCpuNonbondedForceVec4();
+CpuNonbondedForce* createCpuNonbondedForceVec8();
+
+/**
+ * Create the kernel and select which CpuNonbondedForce implementation to use.
+ *
+ * If isVec8Supported() returns true, the Vec8 implementation is created along with a
+ * CpuNeighborList constructed with 8; otherwise the Vec4 implementation and a
+ * CpuNeighborList constructed with 4 are used.  Both objects are allocated here and
+ * are deleted by the destructor.
+ *
+ * @param name      the kernel name, passed through to the CalcNonbondedForceKernel constructor
+ * @param platform  the Platform, passed through to the CalcNonbondedForceKernel constructor
+ * @param data      the platform data, stored in the data member
+ */
+CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
+        data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), neighborList(NULL), nonbonded(NULL) {
+    // isVec8Supported must be *called* here.  The bare function name converts to a
+    // non-null function pointer, which always evaluates to true and would select the
+    // Vec8 implementation unconditionally.
+    if (isVec8Supported()) {
+        neighborList = new CpuNeighborList(8);
+        nonbonded = createCpuNonbondedForceVec8();
+    }
+    else {
+        neighborList = new CpuNeighborList(4);
+        nonbonded = createCpuNonbondedForceVec4();
+    }
+}
+
 CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {
    if (bonded14ParamArray != NULL) {
        for (int i = 0; i < num14; i++) {
@@ -154,6 +170,10 @@ CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {
        delete bonded14IndexArray;
        delete bonded14ParamArray;
    }
+    if (nonbonded != NULL)
+        delete nonbonded;
+    if (neighborList != NULL)
+        delete neighborList;
 }

 void CpuCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
@@ -305,26 +325,26 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
                }
        }
        if (needRecompute) {
-            neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, data.isPeriodic, nonbondedCutoff+padding, data.threads);
+            neighborList->computeNeighborList(numParticles, posq, exclusions, floatBoxSize, data.isPeriodic, nonbondedCutoff+padding, data.threads);
            lastPositions = posData;
        }
-        nonbonded.setUseCutoff(nonbondedCutoff, neighborList, rfDielectric);
+        nonbonded->setUseCutoff(nonbondedCutoff, *neighborList, rfDielectric);
    }
    if (data.isPeriodic) {
        double minAllowedSize = 1.999999*nonbondedCutoff;
        if (boxSize[0] < minAllowedSize || boxSize[1] < minAllowedSize || boxSize[2] < minAllowedSize)
            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
-        nonbonded.setPeriodic(floatBoxSize);
+        nonbonded->setPeriodic(floatBoxSize);
    }
    if (ewald)
-        nonbonded.setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
+        nonbonded->setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
    if (pme)
-        nonbonded.setUsePME(ewaldAlpha, gridSize);
+        nonbonded->setUsePME(ewaldAlpha, gridSize);
    if (useSwitchingFunction)
-        nonbonded.setUseSwitchingFunction(switchingDistance);
+        nonbonded->setUseSwitchingFunction(switchingDistance);
    double nonbondedEnergy = 0;
    if (includeDirect)
-        nonbonded.calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
+        nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
    if (includeReciprocal) {
        if (useOptimizedPme) {
            PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
@@ -333,7 +353,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
        }
        else
-            nonbonded.calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
+            nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
    }
    energy += nonbondedEnergy;
    if (includeDirect) {

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -29,7 +29,6 @@
 #include "CpuNonbondedForce.h"
 #include "ReferenceForce.h"
 #include "ReferencePME.h"
-#include "openmm/internal/vectorize.h"
 #include "gmx_atomic.h"

 // In case we're using some primitive version of Visual Studio this will
@@ -61,6 +60,9 @@ public:
 CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false) {
 }

+CpuNonbondedForce::~CpuNonbondedForce() {
+}
+
 /**---------------------------------------------------------------------------------------

   Set the force to use a cutoff.
@@ -356,7 +358,7 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
                    float inverseR = 1/r;
                    float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
                    float alphaR = alphaEwald*r;
-                    float erfcAlphaR = erfcApprox(alphaR)[0];
+                    float erfcAlphaR = erfcApprox(alphaR);
                    float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
                    dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
                    fvec4 result = deltaR*dEdR;
@@ -446,222 +448,6 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
    (fvec4(forces+4*jj)-result).store(forces+4*jj);
  }

-void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    // Load the positions and parameters of the atoms in the block.
-    
-    int blockAtom[4];
-    fvec4 blockAtomPosq[4];
-    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
-    for (int i = 0; i < 4; i++) {
-        blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
-        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
-    }
-    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
-    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
-    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
-    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
-    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
-    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
-            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
-    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
-    
-    // Loop over neighbors for this block.
-    
-    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    for (int i = 0; i < (int) neighbors.size(); i++) {
-        // Load the next neighbor.
-        
-        int atom = neighbors[i];
-        
-        // Compute the distances to the block atoms.
-        
-        fvec4 dx, dy, dz, r2;
-        getDeltaR(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
-        ivec4 include;
-        char excl = exclusions[i];
-        if (excl == 0)
-            include = -1;
-        else
-            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
-        include = include & (r2 < cutoffDistance*cutoffDistance);
-        if (!any(include))
-            continue; // No interactions to compute.
-        
-        // Compute the interactions.
-        
-        fvec4 r = sqrt(r2);
-        fvec4 inverseR = fvec4(1.0f)/r;
-        fvec4 energy, dEdR;
-        float atomEpsilon = atomParameters[atom].second;
-        if (atomEpsilon != 0.0f) {
-            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
-            fvec4 sig2 = inverseR*sig;
-            sig2 *= sig2;
-            fvec4 sig6 = sig2*sig2*sig2;
-            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
-            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
-            energy = epsSig6*(sig6-1.0f);
-            if (useSwitch) {
-                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
-                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
-                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
-                dEdR = switchValue*dEdR - energy*switchDeriv*r;
-                energy *= switchValue;
-            }
-        }
-        else {
-            energy = 0.0f;
-            dEdR = 0.0f;
-        }
-        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
-        if (cutoff)
-            dEdR += chargeProd*(inverseR-2.0f*krf*r2);
-        else
-            dEdR += chargeProd*inverseR;
-        dEdR *= inverseR*inverseR;
-
-        // Accumulate energies.
-
-        fvec4 one(1.0f);
-        if (totalEnergy) {
-            if (cutoff)
-                energy += chargeProd*(inverseR+krf*r2-crf);
-            else
-                energy += chargeProd*inverseR;
-            energy = blend(0.0f, energy, include);
-            *totalEnergy += dot4(energy, one);
-        }
-
-        // Accumulate forces.
-
-        dEdR = blend(0.0f, dEdR, include);
-        fvec4 fx = dx*dEdR;
-        fvec4 fy = dy*dEdR;
-        fvec4 fz = dz*dEdR;
-        blockAtomForceX += fx;
-        blockAtomForceY += fy;
-        blockAtomForceZ += fz;
-        float* atomForce = forces+4*atom;
-        atomForce[0] -= dot4(fx, one);
-        atomForce[1] -= dot4(fy, one);
-        atomForce[2] -= dot4(fz, one);
-    }
-    
-    // Record the forces on the block atoms.
-
-    fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
-    transpose(f[0], f[1], f[2], f[3]);
-    for (int j = 0; j < 4; j++)
-        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
-  }
-
-void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    // Load the positions and parameters of the atoms in the block.
-    
-    int blockAtom[4];
-    fvec4 blockAtomPosq[4];
-    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
-    for (int i = 0; i < 4; i++) {
-        blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
-        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
-    }
-    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
-    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
-    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
-    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
-    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
-    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
-            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
-    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
-    
-    // Loop over neighbors for this block.
-    
-    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    for (int i = 0; i < (int) neighbors.size(); i++) {
-        // Load the next neighbor.
-        
-        int atom = neighbors[i];
-        
-        // Compute the distances to the block atoms.
-        
-        fvec4 dx, dy, dz, r2;
-        getDeltaR(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
-        ivec4 include;
-        char excl = exclusions[i];
-        if (excl == 0)
-            include = -1;
-        else
-            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
-        include = include & (r2 < cutoffDistance*cutoffDistance);
-        if (!any(include))
-            continue; // No interactions to compute.
-        
-        // Compute the interactions.
-        
-        fvec4 r = sqrt(r2);
-        fvec4 inverseR = fvec4(1.0f)/r;
-        fvec4 energy, dEdR;
-        float atomEpsilon = atomParameters[atom].second;
-        if (atomEpsilon != 0.0f) {
-            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
-            fvec4 sig2 = inverseR*sig;
-            sig2 *= sig2;
-            fvec4 sig6 = sig2*sig2*sig2;
-            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
-            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
-            energy = epsSig6*(sig6-1.0f);
-            if (useSwitch) {
-                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
-                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
-                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
-                dEdR = switchValue*dEdR - energy*switchDeriv*r;
-                energy *= switchValue;
-            }
-        }
-        else {
-            energy = 0.0f;
-            dEdR = 0.0f;
-        }
-        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
-        dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
-        dEdR *= inverseR*inverseR;        
-
-        // Accumulate energies.
-
-        fvec4 one(1.0f);
-        if (totalEnergy) {
-            energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
-            energy = blend(0.0f, energy, include);
-            *totalEnergy += dot4(energy, one);
-        }
-
-        // Accumulate forces.
-
-        dEdR = blend(0.0f, dEdR, include);
-        fvec4 fx = dx*dEdR;
-        fvec4 fy = dy*dEdR;
-        fvec4 fz = dz*dEdR;
-        blockAtomForceX += fx;
-        blockAtomForceY += fy;
-        blockAtomForceZ += fz;
-        float* atomForce = forces+4*atom;
-        atomForce[0] -= dot4(fx, one);
-        atomForce[1] -= dot4(fy, one);
-        atomForce[2] -= dot4(fz, one);
-    }
-    
-    // Record the forces on the block atoms.
-    
-    fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
-    transpose(f[0], f[1], f[2], f[3]);
-    for (int j = 0; j < 4; j++)
-        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
-}
-
 void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
    deltaR = posJ-posI;
    if (periodic) {
@@ -671,41 +457,15 @@ void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& d
    r2 = dot3(deltaR, deltaR);
 }

-void CpuNonbondedForce::getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
-    dx = x-posI[0];
-    dy = y-posI[1];
-    dz = z-posI[2];
-    if (periodic) {
-        dx -= round(dx*invBoxSize[0])*boxSize[0];
-        dy -= round(dy*invBoxSize[1])*boxSize[1];
-        dz -= round(dz*invBoxSize[2])*boxSize[2];
-    }
-    r2 = dx*dx + dy*dy + dz*dz;
-}
-
-fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
+float CpuNonbondedForce::erfcApprox(float x) {
    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
    // error of 3e-7.

-    fvec4 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
+    float t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
    t *= t;
    t *= t;
    t *= t;
    return 1.0f/(t*t);
 }

-fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
-    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
-
-    fvec4 x1 = x*ewaldDXInv;
-    ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
-    fvec4 coeff2 = x1-index;
-    fvec4 coeff1 = 1.0f-coeff2;
-    fvec4 t1(&ewaldScaleTable[index[0]]);
-    fvec4 t2(&ewaldScaleTable[index[1]]);
-    fvec4 t3(&ewaldScaleTable[index[2]]);
-    fvec4 t4(&ewaldScaleTable[index[3]]);
-    transpose(t1, t2, t3, t4);
-    return coeff1*t1 + coeff2*t2;
-}
--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "SimTKOpenMMCommon.h"
+#include "SimTKOpenMMUtilities.h"
+#include "CpuNonbondedForceVec4.h"
+
+using namespace std;
+using namespace OpenMM;
+
+/**
+ * Allocate a new CpuNonbondedForceVec4 and hand it back through a pointer to CpuNonbondedForce.
+ * The object is created with new; nothing in this function releases it.
+ */
+CpuNonbondedForce* createCpuNonbondedForceVec4() {
+    CpuNonbondedForce* created = new CpuNonbondedForceVec4();
+    return created;
+}
+
+/**---------------------------------------------------------------------------------------
+   CpuNonbondedForceVec4 constructor.  The body is empty: no members are assigned here.
+   NOTE(review): presumably all state is initialized by the CpuNonbondedForce constructor;
+   confirm against the class declaration in CpuNonbondedForceVec4.h.
+   --------------------------------------------------------------------------------------- */
+
+CpuNonbondedForceVec4::CpuNonbondedForceVec4() {
+}
+
+void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Compute the Lennard-Jones and non-Ewald Coulomb interactions between the four atoms of block blockIndex and every atom in that block's neighbor list; forces is updated in place and *totalEnergy is incremented only when totalEnergy is non-NULL.
+    // Load the positions and parameters of the atoms in the block (posq is read four floats per atom: x, y, z, charge).
+    int blockAtom[4];
+    fvec4 blockAtomPosq[4];
+    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
+    for (int i = 0; i < 4; i++) {
+        blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+    }
+    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]); // lane j holds the value for block atom j
+    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
+    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
+    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]); // charges pre-multiplied by ONE_4PI_EPS0
+    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
+    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
+    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
+            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance))); // wrapping is requested only when some block atom is within cutoffDistance of 0 or boxSize on an axis; NOTE(review): assumes coordinates lie in [0, boxSize) - confirm
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance); // evaluated unconditionally, but only read inside the useSwitch branch below
+    
+    // Loop over neighbors for this block.
+    
+    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex); // exclusions[i] is the bit mask that goes with neighbors[i]
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        
+        int atom = neighbors[i];
+        
+        // Compute the distances to the block atoms.
+        
+        fvec4 dx, dy, dz, r2;
+        getDeltaR(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize); // dx,dy,dz = block atom position minus neighbor position, one lane per block atom
+        ivec4 include;
+        char excl = exclusions[i];
+        if (excl == 0)
+            include = -1; // no exclusion bits set: start with every lane enabled
+        else
+            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1); // bit j of excl set => lane j (block atom j) is masked out
+        include = include & (r2 < cutoffDistance*cutoffDistance); // also drop lanes whose squared distance is not below the squared cutoff
+        if (!any(include))
+            continue; // No interactions to compute.
+        
+        // Compute the interactions.
+        
+        fvec4 r = sqrt(r2);
+        fvec4 inverseR = fvec4(1.0f)/r;
+        fvec4 energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) { // the Lennard-Jones term is skipped entirely when the neighbor's epsilon is zero
+            fvec4 sig = blockAtomSigma+atomParameters[atom].first; // NOTE(review): sigmas are summed and epsilons multiplied; presumably atomParameters is pre-scaled for this - confirm where it is filled
+            fvec4 sig2 = inverseR*sig;
+            sig2 *= sig2;
+            fvec4 sig6 = sig2*sig2*sig2;
+            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval); // comparison mask ANDed with the value: t is 0 in lanes where r <= switchingDistance
+                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
+        if (cutoff)
+            dEdR += chargeProd*(inverseR-2.0f*krf*r2); // cutoff form of the Coulomb term, using the krf member
+        else
+            dEdR += chargeProd*inverseR;
+        dEdR *= inverseR*inverseR; // scaled by 1/r^2 so it can multiply the raw displacement components below
+
+        // Accumulate energies.
+
+        fvec4 one(1.0f);
+        if (totalEnergy) {
+            if (cutoff)
+                energy += chargeProd*(inverseR+krf*r2-crf);
+            else
+                energy += chargeProd*inverseR;
+            energy = blend(0.0f, energy, include); // presumably keeps energy in the lanes selected by include and 0 elsewhere (blend is defined outside this file)
+            *totalEnergy += dot4(energy, one); // dot product with all-ones, i.e. the sum of the four lanes
+        }
+
+        // Accumulate forces.
+
+        dEdR = blend(0.0f, dEdR, include);
+        fvec4 fx = dx*dEdR;
+        fvec4 fy = dy*dEdR;
+        fvec4 fz = dz*dEdR;
+        blockAtomForceX += fx;
+        blockAtomForceY += fy;
+        blockAtomForceZ += fz;
+        float* atomForce = forces+4*atom; // forces is indexed four floats per atom
+        atomForce[0] -= dot4(fx, one); // the neighbor receives minus the summed force applied to the four block atoms
+        atomForce[1] -= dot4(fy, one);
+        atomForce[2] -= dot4(fz, one);
+    }
+    
+    // Record the forces on the block atoms.
+
+    fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
+    transpose(f[0], f[1], f[2], f[3]); // after the transpose f[j] is used as the force vector of block atom j
+    for (int j = 0; j < 4; j++)
+        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
+  }
+
+void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Compute the Lennard-Jones and Ewald-scaled Coulomb interactions between the four atoms of block blockIndex and every atom in that block's neighbor list; forces is updated in place and *totalEnergy is incremented only when totalEnergy is non-NULL.
+    // Load the positions and parameters of the atoms in the block (posq is read four floats per atom: x, y, z, charge).
+    int blockAtom[4];
+    fvec4 blockAtomPosq[4];
+    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
+    for (int i = 0; i < 4; i++) {
+        blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+    }
+    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]); // lane j holds the value for block atom j
+    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
+    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
+    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]); // charges pre-multiplied by ONE_4PI_EPS0
+    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
+    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
+    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
+            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance))); // wrapping is requested only when some block atom is within cutoffDistance of 0 or boxSize on an axis; NOTE(review): assumes coordinates lie in [0, boxSize) - confirm
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance); // evaluated unconditionally, but only read inside the useSwitch branch below
+    
+    // Loop over neighbors for this block.
+    
+    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex); // exclusions[i] is the bit mask that goes with neighbors[i]
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        
+        int atom = neighbors[i];
+        
+        // Compute the distances to the block atoms.
+        
+        fvec4 dx, dy, dz, r2;
+        getDeltaR(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize); // dx,dy,dz = block atom position minus neighbor position, one lane per block atom
+        ivec4 include;
+        char excl = exclusions[i];
+        if (excl == 0)
+            include = -1; // no exclusion bits set: start with every lane enabled
+        else
+            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1); // bit j of excl set => lane j (block atom j) is masked out
+        include = include & (r2 < cutoffDistance*cutoffDistance); // also drop lanes whose squared distance is not below the squared cutoff
+        if (!any(include))
+            continue; // No interactions to compute.
+        
+        // Compute the interactions.
+        
+        fvec4 r = sqrt(r2);
+        fvec4 inverseR = fvec4(1.0f)/r;
+        fvec4 energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) { // the Lennard-Jones term is skipped entirely when the neighbor's epsilon is zero
+            fvec4 sig = blockAtomSigma+atomParameters[atom].first; // NOTE(review): sigmas are summed and epsilons multiplied; presumably atomParameters is pre-scaled for this - confirm where it is filled
+            fvec4 sig2 = inverseR*sig;
+            sig2 *= sig2;
+            fvec4 sig6 = sig2*sig2*sig2;
+            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval); // comparison mask ANDed with the value: t is 0 in lanes where r <= switchingDistance
+                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
+        dEdR += chargeProd*inverseR*ewaldScaleFunction(r); // Coulomb term scaled by the tabulated Ewald factor interpolated at r
+        dEdR *= inverseR*inverseR; // scaled by 1/r^2 so it can multiply the raw displacement components below
+
+        // Accumulate energies.
+
+        fvec4 one(1.0f);
+        if (totalEnergy) {
+            energy += chargeProd*inverseR*erfcApprox(alphaEwald*r); // Coulomb energy multiplied by the approximate erfc of alphaEwald*r
+            energy = blend(0.0f, energy, include); // presumably keeps energy in the lanes selected by include and 0 elsewhere (blend is defined outside this file)
+            *totalEnergy += dot4(energy, one); // dot product with all-ones, i.e. the sum of the four lanes
+        }
+
+        // Accumulate forces.
+
+        dEdR = blend(0.0f, dEdR, include);
+        fvec4 fx = dx*dEdR;
+        fvec4 fy = dy*dEdR;
+        fvec4 fz = dz*dEdR;
+        blockAtomForceX += fx;
+        blockAtomForceY += fy;
+        blockAtomForceZ += fz;
+        float* atomForce = forces+4*atom; // forces is indexed four floats per atom
+        atomForce[0] -= dot4(fx, one); // the neighbor receives minus the summed force applied to the four block atoms
+        atomForce[1] -= dot4(fy, one);
+        atomForce[2] -= dot4(fz, one);
+    }
+    
+    // Record the forces on the block atoms.
+    
+    fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
+    transpose(f[0], f[1], f[2], f[3]); // after the transpose f[j] is used as the force vector of block atom j
+    for (int j = 0; j < 4; j++)
+        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
+}
+
+void CpuNonbondedForceVec4::getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+    // Displacement from the single atom at posI to four atoms whose coordinates are packed
+    // per component in x, y and z.  When periodic is set, each component is shifted by the
+    // nearest whole multiple of the box length on that axis.  r2 receives the squared
+    // length of each of the four displacement vectors.
+
+    // X component.
+    dx = x-posI[0];
+    if (periodic)
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+
+    // Y component.
+    dy = y-posI[1];
+    if (periodic)
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+
+    // Z component.
+    dz = z-posI[2];
+    if (periodic)
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
+
+    r2 = dx*dx + dy*dy + dz*dz;
+}
+
+fvec4 CpuNonbondedForceVec4::erfcApprox(fvec4 x) {
+    // Four-lane erfc approximation from Abramowitz and Stegun (1964) p. 299, who cite
+    // C. Hastings, Jr., Approximations for Digital Computers (1955) as the original source.
+    // It has a maximum error of 3e-7.  The result is 1/p(x)^16, where p is the degree-6
+    // polynomial built up one Horner step at a time below.
+
+    fvec4 horner = 0.0000430638f*x;
+    horner = (0.0002765672f+horner)*x;
+    horner = (0.0001520143f+horner)*x;
+    horner = (0.0092705272f+horner)*x;
+    horner = (0.0422820123f+horner)*x;
+    horner = (0.0705230784f+horner)*x;
+    fvec4 p = 1.0f+horner;
+
+    // Three squarings give p^8; the final product in the return gives p^16.
+    fvec4 p2 = p*p;
+    fvec4 p4 = p2*p2;
+    fvec4 p8 = p4*p4;
+    return 1.0f/(p8*p8);
+}
+
+fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(fvec4 x) {
+    // Look up the tabulated Ewald scale factor, erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI),
+    // for four values at once, interpolating linearly between adjacent table entries.
+
+    fvec4 tablePos = x*ewaldDXInv;
+    ivec4 cell = min(floor(tablePos), NUM_TABLE_POINTS);
+    fvec4 upperWeight = tablePos-cell;
+    fvec4 lowerWeight = 1.0f-upperWeight;
+
+    // Load four consecutive table values starting at each lane's cell, then transpose so that
+    // rows[0] holds every lane's entry at its cell and rows[1] the entry that follows it.
+    fvec4 rows[4];
+    for (int lane = 0; lane < 4; lane++)
+        rows[lane] = fvec4(&ewaldScaleTable[cell[lane]]);
+    transpose(rows[0], rows[1], rows[2], rows[3]);
+    return lowerWeight*rows[0] + upperWeight*rows[1];
+}
--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
@@ -22,430 +22,54 @@
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

-#include <complex>
-
 #include "SimTKOpenMMCommon.h"
 #include "SimTKOpenMMUtilities.h"
 #include "CpuNonbondedForceVec8.h"
-#include "ReferenceForce.h"
-#include "ReferencePME.h"
-#include "openmm/internal/vectorize.h"
-#include "gmx_atomic.h"
-
-// In case we're using some primitive version of Visual Studio this will
-// make sure that erf() and erfc() are defined.
-#include "openmm/internal/MSVC_erfc.h"
+#include "openmm/internal/hardware.h"

 using namespace std;
 using namespace OpenMM;

-const float CpuNonbondedForceVec8::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
-const int CpuNonbondedForceVec8::NUM_TABLE_POINTS = 2048;
-
-class CpuNonbondedForceVec8::ComputeDirectTask : public ThreadPool::Task {
-public:
-    ComputeDirectTask(CpuNonbondedForceVec8& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeDirect(threads, threadIndex);
-    }
-    CpuNonbondedForceVec8& owner;
-};
-
-/**---------------------------------------------------------------------------------------
-
-   CpuNonbondedForceVec8 constructor
-
-   --------------------------------------------------------------------------------------- */
-
-CpuNonbondedForceVec8::CpuNonbondedForceVec8() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false) {
+#ifndef __AVX__
+bool isVec8Supported() {
+    return false;
 }

-/**---------------------------------------------------------------------------------------
-
-   Set the force to use a cutoff.
-
-   @param distance            the cutoff distance
-   @param neighbors           the neighbor list to use
-   @param solventDielectric   the dielectric constant of the bulk solvent
-
-     --------------------------------------------------------------------------------------- */
-
-void CpuNonbondedForceVec8::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
-    if (distance != cutoffDistance)
-        tableIsValid = false;
-    cutoff = true;
-    cutoffDistance = distance;
-    neighborList = &neighbors;
-    krf = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0)/(2.0*solventDielectric+1.0);
-    crf = (1.0/cutoffDistance)*(3.0*solventDielectric)/(2.0*solventDielectric+1.0);
-  }
-
-/**---------------------------------------------------------------------------------------
-
-   Set the force to use a switching function on the Lennard-Jones interaction.
-
-   @param distance            the switching distance
-
-   --------------------------------------------------------------------------------------- */
-
-void CpuNonbondedForceVec8::setUseSwitchingFunction(float distance) {
-    useSwitch = true;
-    switchingDistance = distance;
+CpuNonbondedForce* createCpuNonbondedForceVec8() {
+    throw OpenMMException("Internal error: OpenMM was compiled without AVX support");
 }
+#else
+/**
+ * Check whether 8 component vectors are supported with the current CPU.
+ */
+bool isVec8Supported() {
+    // Make sure the CPU supports AVX.
    
-  /**---------------------------------------------------------------------------------------
-
-     Set the force to use periodic boundary conditions.  This requires that a cutoff has
-     also been set, and the smallest side of the periodic box is at least twice the cutoff
-     distance.
-
-     @param boxSize             the X, Y, and Z widths of the periodic box
-
-     --------------------------------------------------------------------------------------- */
-
-  void CpuNonbondedForceVec8::setPeriodic(float* periodicBoxSize) {
-
-    assert(cutoff);
-    assert(periodicBoxSize[0] >= 2*cutoffDistance);
-    assert(periodicBoxSize[1] >= 2*cutoffDistance);
-    assert(periodicBoxSize[2] >= 2*cutoffDistance);
-    periodic = true;
-    this->periodicBoxSize[0] = periodicBoxSize[0];
-    this->periodicBoxSize[1] = periodicBoxSize[1];
-    this->periodicBoxSize[2] = periodicBoxSize[2];
-  }
-
-  /**---------------------------------------------------------------------------------------
-
-     Set the force to use Ewald summation.
-
-     @param alpha  the Ewald separation parameter
-     @param kmaxx  the largest wave vector in the x direction
-     @param kmaxy  the largest wave vector in the y direction
-     @param kmaxz  the largest wave vector in the z direction
-
-     --------------------------------------------------------------------------------------- */
-
-  void CpuNonbondedForceVec8::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
-      if (alpha != alphaEwald)
-          tableIsValid = false;
-      alphaEwald = alpha;
-      numRx = kmaxx;
-      numRy = kmaxy;
-      numRz = kmaxz;
-      ewald = true;
-      tabulateEwaldScaleFactor();
-  }
-
-  /**---------------------------------------------------------------------------------------
-
-     Set the force to use Particle-Mesh Ewald (PME) summation.
-
-     @param alpha  the Ewald separation parameter
-     @param gridSize the dimensions of the mesh
-
-     --------------------------------------------------------------------------------------- */
-
-  void CpuNonbondedForceVec8::setUsePME(float alpha, int meshSize[3]) {
-      if (alpha != alphaEwald)
-          tableIsValid = false;
-      alphaEwald = alpha;
-      meshDim[0] = meshSize[0];
-      meshDim[1] = meshSize[1];
-      meshDim[2] = meshSize[2];
-      pme = true;
-      tabulateEwaldScaleFactor();
-  }
-
-  
-void CpuNonbondedForceVec8::tabulateEwaldScaleFactor() {
-    if (tableIsValid)
-        return;
-    tableIsValid = true;
-    ewaldDX = cutoffDistance/NUM_TABLE_POINTS;
-    ewaldDXInv = 1.0f/ewaldDX;
-    ewaldScaleTable.resize(NUM_TABLE_POINTS+4);
-    for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
-        double r = i*ewaldDX;
-        double alphaR = alphaEwald*r;
-        ewaldScaleTable[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
+    int cpuInfo[4];
+    cpuid(cpuInfo, 0);
+    if (cpuInfo[0] >= 1) {
+        cpuid(cpuInfo, 1);
+        return ((cpuInfo[2] & ((int) 1 << 28)) != 0);
    }
+    return false;
 }

-void CpuNonbondedForceVec8::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates,
-                                             const vector<pair<float, float> >& atomParameters, const vector<set<int> >& exclusions,
-                                             vector<RealVec>& forces, double* totalEnergy) const {
-    typedef std::complex<float> d_complex;
-
-    static const float epsilon     =  1.0;
-
-    int kmax                            = (ewald ? std::max(numRx, std::max(numRy,numRz)) : 0);
-    float factorEwald              = -1 / (4*alphaEwald*alphaEwald);
-    float TWO_PI                   = 2.0 * PI_M;
-    float recipCoeff               = (float)(ONE_4PI_EPS0*4*PI_M/(periodicBoxSize[0] * periodicBoxSize[1] * periodicBoxSize[2]) /epsilon);
-
-    if (pme) {
-        pme_t pmedata;
-        RealOpenMM virial[3][3];
-        pme_init(&pmedata, alphaEwald, numberOfAtoms, meshDim, 5, 1);
-        vector<RealOpenMM> charges(numberOfAtoms);
-        for (int i = 0; i < numberOfAtoms; i++)
-            charges[i] = posq[4*i+3];
-        RealOpenMM boxSize[3] = {periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2]};
-        RealOpenMM recipEnergy = 0.0;
-        pme_exec(pmedata, atomCoordinates, forces, charges, boxSize, &recipEnergy, virial);
-        if (totalEnergy)
-            *totalEnergy += recipEnergy;
-        pme_destroy(pmedata);
-    }
-
-    // Ewald method
-
-    else if (ewald) {
-
-        // setup reciprocal box
-
-        float recipBoxSize[3] = { TWO_PI / periodicBoxSize[0], TWO_PI / periodicBoxSize[1], TWO_PI / periodicBoxSize[2]};
-
-
-        // setup K-vectors
-
-        #define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
-        vector<d_complex> eir(kmax*numberOfAtoms*3);
-        vector<d_complex> tab_xy(numberOfAtoms);
-        vector<d_complex> tab_qxyz(numberOfAtoms);
-
-        for (int i = 0; (i < numberOfAtoms); i++) {
-            float* pos = posq+4*i;
-            for (int m = 0; (m < 3); m++)
-              EIR(0, i, m) = d_complex(1,0);
-
-            for (int m=0; (m<3); m++)
-              EIR(1, i, m) = d_complex(cos(pos[m]*recipBoxSize[m]),
-                                       sin(pos[m]*recipBoxSize[m]));
-
-            for (int j=2; (j<kmax); j++)
-              for (int m=0; (m<3); m++)
-                EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
-        }
-
-        // calculate reciprocal space energy and forces
-
-        int lowry = 0;
-        int lowrz = 1;
-
-        for (int rx = 0; rx < numRx; rx++) {
-            float kx = rx * recipBoxSize[0];
-            for (int ry = lowry; ry < numRy; ry++) {
-                float ky = ry * recipBoxSize[1];
-                if (ry >= 0) {
-                    for (int n = 0; n < numberOfAtoms; n++)
-                      tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
-                }
-                else {
-                    for (int n = 0; n < numberOfAtoms; n++)
-                      tab_xy[n]= EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
-                }
-                for (int rz = lowrz; rz < numRz; rz++) {
-                    if (rz >= 0) {
-                        for (int n = 0; n < numberOfAtoms; n++)
-                            tab_qxyz[n] = posq[4*n+3] * (tab_xy[n] * EIR(rz, n, 2));
-                    }
-                    else {
-                        for (int n = 0; n < numberOfAtoms; n++)
-                            tab_qxyz[n] = posq[4*n+3] * (tab_xy[n] * conj(EIR(-rz, n, 2)));
-                    }
-                    float cs = 0.0f;
-                    float ss = 0.0f;
-
-                    for (int n = 0; n < numberOfAtoms; n++) {
-                        cs += tab_qxyz[n].real();
-                        ss += tab_qxyz[n].imag();
-                    }
-
-                    float kz = rz * recipBoxSize[2];
-                    float k2 = kx * kx + ky * ky + kz * kz;
-                    float ak = exp(k2*factorEwald) / k2;
-
-                    for (int n = 0; n < numberOfAtoms; n++) {
-                        float force = ak * (cs * tab_qxyz[n].imag() - ss * tab_qxyz[n].real());
-                        forces[n][0] += 2 * recipCoeff * force * kx;
-                        forces[n][1] += 2 * recipCoeff * force * ky;
-                        forces[n][2] += 2 * recipCoeff * force * kz;
-                    }
-
-                    if (totalEnergy)
-                        *totalEnergy += recipCoeff * ak * (cs * cs + ss * ss);
-
-                    lowrz = 1 - numRz;
-                }
-                lowry = 1 - numRy;
-            }
-        }
-    }
+/**
+ * Factory method to create a CpuNonbondedForceVec8.
+ */
+CpuNonbondedForce* createCpuNonbondedForceVec8() {
+    return new CpuNonbondedForceVec8();
 }

+/**---------------------------------------------------------------------------------------

-void CpuNonbondedForceVec8::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
-                const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
-    // Record the parameters for the threads.
-    
-    this->numberOfAtoms = numberOfAtoms;
-    this->posq = posq;
-    this->atomCoordinates = &atomCoordinates[0];
-    this->atomParameters = &atomParameters[0];
-    this->exclusions = &exclusions[0];
-    this->threadForce = &threadForce;
-    includeEnergy = (totalEnergy != NULL);
-    threadEnergy.resize(threads.getNumThreads());
-    gmx_atomic_t counter;
-    gmx_atomic_set(&counter, 0);
-    this->atomicCounter = &counter;
-    
-    // Signal the threads to start running and wait for them to finish.
-    
-    ComputeDirectTask task(*this);
-    threads.execute(task);
-    threads.waitForThreads();
-    
-    // Combine the energies from all the threads.
-    
-    if (totalEnergy != NULL) {
-        double directEnergy = 0;
-        int numThreads = threads.getNumThreads();
-        for (int i = 0; i < numThreads; i++)
-            directEnergy += threadEnergy[i];
-        *totalEnergy += directEnergy;
-    }
-}
+   CpuNonbondedForceVec8 constructor

-void CpuNonbondedForceVec8::threadComputeDirect(ThreadPool& threads, int threadIndex) {
-    // Compute this thread's subset of interactions.
-
-    int numThreads = threads.getNumThreads();
-    threadEnergy[threadIndex] = 0;
-    double* energyPtr = (includeEnergy ? &threadEnergy[threadIndex] : NULL);
-    float* forces = &(*threadForce)[threadIndex][0];
-    fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
-    fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
-    if (ewald || pme) {
-        // Compute the interactions from the neighbor list.
-
-        while (true) {
-            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
-            if (nextBlock >= neighborList->getNumBlocks())
-                break;
-            calculateBlockEwaldIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
-        }
+   --------------------------------------------------------------------------------------- */

-        // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
-
-        fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
-        fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
-        for (int i = threadIndex; i < numberOfAtoms; i += numThreads) {
-            fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
-            for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
-                if (*iter > i) {
-                    int j = *iter;
-                    fvec4 deltaR;
-                    fvec4 posJ((float) atomCoordinates[j][0], (float) atomCoordinates[j][1], (float) atomCoordinates[j][2], 0.0f);
-                    float r2;
-                    getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
-                    float r = sqrtf(r2);
-                    float inverseR = 1/r;
-                    float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
-                    float alphaR = alphaEwald*r;
-                    float erfcAlphaR = erfcApprox(alphaR).lowerVec()[0];
-                    float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
-                    dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
-                    fvec4 result = deltaR*dEdR;
-                    (fvec4(forces+4*i)-result).store(forces+4*i);
-                    (fvec4(forces+4*j)+result).store(forces+4*j);
-                    if (includeEnergy)
-                        threadEnergy[threadIndex] -= chargeProd*inverseR*(1.0f-erfcAlphaR);
-                }
-            }
-        }
-    }
-    else if (cutoff) {
-        // Compute the interactions from the neighbor list.
-
-        while (true) {
-            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
-            if (nextBlock >= neighborList->getNumBlocks())
-                break;
-            calculateBlockIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
-        }
-    }
-    else {
-        // Loop over all atom pairs
-
-        while (true) {
-            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
-            if (i >= numberOfAtoms)
-                break;
-            for (int j = i+1; j < numberOfAtoms; j++)
-                if (exclusions[j].find(i) == exclusions[j].end())
-                    calculateOneIxn(i, j, forces, energyPtr, boxSize, invBoxSize);
-        }
-    }
+CpuNonbondedForceVec8::CpuNonbondedForceVec8() {
 }

-void CpuNonbondedForceVec8::calculateOneIxn(int ii, int jj, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    // get deltaR, R2, and R between 2 atoms
-
-    fvec4 deltaR;
-    fvec4 posI(posq+4*ii);
-    fvec4 posJ(posq+4*jj);
-    float r2;
-    getDeltaR(posJ, posI, deltaR, r2, periodic, boxSize, invBoxSize);
-    if (cutoff && r2 >= cutoffDistance*cutoffDistance)
-        return;
-    float r = sqrtf(r2);
-    float inverseR = 1/r;
-    float switchValue = 1, switchDeriv = 0;
-    if (useSwitch && r > switchingDistance) {
-        float t = (r-switchingDistance)/(cutoffDistance-switchingDistance);
-        switchValue = 1+t*t*t*(-10+t*(15-t*6));
-        switchDeriv = t*t*(-30+t*(60-t*30))/(cutoffDistance-switchingDistance);
-    }
-    float sig       = atomParameters[ii].first + atomParameters[jj].first;
-    float sig2      = inverseR*sig;
-          sig2     *= sig2;
-    float sig6      = sig2*sig2*sig2;
-
-    float eps       = atomParameters[ii].second*atomParameters[jj].second;
-    float dEdR      = switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
-    float chargeProd = ONE_4PI_EPS0*posq[4*ii+3]*posq[4*jj+3];
-    if (cutoff)
-        dEdR += (float) (chargeProd*(inverseR-2.0f*krf*r2));
-    else
-        dEdR += (float) (chargeProd*inverseR);
-    dEdR *= inverseR*inverseR;
-    float energy = eps*(sig6-1.0f)*sig6;
-    if (useSwitch) {
-        dEdR -= energy*switchDeriv*inverseR;
-        energy *= switchValue;
-    }
-
-    // accumulate energies
-
-    if (totalEnergy) {
-        if (cutoff)
-            energy += (float) (chargeProd*(inverseR+krf*r2-crf));
-        else
-            energy += (float) (chargeProd*inverseR);
-        *totalEnergy += energy;
-    }
-
-    // accumulate forces
-
-    fvec4 result = deltaR*dEdR;
-    (fvec4(forces+4*ii)+result).store(forces+4*ii);
-    (fvec4(forces+4*jj)-result).store(forces+4*jj);
-  }
-
 void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
    // Load the positions and parameters of the atoms in the block.
    
@@ -660,15 +284,6 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces
        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
 }

-void CpuNonbondedForceVec8::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
-    deltaR = posJ-posI;
-    if (periodic) {
-        fvec4 base = round(deltaR*invBoxSize)*boxSize;
-        deltaR = deltaR-base;
-    }
-    r2 = dot3(deltaR, deltaR);
-}
-
 void CpuNonbondedForceVec8::getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
    dx = x-posI[0];
    dy = y-posI[1];
@@ -714,3 +329,4 @@ fvec8 CpuNonbondedForceVec8::ewaldScaleFunction(fvec8 x) {
    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
    return coeff1*s1 + coeff2*s2;
 }
+#endif