Created AVX implementation of CPU nonbonded force

e22808f3 · peastman · ddad7c6c · e22808f3 · e22808f3 · e22808f3
Commit e22808f3 authored Dec 16, 2013 by peastman
6 changed files
--- a/openmmapi/include/openmm/internal/vectorize8.h
+++ b/openmmapi/include/openmm/internal/vectorize8.h
@@ -132,6 +132,12 @@ public:
    operator __m256i() const {
        return val;
    }
+    ivec4 lowerVec() const {
+        return _mm256_castsi256_si128(val);
+    }
+    ivec4 upperVec() const {
+        return _mm256_extractf128_si256(val, 1);
+    }
    void store(int* v) const {
        _mm256_storeu_si256((__m256i*) v, val);
    }

--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
@@ -45,8 +45,7 @@ class OPENMM_EXPORT_CPU CpuNeighborList {
 public:
    class ThreadTask;
    class Voxels;
-    static const int BlockSize;
+    CpuNeighborList(int blockSize);
-    CpuNeighborList();
    void computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
    int getNumBlocks() const;
@@ -59,6 +58,7 @@ public:
    void threadComputeNeighborList(ThreadPool& threads, int threadIndex);
    void runThread(int index);
 private:
+    int blockSize;
    std::vector<int> sortedAtoms;
    std::vector<std::vector<int> > blockNeighbors;
    std::vector<std::vector<char> > blockExclusions;

--- a/platforms/cpu/include/CpuNonbondedForceVec8.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec8.h
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef OPENMM_CPU_NONBONDED_FORCE_H__
+#define OPENMM_CPU_NONBONDED_FORCE_H__
+#include "AlignedArray.h"
+#include "CpuNeighborList.h"
+#include "ReferencePairIxn.h"
+#include "openmm/internal/ThreadPool.h"
+#include "openmm/internal/vectorize8.h"
+#include <set>
+#include <utility>
+#include <vector>
+// ---------------------------------------------------------------------------------------
+namespace OpenMM {
+class CpuNonbondedForceVec8 {
+    public:
+        class ComputeDirectTask;
+      /**---------------------------------------------------------------------------------------
+         Constructor
+         --------------------------------------------------------------------------------------- */
+       CpuNonbondedForceVec8();
+      /**---------------------------------------------------------------------------------------
+         Set the force to use a cutoff.
+         @param distance            the cutoff distance
+         @param neighbors           the neighbor list to use
+         @param solventDielectric   the dielectric constant of the bulk solvent
+         --------------------------------------------------------------------------------------- */
+      void setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric);
+      /**---------------------------------------------------------------------------------------
+         Set the force to use a switching function on the Lennard-Jones interaction.
+         @param distance            the switching distance
+         --------------------------------------------------------------------------------------- */
+      void setUseSwitchingFunction(float distance);
+      /**---------------------------------------------------------------------------------------
+         Set the force to use periodic boundary conditions.  This requires that a cutoff has
+         already been set, and the smallest side of the periodic box is at least twice the cutoff
+         distance.
+         @param boxSize             the X, Y, and Z widths of the periodic box
+         --------------------------------------------------------------------------------------- */
+      void setPeriodic(float* periodicBoxSize);
+      /**---------------------------------------------------------------------------------------
+         Set the force to use Ewald summation.
+         @param alpha  the Ewald separation parameter
+         @param kmaxx  the largest wave vector in the x direction
+         @param kmaxy  the largest wave vector in the y direction
+         @param kmaxz  the largest wave vector in the z direction
+         --------------------------------------------------------------------------------------- */
+      void setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz);
+      /**---------------------------------------------------------------------------------------
+         Set the force to use Particle-Mesh Ewald (PME) summation.
+         @param alpha    the Ewald separation parameter
+         @param gridSize the dimensions of the mesh
+         --------------------------------------------------------------------------------------- */
+      void setUsePME(float alpha, int meshSize[3]);
+      /**---------------------------------------------------------------------------------------
+         Calculate Ewald ixn
+         @param numberOfAtoms    number of atoms
+         @param posq             atom coordinates and charges
+         @param atomCoordinates  atom coordinates (in format needed by PME)
+         @param atomParameters   atom parameters (sigma/2, 2*sqrt(epsilon))
+         @param exclusions       atom exclusion indices
+                                 exclusions[atomIndex] contains the list of exclusions for that atom
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+         --------------------------------------------------------------------------------------- */
+      void calculateReciprocalIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates,
+                            const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
+                            std::vector<RealVec>& forces, double* totalEnergy) const;
+      /**---------------------------------------------------------------------------------------
+         Calculate LJ Coulomb pair ixn
+         @param numberOfAtoms    number of atoms
+         @param posq             atom coordinates and charges
+         @param atomCoordinates  atom coordinates (periodic boundary conditions not applied)
+         @param atomParameters   atom parameters (sigma/2, 2*sqrt(epsilon))
+         @param exclusions       atom exclusion indices
+                                 exclusions[atomIndex] contains the list of exclusions for that atom
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+         @param threads          the thread pool to use
+         --------------------------------------------------------------------------------------- */
+      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
+            const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
+    /**
+     * This routine contains the code executed by each thread.
+     */
+    void threadComputeDirect(ThreadPool& threads, int threadIndex);
+private:
+        bool cutoff;
+        bool useSwitch;
+        bool periodic;
+        bool ewald;
+        bool pme;
+        bool tableIsValid;
+        const CpuNeighborList* neighborList;
+        float periodicBoxSize[3];
+        float cutoffDistance, switchingDistance;
+        float krf, crf;
+        float alphaEwald;
+        int numRx, numRy, numRz;
+        int meshDim[3];
+        std::vector<float> ewaldScaleTable;
+        float ewaldDX, ewaldDXInv;
+        std::vector<double> threadEnergy;
+        // The following variables are used to make information accessible to the individual threads.
+        int numberOfAtoms;
+        float* posq;
+        RealVec const* atomCoordinates;
+        std::pair<float, float> const* atomParameters;        
+        std::set<int> const* exclusions;
+        std::vector<AlignedArray<float> >* threadForce;
+        bool includeEnergy;
+        void* atomicCounter;
+        static const float TWO_OVER_SQRT_PI;
+        static const int NUM_TABLE_POINTS;
+      /**---------------------------------------------------------------------------------------
+         Calculate LJ Coulomb pair ixn between two atoms
+         @param atom1            the index of the first atom
+         @param atom2            the index of the second atom
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+         --------------------------------------------------------------------------------------- */
+      void calculateOneIxn(int atom1, int atom2, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      /**---------------------------------------------------------------------------------------
+         Calculate all the interactions for one atom block.
+         @param blockIndex       the index of the atom block
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+         --------------------------------------------------------------------------------------- */
+      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      /**---------------------------------------------------------------------------------------
+         Calculate all the interactions for one atom block.
+         @param blockIndex       the index of the atom block
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+         --------------------------------------------------------------------------------------- */
+      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      /**
+       * Compute the displacement and squared distance between two points, optionally using
+       * periodic boundary conditions.
+       */
+      void getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+      /**
+       * Compute the displacement and squared distance between a collection of points, optionally using
+       * periodic boundary conditions.
+       */
+      void getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+      /**
+       * Compute a fast approximation to erfc(x).
+       */
+      static fvec8 erfcApprox(fvec8 x);
+      /**
+       * Create a lookup table for the scale factor used with Ewald and PME.
+       */
+      void tabulateEwaldScaleFactor();
+      /**
+       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
+       */
+      fvec8 ewaldScaleFunction(fvec8 x);
+};
+} // namespace OpenMM
+// ---------------------------------------------------------------------------------------
+#endif // OPENMM_CPU_NONBONDED_FORCE_H__
--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -43,8 +43,6 @@ using namespace std;
 namespace OpenMM {
-const int CpuNeighborList::BlockSize = 4;
 class VoxelIndex 
 {
 public:
@@ -62,8 +60,8 @@ public:
 */
 class CpuNeighborList::Voxels {
 public:
-    Voxels(float vsx, float vsy, float minx, float maxx, float miny, float maxy, const float* periodicBoxSize, bool usePeriodic) :
+    Voxels(int blockSize, float vsx, float vsy, float minx, float maxx, float miny, float maxy, const float* periodicBoxSize, bool usePeriodic) :
-            voxelSizeX(vsx), voxelSizeY(vsy), minx(minx), maxx(maxx), miny(miny), maxy(maxy), periodicBoxSize(periodicBoxSize), usePeriodic(usePeriodic) {
+            blockSize(blockSize), voxelSizeX(vsx), voxelSizeY(vsy), minx(minx), maxx(maxx), miny(miny), maxy(maxy), periodicBoxSize(periodicBoxSize), usePeriodic(usePeriodic) {
        if (usePeriodic) {
            nx = (int) floorf(periodicBoxSize[0]/voxelSizeX+0.5f);
            ny = (int) floorf(periodicBoxSize[1]/voxelSizeY+0.5f);
@@ -156,7 +154,7 @@ public:
        return VoxelIndex(x, y);
    }
-    void getNeighbors(vector<int>& neighbors, int blockIndex, fvec4 blockCenter, fvec4 blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int> blockAtoms, const float* atomLocations) const {
+    void getNeighbors(vector<int>& neighbors, int blockIndex, fvec4 blockCenter, fvec4 blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int>& blockAtoms, const float* atomLocations, const vector<VoxelIndex>& atomVoxelIndex) const {
        neighbors.resize(0);
        exclusions.resize(0);
        fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
@@ -175,9 +173,6 @@ public:
        float centerPos[4];
        blockCenter.store(centerPos);
        VoxelIndex centerVoxelIndex = getVoxelIndex(centerPos);
-        VoxelIndex atomVoxelIndex[BlockSize];
-        for (int i = 0; i < (int) blockAtoms.size(); i++)
-            atomVoxelIndex[i] = getVoxelIndex(&atomLocations[4*blockAtoms[i]]);
        int startx = centerVoxelIndex.x-dIndexX;
        int starty = centerVoxelIndex.y-dIndexY;
        int endx = centerVoxelIndex.x+dIndexX;
@@ -193,7 +188,7 @@ public:
            endx = min(endx, nx-1);
            endy = min(endy, ny-1);
        }
-        int lastSortedIndex = BlockSize*(blockIndex+1);
+        int lastSortedIndex = blockSize*(blockIndex+1);
        VoxelIndex voxelIndex(0, 0);
        for (int x = startx; x <= endx; ++x) {
            voxelIndex.x = x;
@@ -300,10 +295,12 @@ public:
                        // Add this atom to the list of neighbors.
                        neighbors.push_back(sortedAtoms[sortedIndex]);
-                        if (sortedIndex < BlockSize*blockIndex)
+                        if (sortedIndex < blockSize*blockIndex)
                            exclusions.push_back(0);
-                        else
+                        else {
-                            exclusions.push_back(0xF & (0xF<<(sortedIndex-BlockSize*blockIndex)));
+                            int mask = (1<<blockSize)-1;
+                            exclusions.push_back(mask & (mask<<(sortedIndex-blockSize*blockIndex)));
+                        }
                    }
                }
            }
@@ -311,6 +308,7 @@ public:
    }
 private:
+    int blockSize;
    float voxelSizeX, voxelSizeY;
    float minx, maxx, miny, maxy;
    int nx, ny;
@@ -329,12 +327,12 @@ public:
    CpuNeighborList& owner;
 };
-CpuNeighborList::CpuNeighborList() {
+CpuNeighborList::CpuNeighborList(int blockSize) : blockSize(blockSize) {
 }
 void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
-    int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
+    int numBlocks = (numAtoms+blockSize-1)/blockSize;
    blockNeighbors.resize(numBlocks);
    blockExclusions.resize(numBlocks);
    sortedAtoms.resize(numAtoms);
@@ -381,7 +379,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
        edgeSizeX = 0.6f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
        edgeSizeY = 0.6f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
    }
-    Voxels voxels(edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
+    Voxels voxels(blockSize, edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
    for (int i = 0; i < numAtoms; i++) {
        int atomIndex = atomBins[i].second;
        sortedAtoms[i] = atomIndex;
@@ -397,9 +395,9 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
    // Add padding atoms to fill up the last block.
-    int numPadding = numBlocks*BlockSize-numAtoms;
+    int numPadding = numBlocks*blockSize-numAtoms;
    if (numPadding > 0) {
-        char mask = (0xF0 >> numPadding) & 0xF;
+        char mask = ((0xFFFF-(1<<blockSize)+1) >> numPadding);
        for (int i = 0; i < numPadding; i++)
            sortedAtoms.push_back(0);
        vector<char>& exc = blockExclusions[blockExclusions.size()-1];
@@ -409,7 +407,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
 }
 int CpuNeighborList::getNumBlocks() const {
-    return sortedAtoms.size()/BlockSize;
+    return sortedAtoms.size()/blockSize;
 }
 const std::vector<int>& CpuNeighborList::getSortedAtoms() const {
@@ -446,14 +444,18 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
    int numBlocks = blockNeighbors.size();
    vector<int> blockAtoms;
+    vector<VoxelIndex> atomVoxelIndex;
    for (int i = threadIndex; i < numBlocks; i += numThreads) {
        // Find the atoms in this block and compute their bounding box.
-        int firstIndex = BlockSize*i;
+        int firstIndex = blockSize*i;
-        int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
+        int atomsInBlock = min(blockSize, numAtoms-firstIndex);
        blockAtoms.resize(atomsInBlock);
-        for (int j = 0; j < atomsInBlock; j++)
+        atomVoxelIndex.resize(atomsInBlock);
+        for (int j = 0; j < atomsInBlock; j++) {
            blockAtoms[j] = sortedAtoms[firstIndex+j];
+            atomVoxelIndex[j] = voxels->getVoxelIndex(&atomLocations[4*blockAtoms[j]]);
+        }
        fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
        fvec4 maxPos = minPos;
        for (int j = 1; j < atomsInBlock; j++) {
@@ -461,7 +463,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
            minPos = min(minPos, pos);
            maxPos = max(maxPos, pos);
        }
-        voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations);
+        voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations, atomVoxelIndex);
        // Record the exclusions for this block.

--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <complex>
+#include "SimTKOpenMMCommon.h"
+#include "SimTKOpenMMUtilities.h"
+#include "CpuNonbondedForceVec8.h"
+#include "ReferenceForce.h"
+#include "ReferencePME.h"
+#include "openmm/internal/vectorize.h"
+#include "gmx_atomic.h"
+// In case we're using some primitive version of Visual Studio this will
+// make sure that erf() and erfc() are defined.
+#include "openmm/internal/MSVC_erfc.h"
+using namespace std;
+using namespace OpenMM;
+const float CpuNonbondedForceVec8::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
+const int CpuNonbondedForceVec8::NUM_TABLE_POINTS = 2048;
+class CpuNonbondedForceVec8::ComputeDirectTask : public ThreadPool::Task {
+public:
+    ComputeDirectTask(CpuNonbondedForceVec8& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        owner.threadComputeDirect(threads, threadIndex);
+    }
+    CpuNonbondedForceVec8& owner;
+};
+/**---------------------------------------------------------------------------------------
+   CpuNonbondedForceVec8 constructor
+   --------------------------------------------------------------------------------------- */
+CpuNonbondedForceVec8::CpuNonbondedForceVec8() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false) {
+}
+/**---------------------------------------------------------------------------------------
+   Set the force to use a cutoff.
+   @param distance            the cutoff distance
+   @param neighbors           the neighbor list to use
+   @param solventDielectric   the dielectric constant of the bulk solvent
+     --------------------------------------------------------------------------------------- */
+void CpuNonbondedForceVec8::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
+    if (distance != cutoffDistance)
+        tableIsValid = false;
+    cutoff = true;
+    cutoffDistance = distance;
+    neighborList = &neighbors;
+    krf = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0)/(2.0*solventDielectric+1.0);
+    crf = (1.0/cutoffDistance)*(3.0*solventDielectric)/(2.0*solventDielectric+1.0);
+  }
+/**---------------------------------------------------------------------------------------
+   Set the force to use a switching function on the Lennard-Jones interaction.
+   @param distance            the switching distance
+   --------------------------------------------------------------------------------------- */
+void CpuNonbondedForceVec8::setUseSwitchingFunction(float distance) {
+    useSwitch = true;
+    switchingDistance = distance;
+}
+  /**---------------------------------------------------------------------------------------
+     Set the force to use periodic boundary conditions.  This requires that a cutoff has
+     also been set, and the smallest side of the periodic box is at least twice the cutoff
+     distance.
+     @param boxSize             the X, Y, and Z widths of the periodic box
+     --------------------------------------------------------------------------------------- */
+  void CpuNonbondedForceVec8::setPeriodic(float* periodicBoxSize) {
+    assert(cutoff);
+    assert(periodicBoxSize[0] >= 2*cutoffDistance);
+    assert(periodicBoxSize[1] >= 2*cutoffDistance);
+    assert(periodicBoxSize[2] >= 2*cutoffDistance);
+    periodic = true;
+    this->periodicBoxSize[0] = periodicBoxSize[0];
+    this->periodicBoxSize[1] = periodicBoxSize[1];
+    this->periodicBoxSize[2] = periodicBoxSize[2];
+  }
+  /**---------------------------------------------------------------------------------------
+     Set the force to use Ewald summation.
+     @param alpha  the Ewald separation parameter
+     @param kmaxx  the largest wave vector in the x direction
+     @param kmaxy  the largest wave vector in the y direction
+     @param kmaxz  the largest wave vector in the z direction
+     --------------------------------------------------------------------------------------- */
+  void CpuNonbondedForceVec8::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
+      if (alpha != alphaEwald)
+          tableIsValid = false;
+      alphaEwald = alpha;
+      numRx = kmaxx;
+      numRy = kmaxy;
+      numRz = kmaxz;
+      ewald = true;
+      tabulateEwaldScaleFactor();
+  }
+  /**---------------------------------------------------------------------------------------
+     Set the force to use Particle-Mesh Ewald (PME) summation.
+     @param alpha  the Ewald separation parameter
+     @param gridSize the dimensions of the mesh
+     --------------------------------------------------------------------------------------- */
+  void CpuNonbondedForceVec8::setUsePME(float alpha, int meshSize[3]) {
+      if (alpha != alphaEwald)
+          tableIsValid = false;
+      alphaEwald = alpha;
+      meshDim[0] = meshSize[0];
+      meshDim[1] = meshSize[1];
+      meshDim[2] = meshSize[2];
+      pme = true;
+      tabulateEwaldScaleFactor();
+  }
+void CpuNonbondedForceVec8::tabulateEwaldScaleFactor() {
+    if (tableIsValid)
+        return;
+    tableIsValid = true;
+    ewaldDX = cutoffDistance/NUM_TABLE_POINTS;
+    ewaldDXInv = 1.0f/ewaldDX;
+    ewaldScaleTable.resize(NUM_TABLE_POINTS+4);
+    for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
+        double r = i*ewaldDX;
+        double alphaR = alphaEwald*r;
+        ewaldScaleTable[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
+    }
+}
+void CpuNonbondedForceVec8::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates,
+                                             const vector<pair<float, float> >& atomParameters, const vector<set<int> >& exclusions,
+                                             vector<RealVec>& forces, double* totalEnergy) const {
+    typedef std::complex<float> d_complex;
+    static const float epsilon     =  1.0;
+    int kmax                            = (ewald ? std::max(numRx, std::max(numRy,numRz)) : 0);
+    float factorEwald              = -1 / (4*alphaEwald*alphaEwald);
+    float TWO_PI                   = 2.0 * PI_M;
+    float recipCoeff               = (float)(ONE_4PI_EPS0*4*PI_M/(periodicBoxSize[0] * periodicBoxSize[1] * periodicBoxSize[2]) /epsilon);
+    if (pme) {
+        pme_t pmedata;
+        RealOpenMM virial[3][3];
+        pme_init(&pmedata, alphaEwald, numberOfAtoms, meshDim, 5, 1);
+        vector<RealOpenMM> charges(numberOfAtoms);
+        for (int i = 0; i < numberOfAtoms; i++)
+            charges[i] = posq[4*i+3];
+        RealOpenMM boxSize[3] = {periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2]};
+        RealOpenMM recipEnergy = 0.0;
+        pme_exec(pmedata, atomCoordinates, forces, charges, boxSize, &recipEnergy, virial);
+        if (totalEnergy)
+            *totalEnergy += recipEnergy;
+        pme_destroy(pmedata);
+    }
+    // Ewald method
+    else if (ewald) {
+        // setup reciprocal box
+        float recipBoxSize[3] = { TWO_PI / periodicBoxSize[0], TWO_PI / periodicBoxSize[1], TWO_PI / periodicBoxSize[2]};
+        // setup K-vectors
+        #define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
+        vector<d_complex> eir(kmax*numberOfAtoms*3);
+        vector<d_complex> tab_xy(numberOfAtoms);
+        vector<d_complex> tab_qxyz(numberOfAtoms);
+        for (int i = 0; (i < numberOfAtoms); i++) {
+            float* pos = posq+4*i;
+            for (int m = 0; (m < 3); m++)
+              EIR(0, i, m) = d_complex(1,0);
+            for (int m=0; (m<3); m++)
+              EIR(1, i, m) = d_complex(cos(pos[m]*recipBoxSize[m]),
+                                       sin(pos[m]*recipBoxSize[m]));
+            for (int j=2; (j<kmax); j++)
+              for (int m=0; (m<3); m++)
+                EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
+        }
+        // calculate reciprocal space energy and forces
+        int lowry = 0;
+        int lowrz = 1;
+        for (int rx = 0; rx < numRx; rx++) {
+            float kx = rx * recipBoxSize[0];
+            for (int ry = lowry; ry < numRy; ry++) {
+                float ky = ry * recipBoxSize[1];
+                if (ry >= 0) {
+                    for (int n = 0; n < numberOfAtoms; n++)
+                      tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
+                }
+                else {
+                    for (int n = 0; n < numberOfAtoms; n++)
+                      tab_xy[n]= EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
+                }
+                for (int rz = lowrz; rz < numRz; rz++) {
+                    if (rz >= 0) {
+                        for (int n = 0; n < numberOfAtoms; n++)
+                            tab_qxyz[n] = posq[4*n+3] * (tab_xy[n] * EIR(rz, n, 2));
+                    }
+                    else {
+                        for (int n = 0; n < numberOfAtoms; n++)
+                            tab_qxyz[n] = posq[4*n+3] * (tab_xy[n] * conj(EIR(-rz, n, 2)));
+                    }
+                    float cs = 0.0f;
+                    float ss = 0.0f;
+                    for (int n = 0; n < numberOfAtoms; n++) {
+                        cs += tab_qxyz[n].real();
+                        ss += tab_qxyz[n].imag();
+                    }
+                    float kz = rz * recipBoxSize[2];
+                    float k2 = kx * kx + ky * ky + kz * kz;
+                    float ak = exp(k2*factorEwald) / k2;
+                    for (int n = 0; n < numberOfAtoms; n++) {
+                        float force = ak * (cs * tab_qxyz[n].imag() - ss * tab_qxyz[n].real());
+                        forces[n][0] += 2 * recipCoeff * force * kx;
+                        forces[n][1] += 2 * recipCoeff * force * ky;
+                        forces[n][2] += 2 * recipCoeff * force * kz;
+                    }
+                    if (totalEnergy)
+                        *totalEnergy += recipCoeff * ak * (cs * cs + ss * ss);
+                    lowrz = 1 - numRz;
+                }
+                lowry = 1 - numRy;
+            }
+        }
+    }
+}
+void CpuNonbondedForceVec8::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
+                const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+    // Record the parameters for the threads.
+    this->numberOfAtoms = numberOfAtoms;
+    this->posq = posq;
+    this->atomCoordinates = &atomCoordinates[0];
+    this->atomParameters = &atomParameters[0];
+    this->exclusions = &exclusions[0];
+    this->threadForce = &threadForce;
+    includeEnergy = (totalEnergy != NULL);
+    threadEnergy.resize(threads.getNumThreads());
+    gmx_atomic_t counter;
+    gmx_atomic_set(&counter, 0);
+    this->atomicCounter = &counter;
+    // Signal the threads to start running and wait for them to finish.
+    ComputeDirectTask task(*this);
+    threads.execute(task);
+    threads.waitForThreads();
+    // Combine the energies from all the threads.
+    if (totalEnergy != NULL) {
+        double directEnergy = 0;
+        int numThreads = threads.getNumThreads();
+        for (int i = 0; i < numThreads; i++)
+            directEnergy += threadEnergy[i];
+        *totalEnergy += directEnergy;
+    }
+}
+void CpuNonbondedForceVec8::threadComputeDirect(ThreadPool& threads, int threadIndex) {
+    // Compute this thread's subset of interactions.
+    int numThreads = threads.getNumThreads();
+    threadEnergy[threadIndex] = 0;
+    double* energyPtr = (includeEnergy ? &threadEnergy[threadIndex] : NULL);
+    float* forces = &(*threadForce)[threadIndex][0];
+    fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
+    fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
+    if (ewald || pme) {
+        // Compute the interactions from the neighbor list.
+        while (true) {
+            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (nextBlock >= neighborList->getNumBlocks())
+                break;
+            calculateBlockEwaldIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
+        }
+        // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
+        fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
+        fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
+        for (int i = threadIndex; i < numberOfAtoms; i += numThreads) {
+            fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
+            for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
+                if (*iter > i) {
+                    int j = *iter;
+                    fvec4 deltaR;
+                    fvec4 posJ((float) atomCoordinates[j][0], (float) atomCoordinates[j][1], (float) atomCoordinates[j][2], 0.0f);
+                    float r2;
+                    getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
+                    float r = sqrtf(r2);
+                    float inverseR = 1/r;
+                    float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
+                    float alphaR = alphaEwald*r;
+                    float erfcAlphaR = erfcApprox(alphaR).lowerVec()[0];
+                    float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
+                    dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
+                    fvec4 result = deltaR*dEdR;
+                    (fvec4(forces+4*i)-result).store(forces+4*i);
+                    (fvec4(forces+4*j)+result).store(forces+4*j);
+                    if (includeEnergy)
+                        threadEnergy[threadIndex] -= chargeProd*inverseR*(1.0f-erfcAlphaR);
+                }
+            }
+        }
+    }
+    else if (cutoff) {
+        // Compute the interactions from the neighbor list.
+        while (true) {
+            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (nextBlock >= neighborList->getNumBlocks())
+                break;
+            calculateBlockIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
+        }
+    }
+    else {
+        // Loop over all atom pairs
+        while (true) {
+            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (i >= numberOfAtoms)
+                break;
+            for (int j = i+1; j < numberOfAtoms; j++)
+                if (exclusions[j].find(i) == exclusions[j].end())
+                    calculateOneIxn(i, j, forces, energyPtr, boxSize, invBoxSize);
+        }
+    }
+}
+void CpuNonbondedForceVec8::calculateOneIxn(int ii, int jj, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // get deltaR, R2, and R between 2 atoms
+    fvec4 deltaR;
+    fvec4 posI(posq+4*ii);
+    fvec4 posJ(posq+4*jj);
+    float r2;
+    getDeltaR(posJ, posI, deltaR, r2, periodic, boxSize, invBoxSize);
+    if (cutoff && r2 >= cutoffDistance*cutoffDistance)
+        return;
+    float r = sqrtf(r2);
+    float inverseR = 1/r;
+    float switchValue = 1, switchDeriv = 0;
+    if (useSwitch && r > switchingDistance) {
+        float t = (r-switchingDistance)/(cutoffDistance-switchingDistance);
+        switchValue = 1+t*t*t*(-10+t*(15-t*6));
+        switchDeriv = t*t*(-30+t*(60-t*30))/(cutoffDistance-switchingDistance);
+    }
+    float sig       = atomParameters[ii].first + atomParameters[jj].first;
+    float sig2      = inverseR*sig;
+          sig2     *= sig2;
+    float sig6      = sig2*sig2*sig2;
+    float eps       = atomParameters[ii].second*atomParameters[jj].second;
+    float dEdR      = switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
+    float chargeProd = ONE_4PI_EPS0*posq[4*ii+3]*posq[4*jj+3];
+    if (cutoff)
+        dEdR += (float) (chargeProd*(inverseR-2.0f*krf*r2));
+    else
+        dEdR += (float) (chargeProd*inverseR);
+    dEdR *= inverseR*inverseR;
+    float energy = eps*(sig6-1.0f)*sig6;
+    if (useSwitch) {
+        dEdR -= energy*switchDeriv*inverseR;
+        energy *= switchValue;
+    }
+    // accumulate energies
+    if (totalEnergy) {
+        if (cutoff)
+            energy += (float) (chargeProd*(inverseR+krf*r2-crf));
+        else
+            energy += (float) (chargeProd*inverseR);
+        *totalEnergy += energy;
+    }
+    // accumulate forces
+    fvec4 result = deltaR*dEdR;
+    (fvec4(forces+4*ii)+result).store(forces+4*ii);
+    (fvec4(forces+4*jj)-result).store(forces+4*jj);
+  }
+void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Load the positions and parameters of the atoms in the block.
+    int blockAtom[8];
+    fvec4 blockAtomPosq[8];
+    fvec4 blockAtomForce[8];
+    for (int i = 0; i < 8; i++) {
+        blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i];
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+        blockAtomForce[i] = fvec4(0.0f);
+    }
+    fvec8 blockAtomX = fvec8(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0], blockAtomPosq[4][0], blockAtomPosq[5][0], blockAtomPosq[6][0], blockAtomPosq[7][0]);
+    fvec8 blockAtomY = fvec8(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1], blockAtomPosq[4][1], blockAtomPosq[5][1], blockAtomPosq[6][1], blockAtomPosq[7][1]);
+    fvec8 blockAtomZ = fvec8(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2], blockAtomPosq[4][2], blockAtomPosq[5][2], blockAtomPosq[6][2], blockAtomPosq[7][2]);
+    fvec8 blockAtomCharge = fvec8(ONE_4PI_EPS0)*fvec8(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3], blockAtomPosq[4][3], blockAtomPosq[5][3], blockAtomPosq[6][3], blockAtomPosq[7][3]);
+    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
+    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
+    bool needPeriodic = false;
+    if (periodic) {
+        for (int i = 0; i < 8 && !needPeriodic; i++)
+            for (int j = 0; j < 3; j++)
+                if (blockAtomPosq[i][j]-cutoffDistance < 0.0 || blockAtomPosq[i][j]+cutoffDistance > boxSize[j]) {
+                    needPeriodic = true;
+                    break;
+                }
+    }
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
+    // Loop over neighbors for this block.
+    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        int atom = neighbors[i];
+        fvec4 atomPosq(posq+4*atom);
+        // Compute the distances to the block atoms.
+        fvec8 dx, dy, dz, r2;
+        getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        ivec8 include;
+        char excl = exclusions[i];
+        if (excl == 0)
+            include = -1;
+        else
+            include = ivec8(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1, excl&16 ? 0 : -1, excl&32 ? 0 : -1, excl&64 ? 0 : -1, excl&128 ? 0 : -1);
+        include = include & (r2 < cutoffDistance*cutoffDistance);
+        if (!any(include))
+            continue; // No interactions to compute.
+        // Compute the interactions.
+        fvec8 r = sqrt(r2);
+        fvec8 inverseR = fvec8(1.0f)/r;
+        fvec8 energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) {
+            fvec8 sig = blockAtomSigma+atomParameters[atom].first;
+            fvec8 sig2 = inverseR*sig;
+            sig2 *= sig2;
+            fvec8 sig6 = sig2*sig2*sig2;
+            fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
+        if (cutoff)
+            dEdR += chargeProd*(inverseR-2.0f*krf*r2);
+        else
+            dEdR += chargeProd*inverseR;
+        dEdR *= inverseR*inverseR;
+        // Accumulate energies.
+        if (totalEnergy) {
+            if (cutoff)
+                energy += chargeProd*(inverseR+krf*r2-crf);
+            else
+                energy += chargeProd*inverseR;
+            energy = blend(0.0f, energy, include);
+            *totalEnergy += dot8(energy, 1.0f);
+        }
+        // Accumulate forces.
+        dEdR = blend(0.0f, dEdR, include);
+        fvec8 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
+        fvec4 rt[8];
+        transpose(result[0], result[1], result[2], result[3], rt[0], rt[1], rt[2], rt[3], rt[4], rt[5], rt[6], rt[7]);
+        fvec4 atomForce(forces+4*atom);
+        for (int j = 0; j < 8; j++) {
+            blockAtomForce[j] += rt[j];
+            atomForce -= rt[j];
+        }
+        atomForce.store(forces+4*atom);
+    }
+    // Record the forces on the block atoms.
+    for (int j = 0; j < 8; j++)
+        (fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
+  }
+void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Load the positions and parameters of the atoms in the block.
+    int blockAtom[8];
+    fvec4 blockAtomPosq[8];
+    fvec4 blockAtomForce[8];
+    for (int i = 0; i < 8; i++) {
+        blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i];
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+        blockAtomForce[i] = fvec4(0.0f);
+    }
+    fvec8 blockAtomX = fvec8(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0], blockAtomPosq[4][0], blockAtomPosq[5][0], blockAtomPosq[6][0], blockAtomPosq[7][0]);
+    fvec8 blockAtomY = fvec8(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1], blockAtomPosq[4][1], blockAtomPosq[5][1], blockAtomPosq[6][1], blockAtomPosq[7][1]);
+    fvec8 blockAtomZ = fvec8(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2], blockAtomPosq[4][2], blockAtomPosq[5][2], blockAtomPosq[6][2], blockAtomPosq[7][2]);
+    fvec8 blockAtomCharge = fvec8(ONE_4PI_EPS0)*fvec8(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3], blockAtomPosq[4][3], blockAtomPosq[5][3], blockAtomPosq[6][3], blockAtomPosq[7][3]);
+    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
+    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
+    bool needPeriodic = false;
+    if (periodic) {
+        for (int i = 0; i < 8 && !needPeriodic; i++)
+            for (int j = 0; j < 3; j++)
+                if (blockAtomPosq[i][j]-cutoffDistance < 0.0 || blockAtomPosq[i][j]+cutoffDistance > boxSize[j]) {
+                    needPeriodic = true;
+                    break;
+                }
+    }
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
+    // Loop over neighbors for this block.
+    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        int atom = neighbors[i];
+        fvec4 atomPosq(posq+4*atom);
+        // Compute the distances to the block atoms.
+        fvec8 dx, dy, dz, r2;
+        getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        ivec8 include;
+        char excl = exclusions[i];
+        if (excl == 0)
+            include = -1;
+        else
+            include = ivec8(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1, excl&16 ? 0 : -1, excl&32 ? 0 : -1, excl&64 ? 0 : -1, excl&128 ? 0 : -1);
+        include = include & (r2 < cutoffDistance*cutoffDistance);
+        if (!any(include))
+            continue; // No interactions to compute.
+        // Compute the interactions.
+        fvec8 r = sqrt(r2);
+        fvec8 inverseR = fvec8(1.0f)/r;
+        fvec8 energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) {
+            fvec8 sig = blockAtomSigma+atomParameters[atom].first;
+            fvec8 sig2 = inverseR*sig;
+            sig2 *= sig2;
+            fvec8 sig6 = sig2*sig2*sig2;
+            fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
+        dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
+        dEdR *= inverseR*inverseR;        
+        // Accumulate energies.
+        if (totalEnergy) {
+            energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
+            energy = blend(0.0f, energy, include);
+            *totalEnergy += dot8(energy, 1.0f);
+        }
+        // Accumulate forces.
+        dEdR = blend(0.0f, dEdR, include);
+        fvec8 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
+        fvec4 rt[8];
+        transpose(result[0], result[1], result[2], result[3], rt[0], rt[1], rt[2], rt[3], rt[4], rt[5], rt[6], rt[7]);
+        fvec4 atomForce(forces+4*atom);
+        for (int j = 0; j < 8; j++) {
+            blockAtomForce[j] += rt[j];
+            atomForce -= rt[j];
+        }
+        atomForce.store(forces+4*atom);
+    }
+    // Record the forces on the block atoms.
+    for (int j = 0; j < 8; j++)
+        (fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
+}
+void CpuNonbondedForceVec8::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+    deltaR = posJ-posI;
+    if (periodic) {
+        fvec4 base = round(deltaR*invBoxSize)*boxSize;
+        deltaR = deltaR-base;
+    }
+    r2 = dot3(deltaR, deltaR);
+}
+void CpuNonbondedForceVec8::getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+    dx = x-posI[0];
+    dy = y-posI[1];
+    dz = z-posI[2];
+    if (periodic) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
+    }
+    r2 = dx*dx + dy*dy + dz*dz;
+}
+fvec8 CpuNonbondedForceVec8::erfcApprox(fvec8 x) {
+    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
+    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
+    // error of 3e-7.
+    fvec8 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
+    t *= t;
+    t *= t;
+    t *= t;
+    return 1.0f/(t*t);
+}
+fvec8 CpuNonbondedForceVec8::ewaldScaleFunction(fvec8 x) {
+    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
+    fvec8 x1 = x*ewaldDXInv;
+    ivec8 index = min(floor(x1), NUM_TABLE_POINTS);
+    fvec8 coeff2 = x1-index;
+    fvec8 coeff1 = 1.0f-coeff2;
+    ivec4 indexLower = index.lowerVec();
+    ivec4 indexUpper = index.upperVec();
+    fvec4 t1(&ewaldScaleTable[indexLower[0]]);
+    fvec4 t2(&ewaldScaleTable[indexLower[1]]);
+    fvec4 t3(&ewaldScaleTable[indexLower[2]]);
+    fvec4 t4(&ewaldScaleTable[indexLower[3]]);
+    fvec4 t5(&ewaldScaleTable[indexUpper[0]]);
+    fvec4 t6(&ewaldScaleTable[indexUpper[1]]);
+    fvec4 t7(&ewaldScaleTable[indexUpper[2]]);
+    fvec4 t8(&ewaldScaleTable[indexUpper[3]]);
+    fvec8 s1, s2, s3, s4;
+    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
+    return coeff1*s1 + coeff2*s2;
+}
--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -51,6 +51,7 @@ void testNeighborList(bool periodic) {
    const int numParticles = 500;
    const float cutoff = 2.0f;
    const float boxSize[3] = {20.0f, 15.0f, 22.0f};
+    const int blockSize = 8;
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
    AlignedArray<float> positions(4*numParticles);
@@ -66,15 +67,15 @@ void testNeighborList(bool periodic) {
        }
    }
    ThreadPool threads;
-    CpuNeighborList neighborList;
+    CpuNeighborList neighborList(blockSize);
    neighborList.computeNeighborList(numParticles, positions, exclusions, boxSize, periodic, cutoff, threads);
    // Convert the neighbor list to a set for faster lookup.
    set<pair<int, int> > neighbors;
    for (int i = 0; i < (int) neighborList.getSortedAtoms().size(); i++) {
-        int blockIndex = i/CpuNeighborList::BlockSize;
+        int blockIndex = i/blockSize;
-        int indexInBlock = i-blockIndex*CpuNeighborList::BlockSize;
+        int indexInBlock = i-blockIndex*blockSize;
        char mask = 1<<indexInBlock;
        for (int j = 0; j < (int) neighborList.getBlockExclusions(blockIndex).size(); j++) {
            if ((neighborList.getBlockExclusions(blockIndex)[j] & mask) == 0) {