Restructured nonbonded calculation to allow more efficient vectorization

56902b45 · peastman · 6eacad45 · 56902b45 · 56902b45 · 56902b45
Commit 56902b45 authored Oct 22, 2013 by peastman
7 changed files
--- a/openmmapi/include/openmm/internal/vectorize.h
+++ b/openmmapi/include/openmm/internal/vectorize.h
@@ -88,6 +88,27 @@ public:
    fvec4 operator-() const {
        return _mm_sub_ps(_mm_set1_ps(0.0f), val);
    }
+    fvec4 operator&(__m128i other) const {
+        return _mm_and_si128(val, other);
+    }
+    fvec4 operator==(fvec4 other) const {
+        return _mm_cmpeq_ps(val, other);
+    }
+    fvec4 operator!=(fvec4 other) const {
+        return _mm_cmpneq_ps(val, other);
+    }
+    fvec4 operator>(fvec4 other) const {
+        return _mm_cmpgt_ps(val, other);
+    }
+    fvec4 operator<(fvec4 other) const {
+        return _mm_cmplt_ps(val, other);
+    }
+    fvec4 operator>=(fvec4 other) const {
+        return _mm_cmpge_ps(val, other);
+    }
+    fvec4 operator<=(fvec4 other) const {
+        return _mm_cmple_ps(val, other);
+    }
    operator ivec4() const;
 };
@@ -174,6 +195,11 @@ static inline fvec4 max(fvec4 v1, fvec4 v2) {
    return fvec4(_mm_max_ps(v1.val, v2.val));
 }
+static inline fvec4 abs(fvec4 v) {
+    static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
+    return fvec4(_mm_and_ps(v.val, mask));
+}
 static inline fvec4 sqrt(fvec4 v) {
    return fvec4(_mm_sqrt_ps(v.val));
 }
@@ -182,6 +208,10 @@ static inline float dot3(fvec4 v1, fvec4 v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
 }
+static inline float dot4(fvec4 v1, fvec4 v2) {
+    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
+}
 // Functions that operate on ivec4s.
 static inline ivec4 min(ivec4 v1, ivec4 v2) {
@@ -192,5 +222,27 @@ static inline ivec4 max(ivec4 v1, ivec4 v2) {
    return ivec4(_mm_max_epi32(v1.val, v2.val));
 }
+static inline ivec4 abs(ivec4 v) {
+    return ivec4(_mm_abs_epi32(v.val));
+}
+// Mathematical operators involving a scalar and a vector.
+static inline fvec4 operator+(float v1, fvec4 v2) {
+    return fvec4(v1)+v2;
+}
+static inline fvec4 operator-(float v1, fvec4 v2) {
+    return fvec4(v1)-v2;
+}
+static inline fvec4 operator*(float v1, fvec4 v2) {
+    return fvec4(v1)*v2;
+}
+static inline fvec4 operator/(float v1, fvec4 v2) {
+    return fvec4(v1)/v2;
+}
 #endif /*OPENMM_VECTORIZE_H_*/
--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
@@ -13,19 +13,25 @@ class OPENMM_EXPORT_CPU CpuNeighborList {
 public:
    class ThreadData;
    class VoxelHash;
+    static const int BlockSize;
    CpuNeighborList();
    ~CpuNeighborList();
    void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance);
-    const std::vector<std::pair<int, int> >& getNeighbors();
+    int getNumBlocks() const;
+    const std::vector<int>& getSortedAtoms() const;
+    const std::vector<int>& getBlockNeighbors(int blockIndex) const;
+    const std::vector<char>& getBlockExclusions(int blockIndex) const;
    /**
     * This routine contains the code executed by each thread.
     */
-    void runThread(int index, std::vector<std::pair<int, int> >& threadNeighbors);
+    void runThread(int index);
 private:
    bool isDeleted;
    int numThreads, waitCount;
-    std::vector<std::pair<int, int> > neighbors;
+    std::vector<int> sortedAtoms;
+    std::vector<std::vector<int> > blockNeighbors;
+    std::vector<std::vector<char> > blockExclusions;
    std::vector<pthread_t> thread;
    std::vector<ThreadData*> threadData;
    pthread_cond_t startCondition, endCondition;

--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -25,6 +25,7 @@
 #ifndef OPENMM_CPU_NONBONDED_FORCE_H__
 #define OPENMM_CPU_NONBONDED_FORCE_H__
+#include "CpuNeighborList.h"
 #include "ReferencePairIxn.h"
 #include "openmm/internal/vectorize.h"
 #include <pthread.h>
@@ -33,6 +34,8 @@
 #include <vector>
 // ---------------------------------------------------------------------------------------
+namespace OpenMM {
 class CpuNonbondedForce {
    public:
        class ThreadData;
@@ -63,7 +66,7 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */
-      void setUseCutoff(float distance, const std::vector<std::pair<int, int> >& neighbors, float solventDielectric);
+      void setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric);
      /**---------------------------------------------------------------------------------------
@@ -127,9 +130,9 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */
-      void calculateReciprocalIxn(int numberOfAtoms, float* posq, std::vector<OpenMM::RealVec>& atomCoordinates,
+      void calculateReciprocalIxn(int numberOfAtoms, float* posq, std::vector<RealVec>& atomCoordinates,
                            const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
-                            std::vector<OpenMM::RealVec>& forces, float* totalEnergy) const;
+                            std::vector<RealVec>& forces, float* totalEnergy) const;
      /**---------------------------------------------------------------------------------------
@@ -159,14 +162,14 @@ private:
        bool periodic;
        bool ewald;
        bool pme;
-        const std::vector<std::pair<int, int> >* neighborList;
+        const CpuNeighborList* neighborList;
        float periodicBoxSize[3];
        float cutoffDistance, switchingDistance;
        float krf, crf;
        float alphaEwald;
        int numRx, numRy, numRz;
        int meshDim[3];
-        std::vector<float> ewaldScaleX, ewaldScaleY, ewaldScaleDeriv;
+        std::vector<float> ewaldScaleTable;
        float ewaldDX, ewaldDXInv;
        bool isDeleted;
        int numThreads, waitCount;
@@ -199,16 +202,27 @@ private:
      /**---------------------------------------------------------------------------------------
-         Calculate LJ Coulomb pair ixn between two atoms
+         Calculate all the interactions for one atom block.
-         @param atom1            the index of the first atom
+         @param blockIndex       the index of the atom block
-         @param atom2            the index of the second atom
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+         --------------------------------------------------------------------------------------- */
+      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      /**---------------------------------------------------------------------------------------
+         Calculate all the interactions for one atom block.
+         @param blockIndex       the index of the atom block
         @param forces           force array (forces added)
         @param totalEnergy      total energy
         --------------------------------------------------------------------------------------- */
-      void calculateOneEwaldIxn(int atom1, int atom2, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
      /**
       * Compute the displacement and squared distance between two points, optionally using
@@ -219,7 +233,7 @@ private:
      /**
       * Compute a fast approximation to erfc(x).
       */
-      static float erfcApprox(float x);
+      static fvec4 erfcApprox(fvec4 x);
      /**
       * Create a lookup table for the scale factor used with Ewald and PME.
@@ -229,9 +243,11 @@ private:
      /**
       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
       */
-      float ewaldScaleFunction(float x);
+      fvec4 ewaldScaleFunction(fvec4 x);
 };
+} // namespace OpenMM
 // ---------------------------------------------------------------------------------------
 #endif // OPENMM_CPU_NONBONDED_FORCE_H__
--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -234,7 +234,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding);
            lastPositions = posData;
        }
-        nonbonded.setUseCutoff(nonbondedCutoff, neighborList.getNeighbors(), rfDielectric);
+        nonbonded.setUseCutoff(nonbondedCutoff, neighborList, rfDielectric);
    }
    if (periodic || ewald || pme) {
        double minAllowedSize = 1.999999*nonbondedCutoff;

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
 #include "CpuNeighborList.h"
 #include "openmm/internal/hardware.h"
 #include "openmm/internal/vectorize.h"
+#include "hilbert.h"
 #include <algorithm>
 #include <set>
 #include <map>
@@ -11,6 +12,8 @@ using namespace std;
 namespace OpenMM {
+const int CpuNeighborList::BlockSize = 4;
 class VoxelIndex 
 {
 public:
@@ -77,23 +80,22 @@ public:
        return VoxelIndex(x, y, z);
    }
-    void getNeighbors(vector<pair<int, int> >& neighbors, const VoxelItem& referencePoint, const vector<set<int> >& exclusions, float maxDistance) const {
+    void getNeighbors(vector<int>& neighbors, int blockIndex, fvec4 blockCenter, fvec4 blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int> blockAtoms, const float* atomLocations) const {
+        neighbors.resize(0);
-        // Loop over neighboring voxels
+        exclusions.resize(0);
-        // TODO use more clever selection of neighboring voxels
-        const int atomI = referencePoint.second;
-        const float* locationI = referencePoint.first;
-        fvec4 posI(locationI);
        fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
        fvec4 invBoxSize(1/periodicBoxSize[0], 1/periodicBoxSize[1], 1/periodicBoxSize[2], 0);
        float maxDistanceSquared = maxDistance * maxDistance;
+        float refineCutoff = maxDistance-max(max(blockWidth[0], blockWidth[1]), blockWidth[2]);
-        int dIndexX = int(maxDistance / voxelSizeX) + 1; // How may voxels away do we have to look?
+        float refineCutoffSquared = refineCutoff*refineCutoff;
-        int dIndexY = int(maxDistance / voxelSizeY) + 1;
-        int dIndexZ = int(maxDistance / voxelSizeZ) + 1;
+        int dIndexX = int((maxDistance+blockWidth[0])/voxelSizeX) + 1; // How may voxels away do we have to look?
-        VoxelIndex centerVoxelIndex = getVoxelIndex(locationI);
+        int dIndexY = int((maxDistance+blockWidth[1])/voxelSizeY) + 1;
+        int dIndexZ = int((maxDistance+blockWidth[2])/voxelSizeZ) + 1;
+        float centerPos[4];
+        blockCenter.store(centerPos);
+        VoxelIndex centerVoxelIndex = getVoxelIndex(centerPos);
        int lastx = centerVoxelIndex.x+dIndexX;
        int lasty = centerVoxelIndex.y+dIndexY;
        int lastz = centerVoxelIndex.z+dIndexZ;
@@ -102,6 +104,7 @@ public:
            lasty = min(lasty, centerVoxelIndex.y-dIndexY+ny-1);
            lastz = min(lastz, centerVoxelIndex.z-dIndexZ+nz-1);
        }
+        int lastSortedIndex = BlockSize*(blockIndex+1);
        VoxelIndex voxelIndex(0, 0, 0);
        for (int x = centerVoxelIndex.x - dIndexX; x <= lastx; ++x) {
            voxelIndex.x = x;
@@ -120,27 +123,52 @@ public:
                        continue; // no such voxel; skip
                    const Voxel& voxel = voxelEntry->second;
                    for (Voxel::const_iterator itemIter = voxel.begin(); itemIter != voxel.end(); ++itemIter) {
-                        const int atomJ = itemIter->second;
+                        const int sortedIndex = itemIter->second;
                        // Avoid duplicate entries.
-                        if (atomJ >= atomI)
+                        if (sortedIndex >= lastSortedIndex)
                            break;
-                        fvec4 posJ(itemIter->first);
+                        fvec4 atomPos(itemIter->first);
-                        fvec4 delta = posJ-posI;
+                        fvec4 delta = atomPos-centerPos;
                        if (usePeriodic) {
                            fvec4 base = round(delta*invBoxSize)*boxSize;
                            delta = delta-base;
                        }
+                        delta = max(0.0f, abs(delta)-blockWidth);
                        float dSquared = dot3(delta, delta);
                        if (dSquared > maxDistanceSquared)
                            continue;
-                        // Ignore exclusions.
+                        if (dSquared > refineCutoffSquared) {
-                        if (exclusions[atomI].find(atomJ) != exclusions[atomI].end())
+                            // The distance is large enough that there might not be any actual interactions.
+                            // Check individual atom pairs to be sure.
+                            bool any = false;
+                            for (int k = 0; k < (int) blockAtoms.size(); k++) {
+                                fvec4 pos1(&atomLocations[4*blockAtoms[k]]);
+                                delta = atomPos-pos1;
+                                if (usePeriodic) {
+                                    fvec4 base = round(delta*invBoxSize)*boxSize;
+                                    delta = delta-base;
+                                }
+                                float r2 = dot3(delta, delta);
+                                if (r2 < maxDistanceSquared) {
+                                    any = true;
+                                    break;
+                                }
+                            }
+                            if (!any)
                                continue;
+                        }
+                        // Add this atom to the list of neighbors.
-                        neighbors.push_back(make_pair(atomI, atomJ));
+                        neighbors.push_back(sortedAtoms[sortedIndex]);
+                        if (sortedIndex < BlockSize*blockIndex)
+                            exclusions.push_back(0);
+                        else
+                            exclusions.push_back(0xF & (0xF<<(sortedIndex-BlockSize*blockIndex)));
                    }
                }
            }
@@ -161,12 +189,11 @@ public:
    }
    int index;
    CpuNeighborList& owner;
-    vector<pair<int, int> > threadNeighbors;
 };
 static void* threadBody(void* args) {
    CpuNeighborList::ThreadData& data = *reinterpret_cast<CpuNeighborList::ThreadData*>(args);
-    data.owner.runThread(data.index, data.threadNeighbors);
+    data.owner.runThread(data.index);
    delete &data;
    return 0;
 }
@@ -204,6 +231,45 @@ CpuNeighborList::~CpuNeighborList() {
 void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance) {
+    int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
+    blockNeighbors.resize(numBlocks);
+    blockExclusions.resize(numBlocks);
+    sortedAtoms.resize(numAtoms);
+    // Sort the atoms based on a Hilbert curve.
+    float minx = atomLocations[0], maxx = atomLocations[0];
+    float miny = atomLocations[1], maxy = atomLocations[1];
+    float minz = atomLocations[2], maxz = atomLocations[2];
+    for (int i = 0; i < numAtoms; i++) {
+        const float* pos = &atomLocations[4*i];
+        if (pos[0] < minx)
+            minx = pos[0];
+        if (pos[1] < miny)
+            miny = pos[1];
+        if (pos[2] < minz)
+            minz = pos[2];
+        if (pos[0] > maxx)
+            maxx = pos[0];
+        if (pos[1] > maxy)
+            maxy = pos[1];
+        if (pos[2] > maxz)
+            maxz = pos[2];
+    }
+    float binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0f;
+    float invBinWidth = 1.0f/binWidth;
+    vector<pair<int, int> > atomBins(numAtoms);
+    bitmask_t coords[3];
+    for (int i = 0; i < numAtoms; i++) {
+        const float* pos = &atomLocations[4*i];
+        coords[0] = (bitmask_t) ((pos[0]-minx)*invBinWidth);
+        coords[1] = (bitmask_t) ((pos[1]-miny)*invBinWidth);
+        coords[2] = (bitmask_t) ((pos[2]-minz)*invBinWidth);
+        int bin = (int) hilbert_c2i(3, 8, coords);
+        atomBins[i] = pair<int, int>(bin, i);
+    }
+    sort(atomBins.begin(), atomBins.end());
    // Build the voxel hash.
    float edgeSizeX, edgeSizeY, edgeSizeZ;
@@ -215,8 +281,11 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
        edgeSizeZ = 0.5f*periodicBoxSize[2]/floorf(periodicBoxSize[2]/maxDistance);
    }
    VoxelHash voxelHash(edgeSizeX, edgeSizeY, edgeSizeZ, periodicBoxSize, usePeriodic);
-    for (int i = 0; i < numAtoms; i++)
+    for (int i = 0; i < numAtoms; i++) {
-        voxelHash.insert(i, &atomLocations[4*i]);
+        int atomIndex = atomBins[i].second;
+        sortedAtoms[i] = atomIndex;
+        voxelHash.insert(i, &atomLocations[4*atomIndex]);
+    }
    // Record the parameters for the threads.
@@ -237,18 +306,37 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
        pthread_cond_wait(&endCondition, &lock);
    pthread_mutex_unlock(&lock);
-    // Combine the results from all the threads.
+    // Add padding atoms to fill up the last block.
-    neighbors.clear();
+    int numPadding = numBlocks*BlockSize-numAtoms;
-    for (int i = 0; i < numThreads; i++)
+    if (numPadding > 0) {
-        neighbors.insert(neighbors.end(), threadData[i]->threadNeighbors.begin(), threadData[i]->threadNeighbors.end());
+        char mask = (0xF0 >> numPadding) & 0xF;
+        for (int i = 0; i < numPadding; i++)
+            sortedAtoms.push_back(0);
+        vector<char>& exc = blockExclusions[blockExclusions.size()-1];
+        for (int i = 0; i < (int) exc.size(); i++)
+            exc[i] |= mask;
+    }
+}
+int CpuNeighborList::getNumBlocks() const {
+    return sortedAtoms.size()/BlockSize;
+}
+const std::vector<int>& CpuNeighborList::getSortedAtoms() const {
+    return sortedAtoms;
 }
-const vector<pair<int, int> >& CpuNeighborList::getNeighbors() {
+const std::vector<int>& CpuNeighborList::getBlockNeighbors(int blockIndex) const {
-    return neighbors;
+    return blockNeighbors[blockIndex];
 }
-void CpuNeighborList::runThread(int index, vector<pair<int, int> >& threadNeighbors) {
+const std::vector<char>& CpuNeighborList::getBlockExclusions(int blockIndex) const {
+    return blockExclusions[blockIndex];
+}
+void CpuNeighborList::runThread(int index) {
    while (true) {
        // Wait for the signal to start running.
@@ -262,9 +350,41 @@ void CpuNeighborList::runThread(int index, vector<pair<int, int> >& threadNeighb
        // Compute this thread's subset of neighbors.
-        threadNeighbors.clear();
+        int numBlocks = blockNeighbors.size();
-        for (int i = index; i < numAtoms; i += numThreads)
+        vector<int> blockAtoms;
-            voxelHash->getNeighbors(threadNeighbors, VoxelItem(&atomLocations[4*i], i), *exclusions, maxDistance);
+        for (int i = index; i < numBlocks; i += numThreads) {
+            {
+            int firstIndex = BlockSize*i;
+            int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
+            blockAtoms.resize(atomsInBlock);
+            for (int j = 0; j < atomsInBlock; j++)
+                blockAtoms[j] = sortedAtoms[firstIndex+j];
+            }
+            int firstIndex = BlockSize*i;
+            fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
+            fvec4 maxPos = minPos;
+            int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
+            for (int j = 1; j < atomsInBlock; j++) {
+                fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]);
+                minPos = min(minPos, pos);
+                maxPos = max(maxPos, pos);
+            }
+            voxelHash->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations);
+            // Record the exclusions for this block.
+            for (int j = 0; j < atomsInBlock; j++) {
+                const set<int>& atomExclusions = (*exclusions)[sortedAtoms[firstIndex+j]];
+                char mask = 1<<j;
+                for (int k = 0; k < (int) blockNeighbors[i].size(); k++) {
+                    int atomIndex = blockNeighbors[i][k];
+                    if (atomExclusions.find(atomIndex) != atomExclusions.end())
+                        blockExclusions[i][k] |= mask;
+                }
+            }
+        }
    }
 }

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -114,7 +114,7 @@ CpuNonbondedForce::~CpuNonbondedForce(){
     --------------------------------------------------------------------------------------- */
-  void CpuNonbondedForce::setUseCutoff(float distance, const vector<pair<int, int> >& neighbors, float solventDielectric) {
+  void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
    cutoff = true;
    cutoffDistance = distance;
@@ -200,23 +200,22 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
 void CpuNonbondedForce::tabulateEwaldScaleFactor() {
    ewaldDX = cutoffDistance/(NUM_TABLE_POINTS-2);
    ewaldDXInv = 1.0f/ewaldDX;
-    vector<double> x(NUM_TABLE_POINTS);
+    vector<double> x(NUM_TABLE_POINTS+1);
-    vector<double> y(NUM_TABLE_POINTS);
+    vector<double> y(NUM_TABLE_POINTS+1);
    vector<double> deriv;
-    for (int i = 0; i < NUM_TABLE_POINTS; i++) {
+    for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
        double r = i*cutoffDistance/(NUM_TABLE_POINTS-2);
        double alphaR = alphaEwald*r;
        x[i] = r;
        y[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
    }
    SplineFitter::createNaturalSpline(x, y, deriv);
-    ewaldScaleX.resize(NUM_TABLE_POINTS);
+    ewaldScaleTable.resize(4*NUM_TABLE_POINTS);
-    ewaldScaleY.resize(NUM_TABLE_POINTS);
-    ewaldScaleDeriv.resize(NUM_TABLE_POINTS);
    for (int i = 0; i < NUM_TABLE_POINTS; i++) {
-        ewaldScaleX[i] = (float) x[i];
+        ewaldScaleTable[4*i] = (float) y[i];
-        ewaldScaleY[i] = (float) y[i];
+        ewaldScaleTable[4*i+1] = (float) y[i+1];
-        ewaldScaleDeriv[i] = (float) (deriv[i]*ewaldDX*ewaldDX/6);
+        ewaldScaleTable[4*i+2] = (float) (deriv[i]*ewaldDX*ewaldDX/6);
+        ewaldScaleTable[4*i+3] = (float) (deriv[i+1]*ewaldDX*ewaldDX/6);
    }
 }
@@ -384,7 +383,7 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
                    float inverseR  = 1/r;
                    float chargeProd = ONE_4PI_EPS0*posq[4*ii+3]*posq[4*jj+3];
                    float alphaR = alphaEwald*r;
-                    float erfcAlphaR = erfcApprox(alphaR);
+                    float erfcAlphaR = erfcApprox(alphaR)[0];
                    float dEdR      = (float) (chargeProd * inverseR * inverseR * inverseR);
                          dEdR      = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
                    fvec4 result = deltaR*dEdR;
@@ -424,18 +423,14 @@ void CpuNonbondedForce::runThread(int index, vector<float>& threadForce, double&
        if (ewald || pme) {
            // Compute the interactions from the neighbor list.
-            for (int i = index; i < (int) neighborList->size(); i += numThreads) {
+            for (int i = index; i < neighborList->getNumBlocks(); i += numThreads)
-                pair<int, int> pair = (*neighborList)[i];
+                calculateBlockEwaldIxn(i, &threadForce[0], energyPtr, boxSize, invBoxSize);
-                calculateOneEwaldIxn(pair.first, pair.second, &threadForce[0], energyPtr, boxSize, invBoxSize);
-            }
        }
        else if (cutoff) {
            // Compute the interactions from the neighbor list.
-            for (int i = index; i < (int) neighborList->size(); i += numThreads) {
+            for (int i = index; i < neighborList->getNumBlocks(); i += numThreads)
-                pair<int, int> pair = (*neighborList)[i];
+                calculateBlockIxn(i, &threadForce[0], energyPtr, boxSize, invBoxSize);
-                calculateOneIxn(pair.first, pair.second, &threadForce[0], energyPtr, boxSize, invBoxSize);
-            }
        }
        else {
            // Loop over all atom pairs
@@ -503,49 +498,196 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
    (fvec4(forces+4*jj)-result).store(forces+4*jj);
  }
-void CpuNonbondedForce::calculateOneEwaldIxn(int ii, int jj, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    fvec4 deltaR;
+    // Load the positions and parameters of the atoms in the block.
-    fvec4 posI(posq+4*ii);
-    fvec4 posJ(posq+4*jj);
+    int blockAtom[4];
-    float r2;
+    fvec4 blockAtomPosq[4];
-    getDeltaR(posJ, posI, deltaR, r2, true, boxSize, invBoxSize);
+    fvec4 blockAtomForce[4];
-    if (r2 < cutoffDistance*cutoffDistance) {
+    for (int i = 0; i < 4; i++) {
-        float r         = sqrtf(r2);
+        blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
-        float inverseR  = 1/r;
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
-        float switchValue = 1, switchDeriv = 0;
+        blockAtomForce[i] = fvec4(0.0f);
-        if (useSwitch && r > switchingDistance) {
-            float t = (r-switchingDistance)/(cutoffDistance-switchingDistance);
-            switchValue = 1+t*t*t*(-10+t*(15-t*6));
-            switchDeriv = t*t*(-30+t*(60-t*30))/(cutoffDistance-switchingDistance);
    }
-        float chargeProd = ONE_4PI_EPS0*posq[4*ii+3]*posq[4*jj+3];
+    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
-        float dEdR      = chargeProd*inverseR*ewaldScaleFunction(r);
+    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
-        float sig       = atomParameters[ii].first +  atomParameters[jj].first;
+    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-        float sig2      = inverseR*sig;
+    // Loop over neighbors for this block.
+    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+    float blockAtomR2[4];
+    bool include[4];
+    fvec4 blockAtomDelta[4];
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        int atom = neighbors[i];
+        fvec4 atomPosq(posq+4*atom);
+        // Compute the distances to the block atoms.
+        bool any = false;
+        for (int j = 0; j < 4; j++) {
+            getDeltaR(atomPosq, blockAtomPosq[j], blockAtomDelta[j], blockAtomR2[j], periodic, boxSize, invBoxSize);
+            include[j] = (((exclusions[i]>>j)&1) == 0 && (!cutoff || blockAtomR2[j] < cutoffDistance*cutoffDistance));
+            any |= include[j];
+        }
+        if (!any)
+            continue; // No interactions to compute.
+        // Compute the interactions.
+        fvec4 r2(blockAtomR2);
+        fvec4 r = sqrt(r2);
+        fvec4 inverseR = fvec4(1.0f)/r;
+        fvec4 switchValue(1.0f), switchDeriv(0.0f);
+        if (useSwitch) {
+            fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
+            switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+            switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
+        }
+        fvec4 sig = blockAtomSigma+atomParameters[atom].first;
+        fvec4 sig2 = inverseR*sig;
        sig2 *= sig2;
-        float sig6      = sig2*sig2*sig2;
+        fvec4 sig6 = sig2*sig2*sig2;
-        float eps       = atomParameters[ii].second*atomParameters[jj].second;
+        fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
-              dEdR     += switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
+        fvec4 dEdR = switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
+        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
+        if (cutoff)
+            dEdR += chargeProd*(inverseR-2.0f*krf*r2);
+        else
+            dEdR += chargeProd*inverseR;
        dEdR *= inverseR*inverseR;
-        float energy = eps*(sig6-1.0f)*sig6;
+        fvec4 energy = eps*(sig6-1.0f)*sig6;
        if (useSwitch) {
            dEdR -= energy*switchDeriv*inverseR;
            energy *= switchValue;
        }
-        // accumulate forces
+        // Accumulate energies.
-        fvec4 result = deltaR*dEdR;
+        if (totalEnergy) {
-        (fvec4(forces+4*ii)+result).store(forces+4*ii);
+            if (cutoff)
-        (fvec4(forces+4*jj)-result).store(forces+4*jj);
+                energy += chargeProd*(inverseR+krf*r2-crf);
+            else
+                energy += chargeProd*inverseR;
+            for (int j = 0; j < 4; j++)
+                if (include[j])
+                    *totalEnergy += energy[j];
+        }
-        // accumulate energies
+        // Accumulate forces.
+        fvec4 atomForce(forces+4*atom);
+        for (int j = 0; j < 4; j++) {
+            if (include[j]) {
+                fvec4 result = blockAtomDelta[j]*dEdR[j];
+                blockAtomForce[j] += result;
+                atomForce -= result;
+            }
+        }
+        atomForce.store(forces+4*atom);
+    }
+    // Record the forces on the block atoms.
+    for (int j = 0; j < 4; j++)
+        (fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
+  }
+void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Load the positions and parameters of the atoms in the block.
+    int blockAtom[4];
+    fvec4 blockAtomPosq[4];
+    fvec4 blockAtomForce[4];
+    for (int i = 0; i < 4; i++) {
+        blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+        blockAtomForce[i] = fvec4(0.0f);
+    }
+    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
+    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
+    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
+    // Loop over neighbors for this block.
+    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+    float blockAtomR2[4];
+    bool include[4];
+    fvec4 blockAtomDelta[4];
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        int atom = neighbors[i];
+        fvec4 atomPosq(posq+4*atom);
+        // Compute the distances to the block atoms.
+        bool any = false;
+        for (int j = 0; j < 4; j++) {
+            getDeltaR(atomPosq, blockAtomPosq[j], blockAtomDelta[j], blockAtomR2[j], periodic, boxSize, invBoxSize);
+            include[j] = (((exclusions[i]>>j)&1) == 0 && blockAtomR2[j] < cutoffDistance*cutoffDistance);
+            any |= include[j];
+        }
+        if (!any)
+            continue; // No interactions to compute.
+        // Compute the interactions.
+        fvec4 r2(blockAtomR2);
+        fvec4 r = sqrt(r2);
+        fvec4 inverseR = fvec4(1.0f)/r;
+        fvec4 switchValue(1.0f), switchDeriv(0.0f);
+        if (useSwitch) {
+            fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
+            switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+            switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
+        }
+        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
+        fvec4 dEdR = chargeProd*inverseR*ewaldScaleFunction(r);
+        fvec4 sig = blockAtomSigma+atomParameters[atom].first;
+        fvec4 sig2 = inverseR*sig;
+        sig2 *= sig2;
+        fvec4 sig6 = sig2*sig2*sig2;
+        fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
+        dEdR += switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
+        dEdR *= inverseR*inverseR;
+        fvec4 energy = eps*(sig6-1.0f)*sig6;
+        if (useSwitch) {
+            dEdR -= energy*switchDeriv*inverseR;
+            energy *= switchValue;
+        }
+        // Accumulate energies.
        if (totalEnergy) {
-            energy += (float) (chargeProd*inverseR*erfcApprox(alphaEwald*r));
+            energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
-            *totalEnergy += energy;
+            for (int j = 0; j < 4; j++)
+                if (include[j])
+                    *totalEnergy += energy[j];
        }
+        // Accumulate forces.
+        fvec4 atomForce(forces+4*atom);
+        for (int j = 0; j < 4; j++) {
+            if (include[j]) {
+                fvec4 result = blockAtomDelta[j]*dEdR[j];
+                blockAtomForce[j] += result;
+                atomForce -= result;
            }
+        }
+        atomForce.store(forces+4*atom);
+    }
+    // Record the forces on the block atoms.
+    for (int j = 0; j < 4; j++)
+        (fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
 }
 void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
@@ -557,24 +699,33 @@ void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& d
    r2 = dot3(deltaR, deltaR);
 }
-float CpuNonbondedForce::erfcApprox(float x) {
+fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
    // error of 3e-7.
-    float t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
+    fvec4 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
    t *= t;
    t *= t;
    t *= t;
    return 1.0f/(t*t);
 }
-float CpuNonbondedForce::ewaldScaleFunction(float x) {
+fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
-    int lower = (int) (x*ewaldDXInv);
+    float y[4];
-    int upper = lower+1;
+    fvec4 x1 = x*ewaldDXInv;
-    float a = (ewaldScaleX[upper]-x)*ewaldDXInv;
+    ivec4 index = floor(x1);
-    float b = 1.0f-a;
+    fvec4 coeff[4];
-    return a*ewaldScaleY[lower]+b*ewaldScaleY[upper]+((a*a*a-a)*ewaldScaleDeriv[lower] + (b*b*b-b)*ewaldScaleDeriv[upper]);
+    coeff[1] = x1-index;
+    coeff[0] = 1.0f-coeff[1];
+    coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
+    coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
+    _MM_TRANSPOSE4_PS(coeff[0], coeff[1], coeff[2], coeff[3]);
+    for (int i = 0; i < 4; i++) {
+        if (index[i] < NUM_TABLE_POINTS)
+            y[i] = dot4(coeff[i], fvec4(&ewaldScaleTable[4*index[i]]));
+    }
+    return fvec4(y);
 }
--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -39,6 +39,7 @@
 #include "sfmt/SFMT.h"
 #include <iostream>
 #include <set>
+#include <utility>
 #include <vector>
 using namespace OpenMM;
@@ -68,11 +69,20 @@ void testNeighborList(bool periodic) {
    // Convert the neighbor list to a set for faster lookup.
    set<pair<int, int> > neighbors;
-    for (int i = 0; i < (int) neighborList.getNeighbors().size(); i++) {
+    for (int i = 0; i < (int) neighborList.getSortedAtoms().size(); i++) {
-        pair<int, int> entry = neighborList.getNeighbors()[i];
+        int blockIndex = i/CpuNeighborList::BlockSize;
+        int indexInBlock = i-blockIndex*CpuNeighborList::BlockSize;
+        char mask = 1<<indexInBlock;
+        for (int j = 0; j < (int) neighborList.getBlockExclusions(blockIndex).size(); j++) {
+            if ((neighborList.getBlockExclusions(blockIndex)[j] & mask) == 0) {
+                int atom1 = neighborList.getSortedAtoms()[i];
+                int atom2 = neighborList.getBlockNeighbors(blockIndex)[j];
+                pair<int, int> entry = make_pair(min(atom1, atom2), max(atom1, atom2));
                ASSERT(neighbors.find(entry) == neighbors.end() && neighbors.find(make_pair(entry.second, entry.first)) == neighbors.end()); // No duplicates
                neighbors.insert(entry);
            }
+        }
+    }
    // Check each particle pair and figure out whether they should be in the neighbor list.
@@ -90,7 +100,8 @@ void testNeighborList(bool periodic) {
            if (dx*dx + dy*dy + dz*dz > cutoff*cutoff)
                shouldInclude = false;
            bool isIncluded = (neighbors.find(make_pair(i, j)) != neighbors.end() || neighbors.find(make_pair(j, i)) != neighbors.end());
-            ASSERT_EQUAL(shouldInclude, isIncluded);
+            if (shouldInclude)
+                ASSERT(isIncluded);
        }
 }