Merged 5.1Optimizations branch back to trunk

93c467b2 · Peter Eastman · f6d4557d · 93c467b2 · 93c467b2 · 93c467b2
Commit 93c467b2 authored Mar 22, 2013 by Peter Eastman
20 changed files
--- a/platforms/cuda/src/kernels/langevin.cu
+++ b/platforms/cuda/src/kernels/langevin.cu
@@ -95,8 +95,8 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
    if (blockIdx.x*blockDim.x+threadIdx.x == 0) {
        // Select the new step size.

-        mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
-        mixed newStepSize = sqrt(errorTol/totalError);
+        mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
+        mixed newStepSize = SQRT(errorTol/totalError);
        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)
            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
@@ -108,9 +108,9 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error

        // Recalculate the integration parameters.

-        mixed vscale = exp(-newStepSize/tau);
+        mixed vscale = EXP(-newStepSize/tau);
        mixed fscale = (1-vscale)*tau;
-        mixed noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
+        mixed noisescale = SQRT(2*kT/tau)*SQRT(0.5f*(1-vscale*vscale)*tau);
        params[VelScale] = vscale;
        params[ForceScale] = fscale;
        params[NoiseScale] = noisescale;

--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
-#define TILE_SIZE 32
 #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)

 typedef struct {
@@ -15,133 +14,245 @@ typedef struct {
 * Compute nonbonded interactions.
 */
 extern "C" __global__ void computeNonbonded(
-        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
-        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
-        unsigned int startTileIndex, unsigned int numTileIndices
+        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
+        const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
 #ifdef USE_CUTOFF
-        , const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
+        , const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms
 #endif
        PARAMETER_ARGUMENTS) {
-    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
-#ifdef USE_CUTOFF
-    const unsigned int numTiles = interactionCount[0];
-    unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
-    unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
-#else
-    const unsigned int numTiles = numTileIndices;
-    unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
-    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
-#endif
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+    const unsigned int tbx = threadIdx.x - tgx;
    real energy = 0.0f;
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
-    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
-    __shared__ int exclusionIndex[WARPS_PER_GROUP];
-#ifndef ENABLE_SHUFFLE
-    __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
-#endif
+
+    // First loop: process tiles that contain exclusions.
    
-    do {
-        // Extract the coordinates of this tile
-        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
-        const unsigned int tbx = threadIdx.x - tgx;
-        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
-        unsigned int x, y;
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
        real3 force = make_real3(0);
-        if (pos < end) {
-#ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
-                ushort2 tileIndices = tiles[pos];
-                x = tileIndices.x;
-                y = tileIndices.y;
-            }
-            else
+        unsigned int atom1 = x*TILE_SIZE + tgx;
+        real4 posq1 = posq[atom1];
+        LOAD_ATOM1_PARAMETERS
+#ifdef USE_EXCLUSIONS
+        tileflags excl = exclusions[pos*TILE_SIZE+tgx];
 #endif
-            {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                }
-            }
-            unsigned int atom1 = x*TILE_SIZE + tgx;
-            real4 posq1 = posq[atom1];
-            LOAD_ATOM1_PARAMETERS
-
-            // Locate the exclusion data for this tile.
+        const bool hasExclusions = true;
+        if (x == y) {
+            // This tile is on the diagonal.

+            const unsigned int localAtomIndex = threadIdx.x;
+            localData[localAtomIndex].x = posq1.x;
+            localData[localAtomIndex].y = posq1.y;
+            localData[localAtomIndex].z = posq1.z;
+            localData[localAtomIndex].q = posq1.w;
+            LOAD_LOCAL_PARAMETERS_FROM_1
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                int atom2 = tbx+j;
+                real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                real invR = RSQRT(r2);
+                real r = RECIP(invR);
+                LOAD_ATOM2_PARAMETERS
+                atom2 = y*TILE_SIZE+j;
+#ifdef USE_SYMMETRIC
+                real dEdR = 0.0f;
+#else
+                real3 dEdR1 = make_real3(0);
+                real3 dEdR2 = make_real3(0);
+#endif
 #ifdef USE_EXCLUSIONS
-            if (tgx < 2)
-                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-            if (tgx == 0)
-                exclusionIndex[localGroupIndex] = -1;
-            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-                if (exclusionIndices[i] == y)
-                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
+                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
+#endif
+                real tempEnergy = 0.0f;
+                COMPUTE_INTERACTION
+                energy += 0.5f*tempEnergy;
+#ifdef USE_SYMMETRIC
+                force.x -= delta.x*dEdR;
+                force.y -= delta.y*dEdR;
+                force.z -= delta.z*dEdR;
 #else
-            bool hasExclusions = false;
+                force.x -= dEdR1.x;
+                force.y -= dEdR1.y;
+                force.z -= dEdR1.z;
 #endif
-            if (pos >= end)
-                ; // This warp is done.
-            else if (x == y) {
-                // This tile is on the diagonal.
-
-                const unsigned int localAtomIndex = threadIdx.x;
-                localData[localAtomIndex].x = posq1.x;
-                localData[localAtomIndex].y = posq1.y;
-                localData[localAtomIndex].z = posq1.z;
-                localData[localAtomIndex].q = posq1.w;
-                LOAD_LOCAL_PARAMETERS_FROM_1
 #ifdef USE_EXCLUSIONS
-                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
+                excl >>= 1;
 #endif
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+            }
+        }
+        else {
+            // This is an off-diagonal tile.
+            
+            const unsigned int localAtomIndex = threadIdx.x;
+            unsigned int j = y*TILE_SIZE + tgx;
+            real4 tempPosq = posq[j];
+            localData[localAtomIndex].x = tempPosq.x;
+            localData[localAtomIndex].y = tempPosq.y;
+            localData[localAtomIndex].z = tempPosq.z;
+            localData[localAtomIndex].q = tempPosq.w;
+            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+            localData[localAtomIndex].fx = 0.0f;
+            localData[localAtomIndex].fy = 0.0f;
+            localData[localAtomIndex].fz = 0.0f;
 #ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1);
+            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
 #endif
-                    int atom2 = tbx+j;
-                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+            unsigned int tj = tgx;
+            for (j = 0; j < TILE_SIZE; j++) {
+                int atom2 = tbx+tj;
+                real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
 #ifdef USE_PERIODIC
-                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                if (r2 < CUTOFF_SQUARED) {
 #endif
-                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    LOAD_ATOM2_PARAMETERS
-                    atom2 = y*TILE_SIZE+j;
+                    atom2 = y*TILE_SIZE+tj;
 #ifdef USE_SYMMETRIC
                    real dEdR = 0.0f;
 #else
                    real3 dEdR1 = make_real3(0);
                    real3 dEdR2 = make_real3(0);
+#endif
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
 #endif
                    real tempEnergy = 0.0f;
                    COMPUTE_INTERACTION
-                    energy += 0.5f*tempEnergy;
+                    energy += tempEnergy;
 #ifdef USE_SYMMETRIC
-                    force.x -= delta.x*dEdR;
-                    force.y -= delta.y*dEdR;
-                    force.z -= delta.z*dEdR;
+                    delta *= dEdR;
+                    force.x -= delta.x;
+                    force.y -= delta.y;
+                    force.z -= delta.z;
+                    localData[tbx+tj].fx += delta.x;
+                    localData[tbx+tj].fy += delta.y;
+                    localData[tbx+tj].fz += delta.z;
 #else
                    force.x -= dEdR1.x;
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
+                    localData[tbx+tj].fx += dEdR2.x;
+                    localData[tbx+tj].fy += dEdR2.y;
+                    localData[tbx+tj].fz += dEdR2.z;
+#endif
+#ifdef USE_CUTOFF
+                }
 #endif
 #ifdef USE_EXCLUSIONS
-                    excl >>= 1;
+                excl >>= 1;
+#endif
+                tj = (tj + 1) & (TILE_SIZE - 1);
+            }
+        }
+
+        // Write results.
+
+        unsigned int offset = x*TILE_SIZE + tgx;
+        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
+        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
+        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
+        if (x != y) {
+            offset = y*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
+        }
+    }
+
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+
+#ifdef USE_CUTOFF
+    const unsigned int numTiles = interactionCount[0];
+    int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
+    int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
+#else
+    const unsigned int numTiles = numTileIndices;
+    int pos = startTileIndex+warp*numTiles/totalWarps;
+    int end = startTileIndex+(warp+1)*numTiles/totalWarps;
+#endif
+    int skipBase = 0;
+    int currentSkipIndex = tbx;
+    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
+    __shared__ int skipTiles[THREAD_BLOCK_SIZE];
+    skipTiles[threadIdx.x] = -1;
+    
+    while (pos < end) {
+        const bool hasExclusions = false;
+        real3 force = make_real3(0);
+        bool includeTile = true;
+
+        // Extract the coordinates of this tile.
+        
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;
+        }
+        else
 #endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+
+            // Skip over tiles that have exclusions, since they were already processed.
+
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
                }
+                else
+                    skipTiles[threadIdx.x] = end;
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
            }
-            else {
-                // This is an off-diagonal tile.
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos);
+        }
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;

-                const unsigned int localAtomIndex = threadIdx.x;
-                unsigned int j = y*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+            const unsigned int localAtomIndex = threadIdx.x;
+#ifdef USE_CUTOFF
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
+#endif
+            atomIndices[threadIdx.x] = j;
+            if (j < PADDED_NUM_ATOMS) {
                real4 tempPosq = posq[j];
                localData[localAtomIndex].x = tempPosq.x;
                localData[localAtomIndex].y = tempPosq.y;
@@ -151,195 +262,137 @@ extern "C" __global__ void computeNonbonded(
                localData[localAtomIndex].fx = 0.0f;
                localData[localAtomIndex].fy = 0.0f;
                localData[localAtomIndex].fz = 0.0f;
-#ifdef USE_CUTOFF
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-                if (!hasExclusions && flags != 0xFFFFFFFF) {
-                    if (flags == 0) {
-                        // No interactions in this tile.
-                    }
-                    else {
-                        // Compute only a subset of the interactions in this tile.
+            }
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.

-                        for (j = 0; j < TILE_SIZE; j++) {
-                            if ((flags&(1<<j)) != 0) {
-                                bool isExcluded = false;
-                                int atom2 = tbx+j;
-                                int bufferIndex = 3*threadIdx.x;
+                real4 blockCenterX = blockCenter[x];
+                posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                localData[localAtomIndex].x -= floor((localData[localAtomIndex].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                localData[localAtomIndex].y -= floor((localData[localAtomIndex].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                localData[localAtomIndex].z -= floor((localData[localAtomIndex].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj;
+                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    if (r2 < CUTOFF_SQUARED) {
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj];
 #ifdef USE_SYMMETRIC
-                                real dEdR = 0;
+                        real dEdR = 0.0f;
 #else
-                                real3 dEdR1 = make_real3(0);
-                                real3 dEdR2 = make_real3(0);
-#endif
-                                real tempEnergy = 0.0f;
-                                real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-                                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
-#ifdef USE_PERIODIC
-                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                                if (r2 < CUTOFF_SQUARED) {
+                        real3 dEdR1 = make_real3(0);
+                        real3 dEdR2 = make_real3(0);
 #endif
-                                    real invR = RSQRT(r2);
-                                    real r = RECIP(invR);
-                                    LOAD_ATOM2_PARAMETERS
-                                    atom2 = y*TILE_SIZE+j;
-                                    COMPUTE_INTERACTION
-                                    energy += tempEnergy;
-#ifdef USE_CUTOFF
-                                }
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
 #endif
-#ifdef ENABLE_SHUFFLE
-    #ifdef USE_SYMMETRIC
-                                delta *= dEdR;
-                                force.x -= delta.x;
-                                force.y -= delta.y;
-                                force.z -= delta.z;
-                                for (int i = 16; i >= 1; i /= 2) {
-                                    delta.x += __shfl_xor(delta.x, i, 32);
-                                    delta.y += __shfl_xor(delta.y, i, 32);
-                                    delta.z += __shfl_xor(delta.z, i, 32);
-                                }
-                                if (tgx == 0) {
-                                    localData[tbx+j].fx += delta.x;
-                                    localData[tbx+j].fy += delta.y;
-                                    localData[tbx+j].fz += delta.z;
-                                }
-    #else
-                                force.x -= dEdR1.x;
-                                force.y -= dEdR1.y;
-                                force.z -= dEdR1.z;
-                                for (int i = 16; i >= 1; i /= 2) {
-                                    dEdR2.x += __shfl_xor(dEdR2.x, i, 32);
-                                    dEdR2.y += __shfl_xor(dEdR2.y, i, 32);
-                                    dEdR2.z += __shfl_xor(dEdR2.z, i, 32);
-                                }
-                                if (tgx == 0) {
-                                    localData[tbx+j].fx += dEdR2.x;
-                                    localData[tbx+j].fy += dEdR2.y;
-                                    localData[tbx+j].fz += dEdR2.z;
-                                }
-    #endif
+                        real tempEnergy = 0.0f;
+                        COMPUTE_INTERACTION
+                        energy += tempEnergy;
+#ifdef USE_SYMMETRIC
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
+                        localData[tbx+tj].fx += delta.x;
+                        localData[tbx+tj].fy += delta.y;
+                        localData[tbx+tj].fz += delta.z;
 #else
-    #ifdef USE_SYMMETRIC
-                                delta *= dEdR;
-                                force.x -= delta.x;
-                                force.y -= delta.y;
-                                force.z -= delta.z;
-                                tempBuffer[bufferIndex] = delta.x;
-                                tempBuffer[bufferIndex+1] = delta.y;
-                                tempBuffer[bufferIndex+2] = delta.z;
-    #else
-                                force.x -= dEdR1.x;
-                                force.y -= dEdR1.y;
-                                force.z -= dEdR1.z;
-                                tempBuffer[bufferIndex] = dEdR2.x;
-                                tempBuffer[bufferIndex+1] = dEdR2.y;
-                                tempBuffer[bufferIndex+2] = dEdR2.z;
-    #endif
-
-                                // Sum the forces on atom2.
-
-                                if (tgx % 4 == 0) {
-                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
-                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
-                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
-                                }
-                                if (tgx == 0) {
-                                    localData[tbx+j].fx += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
-                                    localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
-                                    localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
-                                }
+                        force.x -= dEdR1.x;
+                        force.y -= dEdR1.y;
+                        force.z -= dEdR1.z;
+                        localData[tbx+tj].fx += dEdR2.x;
+                        localData[tbx+tj].fy += dEdR2.y;
+                        localData[tbx+tj].fz += dEdR2.z;
 #endif
-                            }
-                        }
                    }
+                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
-                else
+            }
+            else
 #endif
-                {
-                    // Compute the full set of interactions in this tile.
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.

-#ifdef USE_EXCLUSIONS
-                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
-                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
-#endif
-                    unsigned int tj = tgx;
-                    for (j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                        bool isExcluded = !(excl & 0x1);
-#endif
-                        int atom2 = tbx+tj;
-                        real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-                        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj;
+                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
 #ifdef USE_PERIODIC
-                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef USE_CUTOFF
-                        if (r2 < CUTOFF_SQUARED) {
+                    if (r2 < CUTOFF_SQUARED) {
 #endif
-                            real invR = RSQRT(r2);
-                            real r = RECIP(invR);
-                            LOAD_ATOM2_PARAMETERS
-                            atom2 = y*TILE_SIZE+tj;
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj];
 #ifdef USE_SYMMETRIC
-                            real dEdR = 0.0f;
+                        real dEdR = 0.0f;
 #else
-                            real3 dEdR1 = make_real3(0);
-                            real3 dEdR2 = make_real3(0);
+                        real3 dEdR1 = make_real3(0);
+                        real3 dEdR2 = make_real3(0);
 #endif
-                            real tempEnergy = 0.0f;
-                            COMPUTE_INTERACTION
-                            energy += tempEnergy;
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
+#endif
+                        real tempEnergy = 0.0f;
+                        COMPUTE_INTERACTION
+                        energy += tempEnergy;
 #ifdef USE_SYMMETRIC
-                            delta *= dEdR;
-                            force.x -= delta.x;
-                            force.y -= delta.y;
-                            force.z -= delta.z;
-                            localData[tbx+tj].fx += delta.x;
-                            localData[tbx+tj].fy += delta.y;
-                            localData[tbx+tj].fz += delta.z;
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
+                        localData[tbx+tj].fx += delta.x;
+                        localData[tbx+tj].fy += delta.y;
+                        localData[tbx+tj].fz += delta.z;
 #else
-                            force.x -= dEdR1.x;
-                            force.y -= dEdR1.y;
-                            force.z -= dEdR1.z;
-                            localData[tbx+tj].fx += dEdR2.x;
-                            localData[tbx+tj].fy += dEdR2.y;
-                            localData[tbx+tj].fz += dEdR2.z;
+                        force.x -= dEdR1.x;
+                        force.y -= dEdR1.y;
+                        force.z -= dEdR1.z;
+                        localData[tbx+tj].fx += dEdR2.x;
+                        localData[tbx+tj].fy += dEdR2.y;
+                        localData[tbx+tj].fz += dEdR2.z;
 #endif
 #ifdef USE_CUTOFF
-                        }
-#endif
-#ifdef USE_EXCLUSIONS
-                        excl >>= 1;
-#endif
-                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
+#endif
+                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }
-        }
-        
-        // Write results.
-        
-        if (pos < end) {
-            const unsigned int offset = x*TILE_SIZE + tgx;
-            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
-            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
-            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
-        }
-        if (pos < end && x != y) {
-            const unsigned int offset = y*TILE_SIZE + tgx;
-            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
-            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
-            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
+
+            // Write results.
+
+            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
+#ifdef USE_CUTOFF
+            unsigned int atom2 = atomIndices[threadIdx.x];
+#else
+            unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+            if (atom2 < PADDED_NUM_ATOMS) {
+                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
+                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
+                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
+            }
        }
        pos++;
-    } while (pos < end);
+    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
 }
--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
-extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4* __restrict__ pmeBsplineTheta, int2* __restrict__ pmeAtomGridIndex,
+extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
            real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-    extern __shared__ real3 bsplinesCache[];
-    real3* data = &bsplinesCache[threadIdx.x*PME_ORDER];
-    const real3 scale = make_real3(RECIP(PME_ORDER-1));
+    // Compute the index of the grid point each atom is associated with.
+    
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        real4 pos = posq[i];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
@@ -11,11 +10,40 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4*
        real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
-        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z);
        pmeAtomGridIndex[i] = make_int2(i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
+    }
+}
+
+extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, const int2* __restrict__ pmeAtomGridIndex) {
+    real3 data[PME_ORDER];
+    const real scale = RECIP(PME_ORDER-1);
+    
+    // Process the atoms in spatially sorted order.  This improves efficiency when writing
+    // the grid values.
+    
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        int atom = pmeAtomGridIndex[i].x;
+        real charge = posq[atom].w;
+        real3 force = make_real3(0);
+        real4 pos = posq[atom];
+        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+        real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
+                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
+        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
+                                   ((int) t.y) % GRID_SIZE_Y,
+                                   ((int) t.z) % GRID_SIZE_Z);
+
+        // Since we need the full set of thetas, it's faster to compute them here than load them
+        // from global memory.
+        
+        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
        data[PME_ORDER-1] = make_real3(0);
        data[1] = dr;
        data[0] = make_real3(1)-dr;
@@ -23,98 +51,46 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4*
            real div = RECIP(j-1);
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
-                data[j-k-1] = div*((dr+make_real3(k)) *data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
+                data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
            data[0] = div*(make_real3(1)-dr)*data[0];
        }
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(make_real3(1)-dr)*data[0];
-        for (int j = 0; j < PME_ORDER; j++) {
-            real3 d = data[j]; // Copy it as a workaround for a bug in CUDA 5.0
-            pmeBsplineTheta[i+j*NUM_ATOMS] = make_real4(d.x, d.y, d.z, pos.w);  // Storing the charge here improves cache coherency in the charge spreading kernel
-        }
-    }
-}
-
-/**
- * For each grid point, find the range of sorted atoms associated with that point.
- */
-extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-    int start = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x))/(blockDim.x*gridDim.x);
-    int end = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x+1))/(blockDim.x*gridDim.x);
-    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
-    for (int i = start; i < end; ++i) {
-        int2 atomData = pmeAtomGridIndex[i];
-        int gridIndex = atomData.y;
-        if (gridIndex != last) {
-            for (int j = last+1; j <= gridIndex; ++j)
-                pmeAtomRange[j] = i;
-            last = gridIndex;
-        }
-    }
-
-    // Fill in values beyond the last atom.
-    
-    if (blockIdx.x == gridDim.x-1 && threadIdx.x == blockDim.x-1) {
-        int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-        for (int j = last+1; j <= gridSize; ++j)
-            pmeAtomRange[j] = NUM_ATOMS;
-    }
-}
-
-#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
-extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
-        const real4* __restrict__ pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-    int ix = threadIdx.x/(PME_ORDER*PME_ORDER);
-    int remainder = threadIdx.x-ix*PME_ORDER*PME_ORDER;
-    int iy = remainder/PME_ORDER;
-    int iz = remainder-iy*PME_ORDER;
-    __shared__ real4 theta[PME_ORDER];
-    __shared__ real charge[BUFFER_SIZE];
-    __shared__ int basex[BUFFER_SIZE];
-    __shared__ int basey[BUFFER_SIZE];
-    __shared__ int basez[BUFFER_SIZE];
-    if (ix < PME_ORDER) {
-        for (int baseIndex = blockIdx.x*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += gridDim.x*BUFFER_SIZE) {
-            // Load the next block of atoms into the buffers.
+        
+        // Spread the charge from this atom onto each grid point.
+         
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int xbase = gridIndex.x+ix;
+            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+            xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
+            real dx = data[ix].x;
+            
+            for (int iy = 0; iy < PME_ORDER; iy++) {
+                int ybase = gridIndex.y+iy;
+                ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                ybase = xbase + ybase*GRID_SIZE_Z;
+                real dy = data[iy].y;
+                
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int zindex = gridIndex.z+iz;
+                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+                    int index = ybase + zindex;

-            int atomIndex = baseIndex+threadIdx.x;
-            if (atomIndex < NUM_ATOMS) {
-                real4 pos = posq[atomIndex];
-                charge[threadIdx.x] = pos.w;
-                pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
-                pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
-                pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-                basex[threadIdx.x] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X);
-                basey[threadIdx.x] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y);
-                basez[threadIdx.x] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
-            }
-            __syncthreads();
-            int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex);
-            for (int index = 0; index < lastIndex; index++) {
-                int atomIndex = index+baseIndex;
-                if (threadIdx.x < PME_ORDER)
-                    theta[threadIdx.x] = pmeBsplineTheta[atomIndex+threadIdx.x*NUM_ATOMS];
-                __syncthreads();
-                real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
-                int x = basex[index]+ix;
-                int y = basey[index]+iy;
-                int z = basez[index]+iz;
-                x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
-                y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
-                z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+                    real add = charge*dx*dy*data[iz].z;
 #ifdef USE_DOUBLE_PRECISION
-                unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
-                atomicAdd(&ulonglong_p[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z],  static_cast<unsigned long long>((long long) (add*0x100000000)));
+                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
+                    atomicAdd(&ulonglong_p[index],  static_cast<unsigned long long>((long long) (add*0x100000000)));
 #elif __CUDA_ARCH__ < 200
-                unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
-                int gridIndex = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z;
-                gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2);
-                atomicAdd(&ulonglong_p[gridIndex],  static_cast<unsigned long long>((long long) (add*0x100000000)));
+                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
+                    int gridIndex = index;
+                    gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2);
+                    atomicAdd(&ulonglong_p[gridIndex],  static_cast<unsigned long long>((long long) (add*0x100000000)));
 #else
-                atomicAdd(&originalPmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], add*EPSILON_FACTOR);
+                    atomicAdd(&originalPmeGrid[index], add*EPSILON_FACTOR);
 #endif
+                }
            }
        }
    }
@@ -182,48 +158,52 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ e
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
 
-	real energy = 0;
+    real energy = 0;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
        // real indices
        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
        int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
        int ky = remainder/(GRID_SIZE_Z);
        int kz = remainder-ky*(GRID_SIZE_Z);
-		int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
-		int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
-		int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
-		real mhx = mx*invPeriodicBoxSize.x;
-		real mhy = my*invPeriodicBoxSize.y;
-		real mhz = mz*invPeriodicBoxSize.z;
-		real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
-		real bx = pmeBsplineModuliX[kx];
-		real by = pmeBsplineModuliY[ky];
-		real bz = pmeBsplineModuliZ[kz];
-		real denom = m2*bx*by*bz;
-		real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+    	int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
+    	int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
+    	int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
+    	real mhx = mx*invPeriodicBoxSize.x;
+    	real mhy = my*invPeriodicBoxSize.y;
+    	real mhz = mz*invPeriodicBoxSize.z;
+    	real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+    	real bx = pmeBsplineModuliX[kx];
+    	real by = pmeBsplineModuliY[ky];
+    	real bz = pmeBsplineModuliZ[kz];
+    	real denom = m2*bx*by*bz;
+    	real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;

-		if(kz >= (GRID_SIZE_Z/2+1)) {
-			kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
-			ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
-			kz = GRID_SIZE_Z-kz;
-		} 
-		int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
-		real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];
-		if (kx != 0 || ky != 0 || kz != 0) {
-			energy += eterm*(grid.x*grid.x + grid.y*grid.y);
-		}
+    	if(kz >= (GRID_SIZE_Z/2+1)) {
+        	kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
+        	ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
+        	kz = GRID_SIZE_Z-kz;
+        } 
+    	int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
+    	real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];
+    	if (kx != 0 || ky != 0 || kz != 0) {
+        	energy += eterm*(grid.x*grid.x + grid.y*grid.y);
+        }
    }
 	energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
 }

 extern "C" __global__
 void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
-        real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, const int2* __restrict__ pmeAtomGridIndex) {
    real3 data[PME_ORDER];
    real3 ddata[PME_ORDER];
    const real scale = RECIP(PME_ORDER-1);
-     
-    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
+    
+    // Process the atoms in spatially sorted order.  This improves cache performance when loading
+    // the grid values.
+    
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        int atom = pmeAtomGridIndex[i].x;
        real3 force = make_real3(0);
        real4 pos = posq[atom];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
@@ -233,8 +213,8 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
-                                 ((int) t.y) % GRID_SIZE_Y,
-                                 ((int) t.z) % GRID_SIZE_Z);
+                                   ((int) t.y) % GRID_SIZE_Y,
+                                   ((int) t.z) % GRID_SIZE_Z);

        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.
@@ -243,7 +223,6 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
        data[PME_ORDER-1] = make_real3(0);
        data[1] = dr;
        data[0] = make_real3(1)-dr;
-
        for (int j = 3; j < PME_ORDER; j++) {
            real div = RECIP(j-1);
            data[j-1] = div*dr*data[j-2];
@@ -252,15 +231,13 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
            data[0] = div*(make_real3(1)-dr)*data[0];
        }
        ddata[0] = -data[0];
-         
        for (int j = 1; j < PME_ORDER; j++)
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
-        
        for (int j = 1; j < (PME_ORDER-1); j++)
            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(make_real3(1)-dr)*data[0];
-
+        
        // Compute the force on this atom.
         
        for (int ix = 0; ix < PME_ORDER; ix++) {

--- a/platforms/cuda/src/kernels/sort.cu
+++ b/platforms/cuda/src/kernels/sort.cu
@@ -3,7 +3,49 @@ __device__ KEY_TYPE getValue(DATA_TYPE value) {
 }

 extern "C" {
+
+/**
+ * Sort a list that is short enough to entirely fit in local memory.  This is executed as
+ * a single thread block.
+ */
+__global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) {
+    // Load the data into local memory.
    
+    extern __shared__ DATA_TYPE dataBuffer[];
+    for (int index = threadIdx.x; index < length; index += blockDim.x)
+        dataBuffer[index] = data[index];
+    __syncthreads();
+
+    // Perform a bitonic sort in local memory.
+
+    for (unsigned int k = 2; k < 2*length; k *= 2) {
+        for (unsigned int j = k/2; j > 0; j /= 2) {
+            for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
+                int ixj = i^j;
+                if (ixj > i && ixj < length) {
+                    DATA_TYPE value1 = dataBuffer[i];
+                    DATA_TYPE value2 = dataBuffer[ixj];
+                    bool ascending = ((i&k) == 0);
+                    for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
+                        ascending = ((i&mask) == 0 ? !ascending : ascending);
+                    KEY_TYPE lowKey  = (ascending ? getValue(value1) : getValue(value2));
+                    KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                    if (lowKey > highKey) {
+                        dataBuffer[i] = value2;
+                        dataBuffer[ixj] = value1;
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // Write the data back to global memory.
+
+    for (int index = threadIdx.x; index < length; index += blockDim.x)
+        data[index] = dataBuffer[index];
+}
+
 /**
 * Calculate the minimum and maximum value in the array to be sorted.  This kernel
 * is executed as a single work group.

--- a/platforms/cuda/src/kernels/torsionForce.cu
+++ b/platforms/cuda/src/kernels/torsionForce.cu
@@ -16,12 +16,12 @@ if (cosangle > 0.99f || cosangle < -0.99f) {
        theta = PI-theta;
 }
 else
-   theta = acos(cosangle);
+   theta = ACOS(cosangle);
 theta = (dot(v0, cp1) >= 0 ? theta : -theta);
 COMPUTE_FORCE
 real normCross1 = dot(cp0, cp0);
 real normSqrBC = dot(v1, v1);
-real normBC = sqrt(normSqrBC);
+real normBC = SQRT(normSqrBC);
 real normCross2 = dot(cp1, cp1);
 real dp = RECIP(normSqrBC);
 real4 ff = make_real4((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2);

--- a/platforms/cuda/src/kernels/verlet.cu
+++ b/platforms/cuda/src/kernels/verlet.cu
@@ -93,8 +93,8 @@ extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTo
        __syncthreads();
    }
    if (threadIdx.x == 0) {
-        mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
-        mixed newStepSize = sqrt(errorTol/totalError);
+        mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
+        mixed newStepSize = SQRT(errorTol/totalError);
        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)
            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.

--- a/platforms/cuda/tests/TestCudaNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaNonbondedForce.cpp
@@ -438,9 +438,9 @@ void testLargeSystem() {
    }
    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
 }
-
+/*
 void testBlockInteractions(bool periodic) {
-    const int blockSize = 32;
+    const int blockSize = CudaContext::TileSize;
    const int numBlocks = 100;
    const int numParticles = blockSize*numBlocks;
    const double cutoff = 1.0;
@@ -597,6 +597,8 @@ void testBlockInteractions(bool periodic) {
        if (!hasInteractions[i]) {
            unsigned int y = (unsigned int) std::floor(numBlocks+0.5-std::sqrt((numBlocks+0.5)*(numBlocks+0.5)-2*i));
            unsigned int x = (i-y*numBlocks+y*(y+1)/2);
+            if (x == y)
+                continue; // This block has exclusions, so it will not be in the neighbor list.
            for (int atom1 = 0; atom1 < blockSize; ++atom1) {
                double4 pos1 = posq[x*blockSize+atom1];
                for (int atom2 = 0; atom2 < blockSize; ++atom2) {
@@ -613,14 +615,14 @@ void testBlockInteractions(bool periodic) {
                }
            }
        }
-}
+}*/

 void testDispersionCorrection() {
    // Create a box full of identical particles.

    int gridSize = 5;
    int numParticles = gridSize*gridSize*gridSize;
-    double boxSize = gridSize*0.5;
+    double boxSize = gridSize*0.7;
    double cutoff = boxSize/3;
    System system;
    VerletIntegrator integrator(0.01);
@@ -822,8 +824,8 @@ int main(int argc, char* argv[]) {
        testCutoff14();
        testPeriodic();
        testLargeSystem();
-        testBlockInteractions(false);
-        testBlockInteractions(true);
+        //testBlockInteractions(false);
+        //testBlockInteractions(true);
        testDispersionCorrection();
        testChangingParameters();
        testParallelComputation(false);

--- a/platforms/cuda/tests/TestCudaSort.cpp
+++ b/platforms/cuda/tests/TestCudaSort.cpp
@@ -87,8 +87,7 @@ void verifySorting(vector<float> array) {
    ASSERT(elements1 == elements2);
 }

-void testUniformValues()
-{
+void testUniformValues() {
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);

@@ -98,8 +97,7 @@ void testUniformValues()
    verifySorting(array);
 }

-void testLogValues()
-{
+void testLogValues() {
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);

@@ -109,12 +107,23 @@ void testLogValues()
    verifySorting(array);
 }

+void testShortList() {
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    vector<float> array(500);
+    for (int i = 0; i < (int) array.size(); i++)
+        array[i] = (float) log(genrand_real2(sfmt));
+    verifySorting(array);
+}
+
 int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
            platform.setPropertyDefaultValue("CudaPrecision", string(argv[1]));
        testUniformValues();
        testLogValues();
+        testShortList();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/opencl/src/OpenCLBondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLBondedUtilities.cpp
@@ -99,43 +99,55 @@ void OpenCLBondedUtilities::initialize(const System& system) {
            numBuffers[i] = max(numBuffers[i], bufferCounter[i][j]);
    }
    
-    // Figure out how many force buffers will be required.
-    
-    for (int i = 0; i < numForces; i++)
-        numForceBuffers = max(numForceBuffers, numBuffers[i]);
-    int bufferLimit = max(numForceBuffers, (int) context.getPlatformData().contexts.size());
-    if (context.getNonbondedUtilities().getHasInteractions())
-        bufferLimit = max(bufferLimit, context.getNonbondedUtilities().getNumForceBuffers());
-    
    // For efficiency, we want to merge multiple forces into a single kernel - but only if that
-    // won't increase the number of force buffers.  Figure out sets of forces that can be merged.
+    // won't increase the number of force buffers.
    
-    vector<int> unmerged(numForces);
-    for (int i = 0; i < numForces; i++)
-        unmerged[i] = i;
-    for (int i = 0; i < numForces; i++)
-        for (int j = i-1; j >= 0; j--) {
-            if (numBuffers[unmerged[j]] <= numBuffers[unmerged[j+1]])
-                break;
-            int temp = unmerged[j+1];
-            unmerged[j+1] = unmerged[j];
-            unmerged[j] = temp;
-        }
-    while (unmerged.size() > 0) {
-        int sum = numBuffers[unmerged.back()];
-        int i;
-        for (i = 0; i < (int) unmerged.size()-1; i++) {
-            if (sum+numBuffers[unmerged[i]] > bufferLimit)
-                break;
-            sum += numBuffers[unmerged[i]];
-        }
+    if (context.getSupports64BitGlobalAtomics()) {
+        // Put all the forces in the same set.
+        
+        numForceBuffers = 1;
        forceSets.push_back(vector<int>());
-        for (int j = 0; j < i; j++)
-            forceSets.back().push_back(unmerged[j]);
-        forceSets.back().push_back(unmerged.back());
-        for (int j = 0; j < i; j++)
-            unmerged.erase(unmerged.begin());
-        unmerged.pop_back();
+        for (int i = 0; i < numForces; i++)
+            forceSets[0].push_back(i);
+    }
+    else {
+        // Figure out how many force buffers will be required.
+    
+        for (int i = 0; i < numForces; i++)
+            numForceBuffers = max(numForceBuffers, numBuffers[i]);
+        int bufferLimit = max(numForceBuffers, (int) context.getPlatformData().contexts.size());
+        if (context.getNonbondedUtilities().getHasInteractions())
+            bufferLimit = max(bufferLimit, context.getNonbondedUtilities().getNumForceBuffers());
+        
+        // Figure out sets of forces that can be merged.
+        
+        vector<int> unmerged(numForces);
+        for (int i = 0; i < numForces; i++)
+            unmerged[i] = i;
+        for (int i = 0; i < numForces; i++)
+            for (int j = i-1; j >= 0; j--) {
+                if (numBuffers[unmerged[j]] <= numBuffers[unmerged[j+1]])
+                    break;
+                int temp = unmerged[j+1];
+                unmerged[j+1] = unmerged[j];
+                unmerged[j] = temp;
+            }
+        while (unmerged.size() > 0) {
+            int sum = numBuffers[unmerged.back()];
+            int i;
+            for (i = 0; i < (int) unmerged.size()-1; i++) {
+                if (sum+numBuffers[unmerged[i]] > bufferLimit)
+                    break;
+                sum += numBuffers[unmerged[i]];
+            }
+            forceSets.push_back(vector<int>());
+            for (int j = 0; j < i; j++)
+                forceSets.back().push_back(unmerged[j]);
+            forceSets.back().push_back(unmerged.back());
+            for (int j = 0; j < i; j++)
+                unmerged.erase(unmerged.begin());
+            unmerged.pop_back();
+        }
    }

    // Update the buffer indices based on merged sets.
@@ -162,9 +174,13 @@ void OpenCLBondedUtilities::initialize(const System& system) {
        const vector<int>& set = *iter;
        int setSize = set.size();
        stringstream s;
+        s<<"#ifdef SUPPORTS_64_BIT_ATOMICS\n";
+        s<<"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
+        s<<"#endif\n";
        for (int i = 0; i < (int) prefixCode.size(); i++)
            s<<prefixCode[i];
-        s<<"__kernel void computeBondedForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, int groups";
+        string bufferType = (context.getSupports64BitGlobalAtomics() ? "long" : "real4");
+        s<<"__kernel void computeBondedForces(__global "<<bufferType<<"* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, int groups";
        for (int i = 0; i < setSize; i++) {
            int force = set[i];
            string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force]));
@@ -219,10 +235,17 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
    s<<computeForce<<"\n";
    for (int i = 0; i < numAtoms; i++) {
        s<<"    {\n";
-        s<<"    unsigned int offset = atom"<<(i+1)<<"+buffers"<<suffix[i]<<"*PADDED_NUM_ATOMS;\n";
-        s<<"    real4 force = forceBuffers[offset];\n";
-        s<<"    force.xyz += force"<<(i+1)<<".xyz;\n";
-        s<<"    forceBuffers[offset] = force;\n";
+        if (context.getSupports64BitGlobalAtomics()) {
+            s<<"    atom_add(&forceBuffers[atom"<<(i+1)<<"], (long) (force"<<(i+1)<<".x*0x100000000));\n";
+            s<<"    atom_add(&forceBuffers[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], (long) (force"<<(i+1)<<".y*0x100000000));\n";
+            s<<"    atom_add(&forceBuffers[atom"<<(i+1)<<"+2*PADDED_NUM_ATOMS], (long) (force"<<(i+1)<<".z*0x100000000));\n";
+        }
+        else {
+            s<<"    unsigned int offset = atom"<<(i+1)<<"+buffers"<<suffix[i]<<"*PADDED_NUM_ATOMS;\n";
+            s<<"    real4 force = forceBuffers[offset];\n";
+            s<<"    force.xyz += force"<<(i+1)<<".xyz;\n";
+            s<<"    forceBuffers[offset] = force;\n";
+        }
        s<<"    }\n";
    }
    s<<"}\n";
@@ -235,7 +258,10 @@ void OpenCLBondedUtilities::computeInteractions(int groups) {
        for (int i = 0; i < (int) forceSets.size(); i++) {
            int index = 0;
            cl::Kernel& kernel = kernels[i];
-            kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
+            if (context.getSupports64BitGlobalAtomics())
+                kernel.setArg<cl::Buffer>(index++, context.getLongForceBuffer().getDeviceBuffer());
+            else
+                kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
            kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
            kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
            index++;

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -97,6 +97,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
            // Try to figure out which device is the fastest.

            int bestSpeed = -1;
+            bool bestSupportsDouble = false;
            for (int i = 0; i < (int) devices.size(); i++) {
                if (platformVendor == "Apple" && devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD")
                    continue; // Don't use AMD GPUs on OS X due to serious bugs.
@@ -135,9 +136,11 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                    }
                }
                int speed = devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()*processingElementsPerComputeUnit*devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
-                if (maxSize >= minThreadBlockSize && speed > bestSpeed) {
+                bool supportsDouble = (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos);
+                if (maxSize >= minThreadBlockSize && speed > bestSpeed && (supportsDouble || !bestSupportsDouble)) {
                    deviceIndex = i;
                    bestSpeed = speed;
+                    bestSupportsDouble = supportsDouble;
                }
            }
        }
@@ -173,9 +176,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
            }
        }
        else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
-            // Disable 64 bit atomics.  A future version of the driver will support them, but until we can test that,
-            // it's safest not to use them.
-            supports64BitGlobalAtomics = false;
            if (device.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
                /// \todo Is 6 a good value for the OpenCL CPU device?
                // numThreadBlocksPerComputeUnit = ?;
@@ -190,14 +190,11 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                    // check for errors.
                    try {
 #ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
-                        // AMD has both 32 and 64 width SIMDs. Can determine by using:
-                        // simdWidth = device.getInfo<CL_DEVICE_WAVEFRONT_WIDTH_AMD>();
                        // Must catch cl:Error as will fail if runtime does not support queries.
-                        // However, the 32 width NVIDIA kernels do not have all the necessary
-                        // barriers and so will not work for AMD.
-                        // So for now leave default of 1 which will use the default kernels.

                        cl_uint simdPerComputeUnit = device.getInfo<CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD>();
+                        simdWidth = device.getInfo<CL_DEVICE_WAVEFRONT_WIDTH_AMD>();
+
                        // If the GPU has multiple SIMDs per compute unit then it is uses the scalar instruction
                        // set instead of the VLIW instruction set. It therefore needs more thread blocks per
                        // compute unit to hide memory latency.
@@ -226,6 +223,10 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
            compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
        if (supportsDoublePrecision)
            compilationDefines["SUPPORTS_DOUBLE_PRECISION"] = "";
+        if (simdWidth >= 32)
+            compilationDefines["SYNC_WARPS"] = "";
+        else
+            compilationDefines["SYNC_WARPS"] = "barrier(CLK_LOCAL_MEM_FENCE)";
        vector<cl::Device> contextDevices;
        contextDevices.push_back(device);
        cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[platformIndex](), 0};

--- a/platforms/opencl/src/OpenCLFFT3D.cpp
+++ b/platforms/opencl/src/OpenCLFFT3D.cpp
@@ -36,27 +36,24 @@ using namespace OpenMM;
 using namespace std;

 OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize) : context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
-    zkernel = createKernel(xsize, ysize, zsize);
-    xkernel = createKernel(ysize, zsize, xsize);
-    ykernel = createKernel(zsize, xsize, ysize);
+    zkernel = createKernel(xsize, ysize, zsize, zthreads);
+    xkernel = createKernel(ysize, zsize, xsize, xthreads);
+    ykernel = createKernel(zsize, xsize, ysize, ythreads);
 }

 void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) {
-    int maxSize = xkernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
-    if (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU)
-        maxSize = 1;
    zkernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
    zkernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
    zkernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(zkernel, xsize*ysize*zsize, min(zsize, (int) maxSize));
+    context.executeKernel(zkernel, xsize*ysize*zsize, zthreads);
    xkernel.setArg<cl::Buffer>(0, out.getDeviceBuffer());
    xkernel.setArg<cl::Buffer>(1, in.getDeviceBuffer());
    xkernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(xkernel, xsize*ysize*zsize, min(xsize, (int) maxSize));
+    context.executeKernel(xkernel, xsize*ysize*zsize, xthreads);
    ykernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
    ykernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
    ykernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(ykernel, xsize*ysize*zsize, min(ysize, (int) maxSize));
+    context.executeKernel(ykernel, xsize*ysize*zsize, ythreads);
 }

 int OpenCLFFT3D::findLegalDimension(int minimum) {
@@ -66,7 +63,7 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
        // Attempt to factor the current value.

        int unfactored = minimum;
-        for (int factor = 2; factor < 6; factor++) {
+        for (int factor = 2; factor < 8; factor++) {
            while (unfactored > 1 && unfactored%factor == 0)
                unfactored /= factor;
        }
@@ -76,9 +73,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
    }
 }

-cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
+cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads) {
    bool loopRequired = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
    stringstream source;
+    int blocksPerGroup = (loopRequired ? 1 : max(1, 256/zsize));
    int stage = 0;
    int L = zsize;
    int m = 1;
@@ -88,22 +86,85 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
    while (L > 1) {
        int input = stage%2;
        int output = 1-input;
+        int radix;
+        if (L%7 == 0)
+            radix = 7;
+        else if (L%5 == 0)
+            radix = 5;
+        else if (L%4 == 0)
+            radix = 4;
+        else if (L%3 == 0)
+            radix = 3;
+        else if (L%2 == 0)
+            radix = 2;
+        else
+            throw OpenMMException("Illegal size for FFT: "+context.intToString(zsize));
        source<<"{\n";
-        if (L%5 == 0) {
-            L = L/5;
-            source<<"// Pass "<<(stage+1)<<" (radix 5)\n";
-            if (loopRequired)
-                source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
-            else {
-                source<<"if (get_local_id(0) < "<<(L*m)<<") {\n";
-                source<<"int i = get_local_id(0);\n";
-            }
-            source<<"int j = i/"<<m<<";\n";
-            source<<"real2 c0 = data"<<input<<"[i];\n";
-            source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"real2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
-            source<<"real2 c4 = data"<<input<<"[i+"<<(4*L*m)<<"];\n";
+        L = L/radix;
+        source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n";
+        if (loopRequired) {
+            source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
+            source<<"int base = i;\n";
+        }
+        else {
+            source<<"if (get_local_id(0) < "<<(blocksPerGroup*L*m)<<") {\n";
+            source<<"int block = get_local_id(0)/"<<(L*m)<<";\n";
+            source<<"int i = get_local_id(0)-block*"<<(L*m)<<";\n";
+            source<<"int base = i+block*"<<zsize<<";\n";
+        }
+        source<<"int j = i/"<<m<<";\n";
+        if (radix == 7) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
+            source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
+            source<<"real2 c4 = data"<<input<<"[base+"<<(4*L*m)<<"];\n";
+            source<<"real2 c5 = data"<<input<<"[base+"<<(5*L*m)<<"];\n";
+            source<<"real2 c6 = data"<<input<<"[base+"<<(6*L*m)<<"];\n";
+            source<<"real2 d0 = c1+c6;\n";
+            source<<"real2 d1 = c1-c6;\n";
+            source<<"real2 d2 = c2+c5;\n";
+            source<<"real2 d3 = c2-c5;\n";
+            source<<"real2 d4 = c4+c3;\n";
+            source<<"real2 d5 = c4-c3;\n";
+            source<<"real2 d6 = d2+d0;\n";
+            source<<"real2 d7 = d5+d3;\n";
+            source<<"real2 b0 = c0+d6+d4;\n";
+            source<<"real2 b1 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)+cos(6*M_PI/7))/3-1)<<"*(d6+d4);\n";
+            source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n";
+            source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n";
+            source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n";
+            source<<"real2 b5 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
+            source<<"real2 b6 = -sign*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
+            source<<"real2 b7 = -sign*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
+            source<<"real2 b8 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
+            source<<"real2 t0 = b0+b1;\n";
+            source<<"real2 t1 = b2+b3;\n";
+            source<<"real2 t2 = b4-b3;\n";
+            source<<"real2 t3 = -b2-b4;\n";
+            source<<"real2 t4 = b6+b7;\n";
+            source<<"real2 t5 = b8-b7;\n";
+            source<<"real2 t6 = -b8-b6;\n";
+            source<<"real2 t7 = t0+t1;\n";
+            source<<"real2 t8 = t0+t2;\n";
+            source<<"real2 t9 = t0+t3;\n";
+            source<<"real2 t10 = (real2) (t4.y+b5.y, -(t4.x+b5.x));\n";
+            source<<"real2 t11 = (real2) (t5.y+b5.y, -(t5.x+b5.x));\n";
+            source<<"real2 t12 = (real2) (t6.y+b5.y, -(t6.x+b5.x));\n";
+            source<<"data"<<output<<"[base+6*j*"<<m<<"] = b0;\n";
+            source<<"data"<<output<<"[base+(6*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(7*L)<<"], t7-t10);\n";
+            source<<"data"<<output<<"[base+(6*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(7*L)<<"], t9-t12);\n";
+            source<<"data"<<output<<"[base+(6*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(7*L)<<"], t8+t11);\n";
+            source<<"data"<<output<<"[base+(6*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(7*L)<<"], t8-t11);\n";
+            source<<"data"<<output<<"[base+(6*j+5)*"<<m<<"] = multiplyComplex(w[j*"<<(5*zsize)<<"/"<<(7*L)<<"], t9+t12);\n";
+            source<<"data"<<output<<"[base+(6*j+6)*"<<m<<"] = multiplyComplex(w[j*"<<(6*zsize)<<"/"<<(7*L)<<"], t7+t10);\n";
+        }
+        else if (radix == 5) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
+            source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
+            source<<"real2 c4 = data"<<input<<"[base+"<<(4*L*m)<<"];\n";
            source<<"real2 d0 = c1+c4;\n";
            source<<"real2 d1 = c2+c3;\n";
            source<<"real2 d2 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c1-c4);\n";
@@ -116,80 +177,45 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
            string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
            source<<"real2 d9 = sign*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
            source<<"real2 d10 = sign*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
-            source<<"data"<<output<<"[i+4*j*"<<m<<"] = c0+d4;\n";
-            source<<"data"<<output<<"[i+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
-            source<<"data"<<output<<"[i+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
-            source<<"data"<<output<<"[i+(4*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(5*L)<<"], d8-d10);\n";
-            source<<"data"<<output<<"[i+(4*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(5*L)<<"], d7-d9);\n";
-            source<<"}\n";
-            m = m*5;
+            source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n";
+            source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
+            source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
+            source<<"data"<<output<<"[base+(4*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(5*L)<<"], d8-d10);\n";
+            source<<"data"<<output<<"[base+(4*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(5*L)<<"], d7-d9);\n";
        }
-        else if (L%4 == 0) {
-            L = L/4;
-            source<<"// Pass "<<(stage+1)<<" (radix 4)\n";
-            if (loopRequired)
-                source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
-            else {
-                source<<"if (get_local_id(0) < "<<(L*m)<<") {\n";
-                source<<"int i = get_local_id(0);\n";
-            }
-            source<<"int j = i/"<<m<<";\n";
-            source<<"real2 c0 = data"<<input<<"[i];\n";
-            source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"real2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
+        else if (radix == 4) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
+            source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
            source<<"real2 d0 = c0+c2;\n";
            source<<"real2 d1 = c0-c2;\n";
            source<<"real2 d2 = c1+c3;\n";
            source<<"real2 d3 = sign*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
-            source<<"data"<<output<<"[i+3*j*"<<m<<"] = d0+d2;\n";
-            source<<"data"<<output<<"[i+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
-            source<<"data"<<output<<"[i+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
-            source<<"data"<<output<<"[i+(3*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(4*L)<<"], d1-d3);\n";
-            source<<"}\n";
-            m = m*4;
+            source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n";
+            source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
+            source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
+            source<<"data"<<output<<"[base+(3*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(4*L)<<"], d1-d3);\n";
        }
-        else if (L%3 == 0) {
-            L = L/3;
-            source<<"// Pass "<<(stage+1)<<" (radix 3)\n";
-            if (loopRequired)
-                source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
-            else {
-                source<<"if (get_local_id(0) < "<<(L*m)<<") {\n";
-                source<<"int i = get_local_id(0);\n";
-            }
-            source<<"int j = i/"<<m<<";\n";
-            source<<"real2 c0 = data"<<input<<"[i];\n";
-            source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
+        else if (radix == 3) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
            source<<"real2 d0 = c1+c2;\n";
            source<<"real2 d1 = c0-0.5f*d0;\n";
            source<<"real2 d2 = sign*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
-            source<<"data"<<output<<"[i+2*j*"<<m<<"] = c0+d0;\n";
-            source<<"data"<<output<<"[i+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
-            source<<"data"<<output<<"[i+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
-            source<<"}\n";
-            m = m*3;
+            source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n";
+            source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
+            source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
        }
-        else if (L%2 == 0) {
-            L = L/2;
-            source<<"// Pass "<<(stage+1)<<" (radix 2)\n";
-            if (loopRequired)
-                source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
-            else {
-                source<<"if (get_local_id(0) < "<<(L*m)<<") {\n";
-                source<<"int i = get_local_id(0);\n";
-            }
-            source<<"int j = i/"<<m<<";\n";
-            source<<"real2 c0 = data"<<input<<"[i];\n";
-            source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"data"<<output<<"[i+j*"<<m<<"] = c0+c1;\n";
-            source<<"data"<<output<<"[i+(j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(2*L)<<"], c0-c1);\n";
-            source<<"}\n";
-            m = m*2;
+        else if (radix == 2) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"data"<<output<<"[base+j*"<<m<<"] = c0+c1;\n";
+            source<<"data"<<output<<"[base+(j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(2*L)<<"], c0-c1);\n";
        }
-        else
-            throw OpenMMException("Illegal size for FFT: "+context.intToString(zsize));
+        source<<"}\n";
+        m = m*radix;
        source<<"barrier(CLK_LOCAL_MEM_FENCE);\n";
        source<<"}\n";
        ++stage;
@@ -202,20 +228,22 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
        source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z];\n";
    }
    else
-        source<<"out[y*(ZSIZE*XSIZE)+get_local_id(0)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
+        source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
    source<<"barrier(CLK_GLOBAL_MEM_FENCE);";
    map<string, string> replacements;
    replacements["XSIZE"] = context.intToString(xsize);
    replacements["YSIZE"] = context.intToString(ysize);
    replacements["ZSIZE"] = context.intToString(zsize);
+    replacements["BLOCKS_PER_GROUP"] = context.intToString(blocksPerGroup);
    replacements["M_PI"] = context.doubleToString(M_PI);
    replacements["COMPUTE_FFT"] = source.str();
    replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
    cl::Kernel kernel(program, "execFFT");
-    int bufferSize = zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
+    int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
    kernel.setArg(3, bufferSize, NULL);
    kernel.setArg(4, bufferSize, NULL);
    kernel.setArg(5, bufferSize, NULL);
+    threads = (loopRequired ? 1 : blocksPerGroup*zsize);
    return kernel;
 }
--- a/platforms/opencl/src/OpenCLFFT3D.h
+++ b/platforms/opencl/src/OpenCLFFT3D.h
@@ -81,8 +81,9 @@ public:
     */
    static int findLegalDimension(int minimum);
 private:
-    cl::Kernel createKernel(int xsize, int ysize, int zsize);
+    cl::Kernel createKernel(int xsize, int ysize, int zsize, int& threads);
    int xsize, ysize, zsize;
+    int xthreads, ythreads, zthreads;
    OpenCLContext& context;
    cl::Kernel xkernel, ykernel, zkernel;
 };

--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
@@ -99,7 +99,7 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
        random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
        ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
-        ccmaConvergedBuffer(NULL), vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
+        vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
    // Create workspace arrays.

@@ -479,8 +479,6 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
        ccmaNumAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
        ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
        ccmaConverged = OpenCLArray::create<cl_int>(context, 2, "CcmaConverged");
-        ccmaConvergedBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, 2*sizeof(cl_int));
-        ccmaConvergedMemory = (cl_int*) context.getQueue().enqueueMapBuffer(*ccmaConvergedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 2*sizeof(cl_int));
        vector<mm_int2> atomsVec(ccmaAtoms->getSize());
        vector<cl_int> atomConstraintsVec(ccmaAtomConstraints->getSize());
        vector<cl_int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
@@ -660,24 +658,28 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
    defines["NUM_OUT_OF_PLANE"] = context.intToString(numOutOfPlane);
    cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines);
    vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites");
-    vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
-    setPosqCorrectionArg(context, vsitePositionKernel, 1);
-    vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer());
+    int index = 0;
+    vsitePositionKernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
+    if (context.getUseMixedPrecision())
+        vsitePositionKernel.setArg<cl::Buffer>(index++, context.getPosqCorrection().getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(index++, vsite2AvgAtoms->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(index++, vsite2AvgWeights->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(index++, vsite3AvgAtoms->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(index++, vsite3AvgWeights->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneAtoms->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneWeights->getDeviceBuffer());
    vsiteForceKernel = cl::Kernel(vsiteProgram, "distributeForces");
-    vsiteForceKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
-    setPosqCorrectionArg(context, vsiteForceKernel, 1);
-    // Skip argument 2: the force array hasn't been created yet.
-    vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgAtoms->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(4, vsite2AvgWeights->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgAtoms->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(6, vsite3AvgWeights->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneAtoms->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(8, vsiteOutOfPlaneWeights->getDeviceBuffer());
+    index = 0;
+    vsiteForceKernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
+    index++; // Skip argument 1: the force array hasn't been created yet.
+    if (context.getUseMixedPrecision())
+        vsiteForceKernel.setArg<cl::Buffer>(index++, context.getPosqCorrection().getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(index++, vsite2AvgAtoms->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(index++, vsite2AvgWeights->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(index++, vsite3AvgAtoms->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(index++, vsite3AvgWeights->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneAtoms->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneWeights->getDeviceBuffer());
    numVsites = num2Avg+num3Avg+numOutOfPlane;
 }

@@ -718,8 +720,6 @@ OpenCLIntegrationUtilities::~OpenCLIntegrationUtilities() {
        delete ccmaDelta2;
    if (ccmaConverged != NULL)
        delete ccmaConverged;
-    if (ccmaConvergedBuffer != NULL)
-        delete ccmaConvergedBuffer;
    if (vsite2AvgAtoms != NULL)
        delete vsite2AvgAtoms;
    if (vsite2AvgWeights != NULL)
@@ -807,6 +807,7 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
                ccmaDirectionsKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
            else
                ccmaDirectionsKernel.setArg<void*>(3, NULL);
+            ccmaDirectionsKernel.setArg<cl::Buffer>(4, ccmaConverged->getDeviceBuffer());
            ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
            ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
            ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
@@ -834,23 +835,19 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
        context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
        const int checkInterval = 4;
        cl::Event event;
+        int* converged = (int*) context.getPinnedBuffer();
        for (int i = 0; i < 150; i++) {
            ccmaForceKernel.setArg<cl_int>(7, i);
-            if (i == 0) {
-                ccmaConvergedMemory[0] = 1;
-                ccmaConvergedMemory[1] = 0;
-                context.getQueue().enqueueWriteBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), ccmaConvergedMemory);
-            }
            context.executeKernel(ccmaForceKernel, ccmaAtoms->getSize());
            if ((i+1)%checkInterval == 0)
-                context.getQueue().enqueueReadBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), ccmaConvergedMemory, NULL, &event);
+                context.getQueue().enqueueReadBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), converged, NULL, &event);
            ccmaMultiplyKernel.setArg<cl_int>(5, i);
            context.executeKernel(ccmaMultiplyKernel, ccmaAtoms->getSize());
            ccmaUpdateKernel.setArg<cl_int>(8, i);
            context.executeKernel(ccmaUpdateKernel, context.getNumAtoms());
            if ((i+1)%checkInterval == 0) {
                event.wait();
-                if (ccmaConvergedMemory[i%2])
+                if (converged[i%2])
                    break;
            }
        }
@@ -864,7 +861,7 @@ void OpenCLIntegrationUtilities::computeVirtualSites() {

 void OpenCLIntegrationUtilities::distributeForcesFromVirtualSites() {
    if (numVsites > 0) {
-        vsiteForceKernel.setArg<cl::Buffer>(2, context.getForce().getDeviceBuffer());
+        vsiteForceKernel.setArg<cl::Buffer>(1, context.getForce().getDeviceBuffer());
        context.executeKernel(vsiteForceKernel, numVsites);
    }
 }

--- a/platforms/opencl/src/OpenCLIntegrationUtilities.h
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.h
@@ -141,8 +141,6 @@ private:
    OpenCLArray* ccmaDelta1;
    OpenCLArray* ccmaDelta2;
    OpenCLArray* ccmaConverged;
-    cl::Buffer* ccmaConvergedBuffer;
-    cl_int* ccmaConvergedMemory;
    OpenCLArray* vsite2AvgAtoms;
    OpenCLArray* vsite2AvgWeights;
    OpenCLArray* vsite3AvgAtoms;

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2010 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -1345,8 +1345,6 @@ OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
        delete pmeBsplineModuliZ;
    if (pmeBsplineTheta != NULL)
        delete pmeBsplineTheta;
-    if (pmeBsplineDTheta != NULL)
-        delete pmeBsplineDTheta;
    if (pmeAtomRange != NULL)
        delete pmeAtomRange;
    if (pmeAtomGridIndex != NULL)
@@ -1468,6 +1466,9 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY);
        pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ);
        pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0));
+        bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
+        if (deviceIsCpu)
+            pmeDefines["DEVICE_IS_CPU"] = "1";

        // Create required data structures.

@@ -1479,12 +1480,9 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        pmeBsplineModuliY = new OpenCLArray(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
        pmeBsplineModuliZ = new OpenCLArray(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
        pmeBsplineTheta = new OpenCLArray(cl, PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta");
-        bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
-        if (deviceIsCpu)
-            pmeBsplineDTheta = new OpenCLArray(cl, PmeOrder*numParticles, 4*elementSize, "pmeBsplineDTheta");
        pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
        pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
-        sort = new OpenCLSort<SortTrait>(cl, cl.getNumAtoms());
+        sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
        fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);

        // Initialize the b-spline moduli.
@@ -1608,12 +1606,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            ewaldForcesKernel.setArg<cl::Buffer>(2, cosSinSums->getDeviceBuffer());
        }
        if (pmeGrid != NULL) {
-            string file = (deviceIsCpu ? OpenCLKernelSources::pme_cpu : OpenCLKernelSources::pme);
-            cl::Program program = cl.createProgram(file, pmeDefines);
+            cl::Program program = cl.createProgram(OpenCLKernelSources::pme, pmeDefines);
            pmeUpdateBsplinesKernel = cl::Kernel(program, "updateBsplines");
            pmeAtomRangeKernel = cl::Kernel(program, "findAtomRangeForGrid");
-            if (!deviceIsCpu)
-                pmeZIndexKernel = cl::Kernel(program, "recordZIndex");
+            pmeZIndexKernel = cl::Kernel(program, "recordZIndex");
            pmeSpreadChargeKernel = cl::Kernel(program, "gridSpreadCharge");
            pmeConvolutionKernel = cl::Kernel(program, "reciprocalConvolution");
            pmeInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce");
@@ -1622,15 +1618,11 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(1, pmeBsplineTheta->getDeviceBuffer());
            pmeUpdateBsplinesKernel.setArg(2, OpenCLContext::ThreadBlockSize*PmeOrder*elementSize, NULL);
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(3, pmeAtomGridIndex->getDeviceBuffer());
-            if (deviceIsCpu)
-                pmeUpdateBsplinesKernel.setArg<cl::Buffer>(6, pmeBsplineDTheta->getDeviceBuffer());
            pmeAtomRangeKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex->getDeviceBuffer());
            pmeAtomRangeKernel.setArg<cl::Buffer>(1, pmeAtomRange->getDeviceBuffer());
            pmeAtomRangeKernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
-            if (!deviceIsCpu) {
-                pmeZIndexKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex->getDeviceBuffer());
-                pmeZIndexKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
-            }
+            pmeZIndexKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex->getDeviceBuffer());
+            pmeZIndexKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomRange->getDeviceBuffer());
@@ -1641,16 +1633,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
            pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
            pmeConvolutionKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
-            interpolateForceThreads = (cl.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() > 2*128*PmeOrder*elementSize ? 128 : 64);
            pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer());
-            if (deviceIsCpu) {
-                pmeInterpolateForceKernel.setArg<cl::Buffer>(5, pmeBsplineTheta->getDeviceBuffer());
-                pmeInterpolateForceKernel.setArg<cl::Buffer>(6, pmeBsplineDTheta->getDeviceBuffer());
-            }
-            else
-                pmeInterpolateForceKernel.setArg(5, 2*interpolateForceThreads*PmeOrder*elementSize, NULL);
+            pmeInterpolateForceKernel.setArg<cl::Buffer>(5, pmeAtomGridIndex->getDeviceBuffer());
            if (cl.getSupports64BitGlobalAtomics()) {
                pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
                pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer());
@@ -1687,16 +1673,16 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
        }
        else {
            sort->sort(*pmeAtomGridIndex);
-            setPeriodicBoxSizeArg(cl, pmeAtomRangeKernel, 3);
-            setInvPeriodicBoxSizeArg(cl, pmeAtomRangeKernel, 4);
-            cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms());
            if (cl.getSupports64BitGlobalAtomics()) {
                setPeriodicBoxSizeArg(cl, pmeSpreadChargeKernel, 5);
                setInvPeriodicBoxSizeArg(cl, pmeSpreadChargeKernel, 6);
-                cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder);
+                cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
                cl.executeKernel(pmeFinishSpreadChargeKernel, pmeGrid->getSize());
            }
            else {
+                setPeriodicBoxSizeArg(cl, pmeAtomRangeKernel, 3);
+                setInvPeriodicBoxSizeArg(cl, pmeAtomRangeKernel, 4);
+                cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms());
                setPeriodicBoxSizeArg(cl, pmeZIndexKernel, 2);
                setInvPeriodicBoxSizeArg(cl, pmeZIndexKernel, 3);
                cl.executeKernel(pmeZIndexKernel, cl.getNumAtoms());
@@ -1715,7 +1701,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
        fft->execFFT(*pmeGrid2, *pmeGrid, false);
        setPeriodicBoxSizeArg(cl, pmeInterpolateForceKernel, 3);
        setInvPeriodicBoxSizeArg(cl, pmeInterpolateForceKernel, 4);
-        cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms(), interpolateForceThreads);
+        if (deviceIsCpu)
+            cl.executeKernel(pmeInterpolateForceKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
+        else
+            cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms());
    }
    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
    if (dispersionCoefficient != 0.0 && includeDirect) {
@@ -2078,8 +2067,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        hasCreatedKernels = true;
        maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0);
        map<string, string> defines;
-        if (nb.getForceBufferPerAtomBlock())
-            defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
        if (nb.getUseCutoff())
            defines["USE_CUTOFF"] = "1";
        if (nb.getUsePeriodic())
@@ -2090,18 +2077,24 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
        defines["NUM_BLOCKS"] = cl.intToString(cl.getNumAtomBlocks());
        defines["FORCE_WORK_GROUP_SIZE"] = cl.intToString(nb.getForceThreadBlockSize());
+        defines["TILE_SIZE"] = cl.intToString(OpenCLContext::TileSize);
+        int numExclusionTiles = nb.getExclusionTiles().getSize();
+        defines["NUM_TILES_WITH_EXCLUSIONS"] = cl.intToString(numExclusionTiles);
+        int numContexts = cl.getPlatformData().contexts.size();
+        int startExclusionIndex = cl.getContextIndex()*numExclusionTiles/numContexts;
+        int endExclusionIndex = (cl.getContextIndex()+1)*numExclusionTiles/numContexts;
+        defines["FIRST_EXCLUSION_TILE"] = cl.intToString(startExclusionIndex);
+        defines["LAST_EXCLUSION_TILE"] = cl.intToString(endExclusionIndex);
        string platformVendor = cl::Platform(cl.getDevice().getInfo<CL_DEVICE_PLATFORM>()).getInfo<CL_PLATFORM_VENDOR>();
        if (platformVendor == "Apple")
            defines["USE_APPLE_WORKAROUND"] = "1";
        string file;
        if (deviceIsCpu)
            file = OpenCLKernelSources::gbsaObc_cpu;
-        else if (cl.getSIMDWidth() == 32)
-            file = OpenCLKernelSources::gbsaObc_nvidia;
        else
-            file = OpenCLKernelSources::gbsaObc_default;
+            file = OpenCLKernelSources::gbsaObc;
        cl::Program program = cl.createProgram(file, defines);
-        bool useLong = (cl.getSupports64BitGlobalAtomics() && !deviceIsCpu);
+        bool useLong = cl.getSupports64BitGlobalAtomics();
        int index = 0;
        computeBornSumKernel = cl::Kernel(program, "computeBornSum");
        computeBornSumKernel.setArg<cl::Buffer>(index++, (useLong ? longBornSum->getDeviceBuffer() : bornSum->getDeviceBuffer()));
@@ -2112,15 +2105,12 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
            index += 2; // The periodic box size arguments are set when the kernel is executed.
            computeBornSumKernel.setArg<cl_uint>(index++, maxTiles);
-            if (cl.getSIMDWidth() == 32 || deviceIsCpu)
-                computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
+            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getBlockCenters().getDeviceBuffer());
+            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingAtoms().getDeviceBuffer());
        }
        else
            computeBornSumKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
-        if (cl.getSIMDWidth() == 32) {
-            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getExclusionIndices().getDeviceBuffer());
-            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getExclusionRowIndices().getDeviceBuffer());
-        }
+        computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getExclusionTiles().getDeviceBuffer());
        force1Kernel = cl::Kernel(program, "computeGBSAForce1");
        index = 0;
        force1Kernel.setArg<cl::Buffer>(index++, (useLong ? cl.getLongForceBuffer().getDeviceBuffer() : cl.getForceBuffers().getDeviceBuffer()));
@@ -2133,15 +2123,12 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
            force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
            index += 2; // The periodic box size arguments are set when the kernel is executed.
            force1Kernel.setArg<cl_uint>(index++, maxTiles);
-            if (cl.getSIMDWidth() == 32 || deviceIsCpu)
-                force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
+            force1Kernel.setArg<cl::Buffer>(index++, nb.getBlockCenters().getDeviceBuffer());
+            force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingAtoms().getDeviceBuffer());
        }
        else
            force1Kernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
-        if (cl.getSIMDWidth() == 32) {
-            force1Kernel.setArg<cl::Buffer>(index++, nb.getExclusionIndices().getDeviceBuffer());
-            force1Kernel.setArg<cl::Buffer>(index++, nb.getExclusionRowIndices().getDeviceBuffer());
-        }
+        force1Kernel.setArg<cl::Buffer>(index++, nb.getExclusionTiles().getDeviceBuffer());
        program = cl.createProgram(OpenCLKernelSources::gbsaObcReductions, defines);
        reduceBornSumKernel = cl::Kernel(program, "reduceBornSum");
        reduceBornSumKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
@@ -2174,12 +2161,10 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
            maxTiles = nb.getInteractingTiles().getSize();
            computeBornSumKernel.setArg<cl::Buffer>(3, nb.getInteractingTiles().getDeviceBuffer());
            computeBornSumKernel.setArg<cl_uint>(7, maxTiles);
+            computeBornSumKernel.setArg<cl::Buffer>(9, nb.getInteractingAtoms().getDeviceBuffer());
            force1Kernel.setArg<cl::Buffer>(5, nb.getInteractingTiles().getDeviceBuffer());
            force1Kernel.setArg<cl_uint>(9, maxTiles);
-            if (cl.getSIMDWidth() == 32 || deviceIsCpu) {
-                computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractionFlags().getDeviceBuffer());
-                force1Kernel.setArg<cl::Buffer>(10, nb.getInteractionFlags().getDeviceBuffer());
-            }
+            force1Kernel.setArg<cl::Buffer>(11, nb.getInteractingAtoms().getDeviceBuffer());
        }
    }
    cl.executeKernel(computeBornSumKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
@@ -2301,16 +2286,17 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
    // Record parameters and exclusions.

    int numParticles = force.getNumParticles();
-    params = new OpenCLParameterSet(cl, force.getNumPerParticleParameters(), numParticles, "customGBParameters", true);
-    computedValues = new OpenCLParameterSet(cl, force.getNumComputedValues(), numParticles, "customGBComputedValues", true, cl.getUseDoublePrecision());
+    int paddedNumParticles = cl.getPaddedNumAtoms();
+    int numParams = force.getNumPerParticleParameters();
+    params = new OpenCLParameterSet(cl, force.getNumPerParticleParameters(), paddedNumParticles, "customGBParameters", true);
+    computedValues = new OpenCLParameterSet(cl, force.getNumComputedValues(), paddedNumParticles, "customGBComputedValues", true, cl.getUseDoublePrecision());
    if (force.getNumGlobalParameters() > 0)
        globals = OpenCLArray::create<cl_float>(cl, force.getNumGlobalParameters(), "customGBGlobals", CL_MEM_READ_ONLY);
-    vector<vector<cl_float> > paramVector(numParticles);
+    vector<vector<cl_float> > paramVector(paddedNumParticles, vector<cl_float>(numParams, 0));
    vector<vector<int> > exclusionList(numParticles);
    for (int i = 0; i < numParticles; i++) {
        vector<double> parameters;
        force.getParticleParameters(i, parameters);
-        paramVector[i].resize(parameters.size());
        for (int j = 0; j < (int) parameters.size(); j++)
            paramVector[i][j] = (cl_float) parameters[j];
        exclusionList[i].push_back(i);
@@ -2402,7 +2388,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
        }
    }
    bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
-    bool useLong = (cl.getSupports64BitGlobalAtomics() && !deviceIsCpu);
+    bool useLong = cl.getSupports64BitGlobalAtomics();
    if (useLong) {
        longEnergyDerivs = OpenCLArray::create<cl_long>(cl, force.getNumComputedValues()*cl.getPaddedNumAtoms(), "customGBLongEnergyDerivatives");
        energyDerivs = new OpenCLParameterSet(cl, force.getNumComputedValues(), cl.getPaddedNumAtoms(), "customGBEnergyDerivatives", true);
@@ -2465,30 +2451,24 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
        replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
        replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
        replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
-        map<string, string> defines;
-        if (cl.getNonbondedUtilities().getForceBufferPerAtomBlock())
-            defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
        if (useCutoff)
-            defines["USE_CUTOFF"] = "1";
+            pairValueDefines["USE_CUTOFF"] = "1";
        if (usePeriodic)
-            defines["USE_PERIODIC"] = "1";
+            pairValueDefines["USE_PERIODIC"] = "1";
        if (useExclusionsForValue)
-            defines["USE_EXCLUSIONS"] = "1";
-        if (cl.getSIMDWidth() == 32)
-            defines["WARPS_PER_GROUP"] = cl.intToString(cl.getNonbondedUtilities().getForceThreadBlockSize()/OpenCLContext::TileSize);
-        defines["CUTOFF_SQUARED"] = cl.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
-        defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
-        defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
-        defines["NUM_BLOCKS"] = cl.intToString(cl.getNumAtomBlocks());
+            pairValueDefines["USE_EXCLUSIONS"] = "1";
+        pairValueDefines["FORCE_WORK_GROUP_SIZE"] = cl.intToString(cl.getNonbondedUtilities().getForceThreadBlockSize());
+        pairValueDefines["CUTOFF_SQUARED"] = cl.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
+        pairValueDefines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
+        pairValueDefines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
+        pairValueDefines["NUM_BLOCKS"] = cl.intToString(cl.getNumAtomBlocks());
+        pairValueDefines["TILE_SIZE"] = cl.intToString(OpenCLContext::TileSize);
        string file;
        if (deviceIsCpu)
            file = OpenCLKernelSources::customGBValueN2_cpu;
-        else if (cl.getSIMDWidth() == 32)
-            file = OpenCLKernelSources::customGBValueN2_nvidia;
        else
-            file = OpenCLKernelSources::customGBValueN2_default;
-        cl::Program program = cl.createProgram(cl.replaceStrings(file, replacements), defines);
-        pairValueKernel = cl::Kernel(program, "computeN2Value");
+            file = OpenCLKernelSources::customGBValueN2;
+        pairValueSrc = cl.replaceStrings(file, replacements);
        if (useExclusionsForValue)
            cl.getNonbondedUtilities().requestExclusions(exclusionList);
    }
@@ -2664,30 +2644,24 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
        replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
        replacements["DECLARE_TEMP_BUFFERS"] = declareTemps.str();
        replacements["SET_TEMP_BUFFERS"] = setTemps.str();
-        map<string, string> defines;
-        if (cl.getNonbondedUtilities().getForceBufferPerAtomBlock())
-            defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
        if (useCutoff)
-            defines["USE_CUTOFF"] = "1";
+            pairEnergyDefines["USE_CUTOFF"] = "1";
        if (usePeriodic)
-            defines["USE_PERIODIC"] = "1";
+            pairEnergyDefines["USE_PERIODIC"] = "1";
        if (anyExclusions)
-            defines["USE_EXCLUSIONS"] = "1";
-        if (cl.getSIMDWidth() == 32)
-            defines["WARPS_PER_GROUP"] = cl.intToString(cl.getNonbondedUtilities().getForceThreadBlockSize()/OpenCLContext::TileSize);
-        defines["CUTOFF_SQUARED"] = cl.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
-        defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
-        defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
-        defines["NUM_BLOCKS"] = cl.intToString(cl.getNumAtomBlocks());
+            pairEnergyDefines["USE_EXCLUSIONS"] = "1";
+        pairEnergyDefines["FORCE_WORK_GROUP_SIZE"] = cl.intToString(cl.getNonbondedUtilities().getForceThreadBlockSize());
+        pairEnergyDefines["CUTOFF_SQUARED"] = cl.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
+        pairEnergyDefines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
+        pairEnergyDefines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
+        pairEnergyDefines["NUM_BLOCKS"] = cl.intToString(cl.getNumAtomBlocks());
+        pairEnergyDefines["TILE_SIZE"] = cl.intToString(OpenCLContext::TileSize);
        string file;
        if (deviceIsCpu)
            file = OpenCLKernelSources::customGBEnergyN2_cpu;
-        else if (cl.getSIMDWidth() == 32)
-            file = OpenCLKernelSources::customGBEnergyN2_nvidia;
        else
-            file = OpenCLKernelSources::customGBEnergyN2_default;
-        cl::Program program = cl.createProgram(cl.replaceStrings(file, replacements), defines);
-        pairEnergyKernel = cl::Kernel(program, "computeN2Energy");
+            file = OpenCLKernelSources::customGBEnergyN2;
+        pairEnergySrc = cl.replaceStrings(file, replacements);
    }
    {
        // Create the kernel to reduce the derivatives and calculate per-particle energy terms.
@@ -2943,8 +2917,41 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
    int elementSize = (cl.getUseDoublePrecision() ? sizeof(cl_double) : sizeof(cl_float));
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;
+        
+        // These two kernels can't be compiled in initialize(), because the nonbonded utilities object
+        // has not yet been initialized then.
+
+        {
+            int numExclusionTiles = nb.getExclusionTiles().getSize();
+            pairValueDefines["NUM_TILES_WITH_EXCLUSIONS"] = cl.intToString(numExclusionTiles);
+            int numContexts = cl.getPlatformData().contexts.size();
+            int startExclusionIndex = cl.getContextIndex()*numExclusionTiles/numContexts;
+            int endExclusionIndex = (cl.getContextIndex()+1)*numExclusionTiles/numContexts;
+            pairValueDefines["FIRST_EXCLUSION_TILE"] = cl.intToString(startExclusionIndex);
+            pairValueDefines["LAST_EXCLUSION_TILE"] = cl.intToString(endExclusionIndex);
+            cl::Program program = cl.createProgram(pairValueSrc, pairValueDefines);
+            pairValueKernel = cl::Kernel(program, "computeN2Value");
+            pairValueSrc = "";
+            pairValueDefines.clear();
+        }
+        {
+            int numExclusionTiles = nb.getExclusionTiles().getSize();
+            pairEnergyDefines["NUM_TILES_WITH_EXCLUSIONS"] = cl.intToString(numExclusionTiles);
+            int numContexts = cl.getPlatformData().contexts.size();
+            int startExclusionIndex = cl.getContextIndex()*numExclusionTiles/numContexts;
+            int endExclusionIndex = (cl.getContextIndex()+1)*numExclusionTiles/numContexts;
+            pairEnergyDefines["FIRST_EXCLUSION_TILE"] = cl.intToString(startExclusionIndex);
+            pairEnergyDefines["LAST_EXCLUSION_TILE"] = cl.intToString(endExclusionIndex);
+            cl::Program program = cl.createProgram(pairEnergySrc, pairEnergyDefines);
+            pairEnergyKernel = cl::Kernel(program, "computeN2Energy");
+            pairEnergySrc = "";
+            pairEnergyDefines.clear();
+        }
+
+        // Set arguments for kernels.
+        
        maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0);
-        bool useLong = (cl.getSupports64BitGlobalAtomics() && !deviceIsCpu);
+        bool useLong = cl.getSupports64BitGlobalAtomics();
        if (useLong) {
            longValueBuffers = OpenCLArray::create<cl_long>(cl, cl.getPaddedNumAtoms(), "customGBLongValueBuffers");
            cl.addAutoclearBuffer(*longValueBuffers);
@@ -2959,20 +2966,16 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
        pairValueKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
        pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*4*elementSize, NULL);
        pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
-        pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
-        pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
+        pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionTiles().getDeviceBuffer());
        pairValueKernel.setArg<cl::Buffer>(index++, useLong ? longValueBuffers->getDeviceBuffer() : valueBuffers->getDeviceBuffer());
        pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*elementSize, NULL);
-        /// \todo Eliminate this argument and make local to the kernel. For *_default.cl kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cl.
-        /// \todo Also make the previous __local argument local as was done with nonbonded_default.cl.
-        pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*elementSize, NULL);
        if (nb.getUseCutoff()) {
            pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
            index += 2; // Periodic box size arguments are set when the kernel is executed.
            pairValueKernel.setArg<cl_uint>(index++, maxTiles);
-            if (cl.getSIMDWidth() == 32 || deviceIsCpu)
-                pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
+            pairValueKernel.setArg<cl::Buffer>(index++, nb.getBlockCenters().getDeviceBuffer());
+            pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingAtoms().getDeviceBuffer());
        }
        else
            pairValueKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
@@ -3013,18 +3016,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
        pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*4*elementSize, NULL);
        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
-        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
-        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
-        /// \todo Eliminate this argument and make local to the kernel. For *_default.cl kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cl.
-        /// \todo Also make the previous __local argument local as was done with nonbonded_default.cl.
-        pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : (cl.getSIMDWidth() == 32 ? 1 : nb.getForceThreadBlockSize()))*4*elementSize, NULL);
+        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionTiles().getDeviceBuffer());
        if (nb.getUseCutoff()) {
            pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
            index += 2; // Periodic box size arguments are set when the kernel is executed.
            pairEnergyKernel.setArg<cl_uint>(index++, maxTiles);
-            if (cl.getSIMDWidth() == 32 || deviceIsCpu)
-                pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
+            pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getBlockCenters().getDeviceBuffer());
+            pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingAtoms().getDeviceBuffer());
        }
        else
            pairEnergyKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
@@ -3108,20 +3107,18 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
            globals->upload(globalParamValues);
    }
    if (nb.getUseCutoff()) {
-        setPeriodicBoxSizeArg(cl, pairValueKernel, 10);
-        setInvPeriodicBoxSizeArg(cl, pairValueKernel, 11);
-        setPeriodicBoxSizeArg(cl, pairEnergyKernel, 11);
-        setInvPeriodicBoxSizeArg(cl, pairEnergyKernel, 12);
+        setPeriodicBoxSizeArg(cl, pairValueKernel, 8);
+        setInvPeriodicBoxSizeArg(cl, pairValueKernel, 9);
+        setPeriodicBoxSizeArg(cl, pairEnergyKernel, 9);
+        setInvPeriodicBoxSizeArg(cl, pairEnergyKernel, 10);
        if (maxTiles < nb.getInteractingTiles().getSize()) {
            maxTiles = nb.getInteractingTiles().getSize();
-            pairValueKernel.setArg<cl::Buffer>(8, nb.getInteractingTiles().getDeviceBuffer());
-            pairValueKernel.setArg<cl_uint>(12, maxTiles);
-            pairEnergyKernel.setArg<cl::Buffer>(9, nb.getInteractingTiles().getDeviceBuffer());
-            pairEnergyKernel.setArg<cl_uint>(13, maxTiles);
-            if (cl.getSIMDWidth() == 32 || deviceIsCpu) {
-                pairValueKernel.setArg<cl::Buffer>(13, nb.getInteractionFlags().getDeviceBuffer());
-                pairEnergyKernel.setArg<cl::Buffer>(14, nb.getInteractionFlags().getDeviceBuffer());
-            }
+            pairValueKernel.setArg<cl::Buffer>(6, nb.getInteractingTiles().getDeviceBuffer());
+            pairValueKernel.setArg<cl_uint>(11, maxTiles);
+            pairValueKernel.setArg<cl::Buffer>(12, nb.getInteractingAtoms().getDeviceBuffer());
+            pairEnergyKernel.setArg<cl::Buffer>(7, nb.getInteractingTiles().getDeviceBuffer());
+            pairEnergyKernel.setArg<cl_uint>(12, maxTiles);
+            pairEnergyKernel.setArg<cl::Buffer>(13, nb.getInteractingAtoms().getDeviceBuffer());
        }
    }
    cl.executeKernel(pairValueKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
@@ -3140,11 +3137,10 @@ void OpenCLCalcCustomGBForceKernel::copyParametersToContext(ContextImpl& context
    
    // Record the per-particle parameters.
    
-    vector<vector<cl_float> > paramVector(numParticles);
+    vector<vector<cl_float> > paramVector(cl.getPaddedNumAtoms(), vector<cl_float>(force.getNumPerParticleParameters(), 0));
    vector<double> parameters;
    for (int i = 0; i < numParticles; i++) {
        force.getParticleParameters(i, parameters);
-        paramVector[i].resize(parameters.size());
        for (int j = 0; j < (int) parameters.size(); j++)
            paramVector[i][j] = (cl_float) parameters[j];
    }
@@ -4573,7 +4569,7 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
        selectSizeKernel.setArg<cl::Buffer>(5, cl.getVelm().getDeviceBuffer());
        selectSizeKernel.setArg<cl::Buffer>(6, cl.getForce().getDeviceBuffer());
        selectSizeKernel.setArg<cl::Buffer>(7, params->getDeviceBuffer());
-	int elementSize = (useDouble ? sizeof(cl_double) : sizeof(cl_float));
+        int elementSize = (useDouble ? sizeof(cl_double) : sizeof(cl_float));
        selectSizeKernel.setArg(8, params->getSize()*elementSize, NULL);
        selectSizeKernel.setArg(9, blockSize*elementSize, NULL);
    }

--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
@@ -556,7 +556,7 @@ class OpenCLCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, System& system) : CalcNonbondedForceKernel(name, platform),
            hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
-            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
+            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL),
            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL) {
    }
    ~OpenCLCalcNonbondedForceKernel();
@@ -586,15 +586,15 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
 private:
-    struct SortTrait {
-        typedef mm_int2 DataType;
-        typedef cl_int KeyType;
-        static const char* clDataType() {return "int2";}
-        static const char* clKeyType() {return "int";}
-        static const char* clMinKey() {return "INT_MIN";}
-        static const char* clMaxKey() {return "INT_MAX";}
-        static const char* clMaxValue() {return "(int2) (INT_MAX, INT_MAX)";}
-        static const char* clSortKey() {return "value.y";}
+    class SortTrait : public OpenCLSort::SortTrait {
+        int getDataSize() const {return 8;}
+        int getKeySize() const {return 4;}
+        const char* getDataType() const {return "int2";}
+        const char* getKeyType() const {return "int";}
+        const char* getMinKey() const {return "INT_MIN";}
+        const char* getMaxKey() const {return "INT_MAX";}
+        const char* getMaxValue() const {return "(int2) (INT_MAX, INT_MAX)";}
+        const char* getSortKey() const {return "value.y";}
    };
    OpenCLContext& cl;
    bool hasInitializedKernel;
@@ -607,10 +607,9 @@ private:
    OpenCLArray* pmeBsplineModuliY;
    OpenCLArray* pmeBsplineModuliZ;
    OpenCLArray* pmeBsplineTheta;
-    OpenCLArray* pmeBsplineDTheta;
    OpenCLArray* pmeAtomRange;
    OpenCLArray* pmeAtomGridIndex;
-    OpenCLSort<SortTrait>* sort;
+    OpenCLSort* sort;
    OpenCLFFT3D* fft;
    cl::Kernel ewaldSumsKernel;
    cl::Kernel ewaldForcesKernel;
@@ -625,7 +624,6 @@ private:
    std::map<std::string, std::string> pmeDefines;
    std::vector<std::pair<int, int> > exceptionAtoms;
    double ewaldSelfEnergy, dispersionCoefficient, alpha;
-    int interpolateForceThreads;
    bool hasCoulomb, hasLJ;
    static const int PmeOrder = 5;
 };
@@ -775,6 +773,8 @@ private:
    std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue;
    System& system;
    cl::Kernel pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
+    std::string pairValueSrc, pairEnergySrc;
+    std::map<std::string, std::string> pairValueDefines, pairEnergyDefines;
 };

 /**

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -29,6 +29,8 @@
 #include "OpenCLArray.h"
 #include "OpenCLKernelSources.h"
 #include "OpenCLExpressionUtilities.h"
+#include "OpenCLSort.h"
+#include <algorithm>
 #include <map>
 #include <set>
 #include <utility>
@@ -36,13 +38,29 @@
 using namespace OpenMM;
 using namespace std;

-OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false),
-        numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
-        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), nonbondedForceGroup(0) {
+class OpenCLNonbondedUtilities::BlockSortTrait : public OpenCLSort::SortTrait {
+public:
+    BlockSortTrait(bool useDouble) : useDouble(useDouble) {
+    }
+    int getDataSize() const {return useDouble ? sizeof(mm_double2) : sizeof(mm_float2);}
+    int getKeySize() const {return useDouble ? sizeof(cl_double) : sizeof(cl_float);}
+    const char* getDataType() const {return "real2";}
+    const char* getKeyType() const {return "real";}
+    const char* getMinKey() const {return "-MAXFLOAT";}
+    const char* getMaxKey() const {return "MAXFLOAT";}
+    const char* getMaxValue() const {return "(real2) (MAXFLOAT, MAXFLOAT)";}
+    const char* getSortKey() const {return "value.x";}
+private:
+    bool useDouble;
+};
+
+OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false), usePadding(true),
+        numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
+        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
+        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0) {
    // Decide how many thread blocks and force buffers to use.

    deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
-    forceBufferPerAtomBlock = false;
    if (deviceIsCpu) {
        numForceThreadBlocks = context.getNumThreadBlocks();
        forceThreadBlockSize = 1;
@@ -50,15 +68,15 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
    }
    else if (context.getSIMDWidth() == 32) {
        if (context.getSupports64BitGlobalAtomics()) {
-            numForceThreadBlocks = 2*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+            numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
            forceThreadBlockSize = 256;
            // Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
            numForceBuffers = 1;
        }
        else {
-            numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
-            forceThreadBlockSize = 128;
-            numForceBuffers = numForceThreadBlocks;
+            numForceThreadBlocks = 3*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+            forceThreadBlockSize = 256;
+            numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize;
        }
    }
    else {
@@ -69,13 +87,7 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
            numForceBuffers = 1;
        }
        else {
-            numForceBuffers = numForceThreadBlocks;
-            if (numForceBuffers >= context.getNumAtomBlocks()) {
-                // For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
-
-                forceBufferPerAtomBlock = true;
-                numForceBuffers = context.getNumAtomBlocks();
-            }
+            numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize;
        }
    }
 }
@@ -85,18 +97,32 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
        delete exclusionIndices;
    if (exclusionRowIndices != NULL)
        delete exclusionRowIndices;
+    if (exclusionTiles != NULL)
+        delete exclusionTiles;
    if (exclusions != NULL)
        delete exclusions;
    if (interactingTiles != NULL)
        delete interactingTiles;
-    if (interactionFlags != NULL)
-        delete interactionFlags;
+    if (interactingAtoms != NULL)
+        delete interactingAtoms;
    if (interactionCount != NULL)
        delete interactionCount;
    if (blockCenter != NULL)
        delete blockCenter;
    if (blockBoundingBox != NULL)
        delete blockBoundingBox;
+    if (sortedBlocks != NULL)
+        delete sortedBlocks;
+    if (sortedBlockCenter != NULL)
+        delete sortedBlockCenter;
+    if (sortedBlockBoundingBox != NULL)
+        delete sortedBlockBoundingBox;
+    if (oldPositions != NULL)
+        delete oldPositions;
+    if (rebuildNeighborList != NULL)
+        delete rebuildNeighborList;
+    if (blockSorter != NULL)
+        delete blockSorter;
 }

 void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
@@ -149,6 +175,10 @@ void OpenCLNonbondedUtilities::requestExclusions(const vector<vector<int> >& exc
    }
 }

+static bool compareUshort2(mm_ushort2 a, mm_ushort2 b) {
+    return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
+}
+
 void OpenCLNonbondedUtilities::initialize(const System& system) {
    if (atomExclusions.size() == 0) {
        // No exclusions were specifically requested, so just mark every atom as not interacting with itself.
@@ -161,14 +191,11 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
    // Create the list of tiles.

    int numAtomBlocks = context.getNumAtomBlocks();
-    int totalTiles = numAtomBlocks*(numAtomBlocks+1)/2;
    int numContexts = context.getPlatformData().contexts.size();
-    startTileIndex = context.getContextIndex()*totalTiles/numContexts;
-    int endTileIndex = (context.getContextIndex()+1)*totalTiles/numContexts;
-    numTiles = endTileIndex-startTileIndex;
-
-    // Build a list of indices for the tiles with exclusions.
+    setAtomBlockRange(context.getContextIndex()/(double) numContexts, (context.getContextIndex()+1)/(double) numContexts);

+    // Build a list of tiles that contain exclusions.
+    
    set<pair<int, int> > tilesWithExclusions;
    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
        int x = atom1/OpenCLContext::TileSize;
@@ -178,19 +205,29 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
            tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
        }
    }
-    if (context.getPaddedNumAtoms() > context.getNumAtoms()) {
-        for (int i = 0; i < numAtomBlocks; ++i)
-            tilesWithExclusions.insert(make_pair(numAtomBlocks-1, i));
+    vector<mm_ushort2> exclusionTilesVec;
+    for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
+        exclusionTilesVec.push_back(mm_ushort2((unsigned short) iter->first, (unsigned short) iter->second));
+    sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareUshort2);
+    exclusionTiles = OpenCLArray::create<mm_ushort2>(context, exclusionTilesVec.size(), "exclusionTiles");
+    exclusionTiles->upload(exclusionTilesVec);
+    map<pair<int, int>, int> exclusionTileMap;
+    for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
+        mm_ushort2 tile = exclusionTilesVec[i];
+        exclusionTileMap[make_pair(tile.x, tile.y)] = i;
+    }
+    vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
+    for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
+        exclusionBlocksForBlock[iter->first].push_back(iter->second);
+        if (iter->first != iter->second)
+            exclusionBlocksForBlock[iter->second].push_back(iter->first);
    }
    vector<cl_uint> exclusionRowIndicesVec(numAtomBlocks+1, 0);
    vector<cl_uint> exclusionIndicesVec;
-    int currentRow = 0;
-    for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
-        while (iter->first != currentRow)
-            exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
-        exclusionIndicesVec.push_back(iter->second);
+    for (int i = 0; i < numAtomBlocks; i++) {
+        exclusionIndicesVec.insert(exclusionIndicesVec.end(), exclusionBlocksForBlock[i].begin(), exclusionBlocksForBlock[i].end());
+        exclusionRowIndicesVec[i+1] = exclusionIndicesVec.size();
    }
-    exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
    exclusionIndices = OpenCLArray::create<cl_uint>(context, exclusionIndicesVec.size(), "exclusionIndices");
    exclusionRowIndices = OpenCLArray::create<cl_uint>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
    exclusionIndices->upload(exclusionIndicesVec);
@@ -199,7 +236,8 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
    // Record the exclusion data.

    exclusions = OpenCLArray::create<cl_uint>(context, tilesWithExclusions.size()*OpenCLContext::TileSize, "exclusions");
-    vector<cl_uint> exclusionVec(exclusions->getSize());
+    cl_uint allFlags = (cl_uint) -1;
+    vector<cl_uint> exclusionVec(exclusions->getSize(), allFlags);
    for (int i = 0; i < exclusions->getSize(); ++i)
        exclusionVec[i] = 0xFFFFFFFF;
    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
@@ -210,31 +248,12 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
            int y = atom2/OpenCLContext::TileSize;
            int offset2 = atom2-y*OpenCLContext::TileSize;
            if (x > y) {
-                int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
-                exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
+                int index = exclusionTileMap[make_pair(x, y)]*OpenCLContext::TileSize;
+                exclusionVec[index+offset1] &= allFlags-(1<<offset2);
            }
            else {
-                int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
-                exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
-            }
-        }
-    }
-
-    // Mark all interactions that involve a padding atom as being excluded.
-
-    for (int atom1 = context.getNumAtoms(); atom1 < context.getPaddedNumAtoms(); ++atom1) {
-        int x = atom1/OpenCLContext::TileSize;
-        int offset1 = atom1-x*OpenCLContext::TileSize;
-        for (int atom2 = 0; atom2 < context.getPaddedNumAtoms(); ++atom2) {
-            int y = atom2/OpenCLContext::TileSize;
-            int offset2 = atom2-y*OpenCLContext::TileSize;
-            if (x >= y) {
-                int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
-                exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
-            }
-            if (y >= x) {
-                int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
-                exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
+                int index = exclusionTileMap[make_pair(y, x)]*OpenCLContext::TileSize;
+                exclusionVec[index+offset2] &= allFlags-(1<<offset1);
            }
        }
    }
@@ -244,21 +263,35 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
    // Create data structures for the neighbor list.

    if (useCutoff) {
-        // Select a size for the arrays that hold the neighbor list.  This estimate is intentionally very
-        // high, because if it ever is too small, we have to fall back to the N^2 algorithm.
-
-        mm_float4 boxSize = context.getPeriodicBoxSize();
-        int maxInteractingTiles = (int) (numTiles*(cutoff/boxSize.x+cutoff/boxSize.y+cutoff/boxSize.z));
-        if (maxInteractingTiles > numTiles)
-            maxInteractingTiles = numTiles;
-        if (maxInteractingTiles < 1)
-            maxInteractingTiles = 1;
-        interactingTiles = OpenCLArray::create<mm_ushort2>(context, maxInteractingTiles, "interactingTiles");
-        interactionFlags = OpenCLArray::create<cl_uint>(context, context.getSIMDWidth() == 32 ? maxInteractingTiles : (deviceIsCpu ? 2*maxInteractingTiles : 1), "interactionFlags");
+        // Select a size for the arrays that hold the neighbor list.  We have to make a fairly
+        // arbitrary guess, but if this turns out to be too small we'll increase it later.
+
+        int maxTiles = 20*numAtomBlocks;
+        if (maxTiles > numTiles)
+            maxTiles = numTiles;
+        if (maxTiles < 1)
+            maxTiles = 1;
+        int numAtoms = context.getNumAtoms();
+        interactingTiles = OpenCLArray::create<mm_ushort2>(context, maxTiles, "interactingTiles");
+        interactingAtoms = OpenCLArray::create<cl_int>(context, OpenCLContext::TileSize*maxTiles, "interactingAtoms");
        interactionCount = OpenCLArray::create<cl_uint>(context, 1, "interactionCount");
-        int elementSize = (context.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
-        blockCenter = new OpenCLArray(context, numAtomBlocks, elementSize, "blockCenter");
-        blockBoundingBox = new OpenCLArray(context, numAtomBlocks, elementSize, "blockBoundingBox");
+        int elementSize = (context.getUseDoublePrecision() ? sizeof(cl_double) : sizeof(cl_float));
+        blockCenter = new OpenCLArray(context, numAtomBlocks, 4*elementSize, "blockCenter");
+        blockBoundingBox = new OpenCLArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
+        sortedBlocks = new OpenCLArray(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
+        sortedBlockCenter = new OpenCLArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
+        sortedBlockBoundingBox = new OpenCLArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
+        oldPositions = new OpenCLArray(context, numAtoms, 4*elementSize, "oldPositions");
+        if (context.getUseDoublePrecision()) {
+            vector<mm_double4> oldPositionsVec(numAtoms, mm_double4(1e30, 1e30, 1e30, 0));
+            oldPositions->upload(oldPositionsVec);
+        }
+        else {
+            vector<mm_float4> oldPositionsVec(numAtoms, mm_float4(1e30f, 1e30f, 1e30f, 0));
+            oldPositions->upload(oldPositionsVec);
+        }
+        rebuildNeighborList = OpenCLArray::create<int>(context, 1, "rebuildNeighborList");
+        blockSorter = new OpenCLSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
        vector<cl_uint> count(1, 0);
        interactionCount->upload(count);
    }
@@ -268,12 +301,24 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
    if (kernelSource.size() > 0)
        forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
    if (useCutoff) {
+        double padding = (usePadding ? 0.1*cutoff : 0.0);
+        double paddedCutoff = cutoff+padding;
        map<string, string> defines;
+        defines["TILE_SIZE"] = context.intToString(OpenCLContext::TileSize);
+        defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
+        defines["PADDING"] = context.doubleToString(padding);
+        defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
+        defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
+        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize());
        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
-        if (forceBufferPerAtomBlock)
-            defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
        if (usePeriodic)
            defines["USE_PERIODIC"] = "1";
+        int maxExclusions = 0;
+        for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
+            maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
+        defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
+        defines["GROUP_SIZE"] = (deviceIsCpu ? "32" : "256");
+        defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2");
        string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks);
        cl::Program interactingBlocksProgram = context.createProgram(file, defines);
        findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds");
@@ -281,48 +326,38 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
        findBlockBoundsKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
        findBlockBoundsKernel.setArg<cl::Buffer>(4, blockCenter->getDeviceBuffer());
        findBlockBoundsKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer());
-        findBlockBoundsKernel.setArg<cl::Buffer>(6, interactionCount->getDeviceBuffer());
+        findBlockBoundsKernel.setArg<cl::Buffer>(6, rebuildNeighborList->getDeviceBuffer());
+        findBlockBoundsKernel.setArg<cl::Buffer>(7, sortedBlocks->getDeviceBuffer());
+        sortBoxDataKernel = cl::Kernel(interactingBlocksProgram, "sortBoxData");
+        sortBoxDataKernel.setArg<cl::Buffer>(0, sortedBlocks->getDeviceBuffer());
+        sortBoxDataKernel.setArg<cl::Buffer>(1, blockCenter->getDeviceBuffer());
+        sortBoxDataKernel.setArg<cl::Buffer>(2, blockBoundingBox->getDeviceBuffer());
+        sortBoxDataKernel.setArg<cl::Buffer>(3, sortedBlockCenter->getDeviceBuffer());
+        sortBoxDataKernel.setArg<cl::Buffer>(4, sortedBlockBoundingBox->getDeviceBuffer());
+        sortBoxDataKernel.setArg<cl::Buffer>(5, context.getPosq().getDeviceBuffer());
+        sortBoxDataKernel.setArg<cl::Buffer>(6, oldPositions->getDeviceBuffer());
+        sortBoxDataKernel.setArg<cl::Buffer>(7, interactionCount->getDeviceBuffer());
+        sortBoxDataKernel.setArg<cl::Buffer>(8, rebuildNeighborList->getDeviceBuffer());
        findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
-        if (context.getUseDoublePrecision())
-            findInteractingBlocksKernel.setArg<cl_double>(0, cutoff*cutoff);
-        else
-            findInteractingBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
-        findInteractingBlocksKernel.setArg<cl::Buffer>(3, blockCenter->getDeviceBuffer());
-        findInteractingBlocksKernel.setArg<cl::Buffer>(4, blockBoundingBox->getDeviceBuffer());
-        findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactionCount->getDeviceBuffer());
-        findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
-        findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
-        findInteractingBlocksKernel.setArg<cl::Buffer>(8, context.getPosq().getDeviceBuffer());
-        findInteractingBlocksKernel.setArg<cl_uint>(9, interactingTiles->getSize());
-        findInteractingBlocksKernel.setArg<cl_uint>(10, startTileIndex);
-        findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles);
-        if (context.getSIMDWidth() == 32 && !deviceIsCpu) {
-            findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
-            if (context.getUseDoublePrecision())
-                findInteractionsWithinBlocksKernel.setArg<cl_double>(0, cutoff*cutoff);
-            else
-                findInteractionsWithinBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
-            findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
-            findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
-            findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(5, blockCenter->getDeviceBuffer());
-            findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(6, blockBoundingBox->getDeviceBuffer());
-            findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
-            findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(8, interactionCount->getDeviceBuffer());
-            findInteractionsWithinBlocksKernel.setArg(9, 128*sizeof(cl_uint), NULL);
-            findInteractionsWithinBlocksKernel.setArg<cl_uint>(10, interactingTiles->getSize());
-        }
+        findInteractingBlocksKernel.setArg<cl::Buffer>(2, blockCenter->getDeviceBuffer());
+        findInteractingBlocksKernel.setArg<cl::Buffer>(3, blockBoundingBox->getDeviceBuffer());
+        findInteractingBlocksKernel.setArg<cl::Buffer>(4, interactionCount->getDeviceBuffer());
+        findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactingTiles->getDeviceBuffer());
+        findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingAtoms->getDeviceBuffer());
+        findInteractingBlocksKernel.setArg<cl::Buffer>(7, context.getPosq().getDeviceBuffer());
+        findInteractingBlocksKernel.setArg<cl_uint>(8, interactingTiles->getSize());
+        findInteractingBlocksKernel.setArg<cl_uint>(9, startBlockIndex);
+        findInteractingBlocksKernel.setArg<cl_uint>(10, numBlocks);
+        findInteractingBlocksKernel.setArg<cl::Buffer>(11, sortedBlocks->getDeviceBuffer());
+        findInteractingBlocksKernel.setArg<cl::Buffer>(12, sortedBlockCenter->getDeviceBuffer());
+        findInteractingBlocksKernel.setArg<cl::Buffer>(13, sortedBlockBoundingBox->getDeviceBuffer());
+        findInteractingBlocksKernel.setArg<cl::Buffer>(14, exclusionIndices->getDeviceBuffer());
+        findInteractingBlocksKernel.setArg<cl::Buffer>(15, exclusionRowIndices->getDeviceBuffer());
+        findInteractingBlocksKernel.setArg<cl::Buffer>(16, oldPositions->getDeviceBuffer());
+        findInteractingBlocksKernel.setArg<cl::Buffer>(17, rebuildNeighborList->getDeviceBuffer());
    }
 }

-int OpenCLNonbondedUtilities::findExclusionIndex(int x, int y, const vector<cl_uint>& exclusionIndices, const vector<cl_uint>& exclusionRowIndices) {
-    int start = exclusionRowIndices[x];
-    int end = exclusionRowIndices[x+1];
-    for (int i = start; i < end; i++)
-        if (exclusionIndices[i] == y)
-            return i*OpenCLContext::TileSize;
-    throw OpenMMException("Internal error: exclusion in unexpected tile");
-}
-
 static void setPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
    if (cl.getUseDoublePrecision())
        kernel.setArg<mm_double4>(index, cl.getPeriodicBoxSizeDouble());
@@ -352,23 +387,22 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
    setPeriodicBoxSizeArg(context, findBlockBoundsKernel, 1);
    setInvPeriodicBoxSizeArg(context, findBlockBoundsKernel, 2);
    context.executeKernel(findBlockBoundsKernel, context.getNumAtoms());
-    setPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 1);
-    setInvPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 2);
-    context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), deviceIsCpu ? 1 : -1);
-    if (context.getSIMDWidth() == 32 && !deviceIsCpu) {
-        setPeriodicBoxSizeArg(context, findInteractionsWithinBlocksKernel, 1);
-        setInvPeriodicBoxSizeArg(context, findInteractionsWithinBlocksKernel, 2);
-        context.executeKernel(findInteractionsWithinBlocksKernel, context.getNumAtoms(), 128);
-    }
+    blockSorter->sort(*sortedBlocks);
+    context.executeKernel(sortBoxDataKernel, context.getNumAtoms());
+    setPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 0);
+    setInvPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 1);
+    context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), deviceIsCpu ? 1 : 256);
 }

 void OpenCLNonbondedUtilities::computeInteractions() {
    if (kernelSource.size() > 0) {
        if (useCutoff) {
-            setPeriodicBoxSizeArg(context, forceKernel, 10);
-            setInvPeriodicBoxSizeArg(context, forceKernel, 11);
+            setPeriodicBoxSizeArg(context, forceKernel, 9);
+            setInvPeriodicBoxSizeArg(context, forceKernel, 10);
        }
        context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+        if (context.getComputeForceCount() == 1)
+            updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
    }
 }

@@ -383,42 +417,52 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
    // this from happening in the future.

-    int newSize = (int) (1.2*pinnedInteractionCount[0]);
-    int numTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
-    if (newSize > numTiles)
-        newSize = numTiles;
+    int maxTiles = (int) (1.2*pinnedInteractionCount[0]);
+    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
+    if (maxTiles > totalTiles)
+        maxTiles = totalTiles;
    delete interactingTiles;
-    interactingTiles = OpenCLArray::create<mm_ushort2>(context, newSize, "interactingTiles");
-    forceKernel.setArg<cl::Buffer>(8, interactingTiles->getDeviceBuffer());
-    forceKernel.setArg<cl_uint>(12, newSize);
-    findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
-    findInteractingBlocksKernel.setArg<cl_uint>(9, newSize);
-    if (context.getSIMDWidth() == 32 || deviceIsCpu) {
-        delete interactionFlags;
-        interactionFlags = OpenCLArray::create<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags");
-        forceKernel.setArg<cl::Buffer>(13, interactionFlags->getDeviceBuffer());
-        findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
-		if (!deviceIsCpu) {
-            findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
-            findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
-            findInteractionsWithinBlocksKernel.setArg<cl_uint>(10, newSize);
-		}
+    delete interactingAtoms;
+    interactingTiles = NULL; // Avoid an error in the destructor if the following allocation fails
+    interactingAtoms = NULL;
+    interactingTiles = OpenCLArray::create<mm_ushort2>(context, maxTiles, "interactingTiles");
+    interactingAtoms = OpenCLArray::create<cl_int>(context, OpenCLContext::TileSize*maxTiles, "interactingAtoms");
+    forceKernel.setArg<cl::Buffer>(7, interactingTiles->getDeviceBuffer());
+    forceKernel.setArg<cl_uint>(11, maxTiles);
+    forceKernel.setArg<cl::Buffer>(13, interactingAtoms->getDeviceBuffer());
+    findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactingTiles->getDeviceBuffer());
+    findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingAtoms->getDeviceBuffer());
+    findInteractingBlocksKernel.setArg<cl_uint>(8, maxTiles);
+    int numAtoms = context.getNumAtoms();
+    if (context.getUseDoublePrecision()) {
+        vector<mm_double4> oldPositionsVec(numAtoms, mm_double4(1e30, 1e30, 1e30, 0));
+        oldPositions->upload(oldPositionsVec);
+    }
+    else {
+        vector<mm_float4> oldPositionsVec(numAtoms, mm_float4(1e30f, 1e30f, 1e30f, 0));
+        oldPositions->upload(oldPositionsVec);
    }
 }

-void OpenCLNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
-    this->startTileIndex = startTileIndex;
-    this->numTiles = numTiles;
-    if (kernelSource.size() == 0)
-        return; // There are no nonbonded interactions in the System.
-    forceKernel.setArg<cl_uint>(6, startTileIndex);
-    forceKernel.setArg<cl_uint>(7, startTileIndex+numTiles);
-    if (useCutoff) {
-        findInteractingBlocksKernel.setArg<cl_uint>(10, startTileIndex);
-        findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles);
+void OpenCLNonbondedUtilities::setUsePadding(bool padding) {
+    usePadding = padding;
+}
+
+void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double endFraction) {
+    int numAtomBlocks = context.getNumAtomBlocks();
+    startBlockIndex = (int) (startFraction*numAtomBlocks);
+    numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
+    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
+    startTileIndex = (int) (startFraction*totalTiles);;
+    numTiles = (int) (endFraction*totalTiles)-startTileIndex;
+    if (useCutoff && interactingTiles != NULL) {
+        // We are using a cutoff, and the kernels have already been created.
+        
+        forceKernel.setArg<cl_uint>(5, startTileIndex);
+        forceKernel.setArg<cl_uint>(6, numTiles);
+        findInteractingBlocksKernel.setArg<cl_uint>(9, startBlockIndex);
+        findInteractingBlocksKernel.setArg<cl_uint>(10, numBlocks);
    }
-    else
-        forceKernel.setArg<cl_uint>(8, numTiles);
 }

 cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const {
@@ -510,8 +554,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    }
    replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
    map<string, string> defines;
-    if (forceBufferPerAtomBlock)
-        defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
    if (useCutoff)
        defines["USE_CUTOFF"] = "1";
    if (usePeriodic)
@@ -525,15 +567,21 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
+    defines["TILE_SIZE"] = context.intToString(OpenCLContext::TileSize);
+    int numExclusionTiles = exclusionTiles->getSize();
+    defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(numExclusionTiles);
+    int numContexts = context.getPlatformData().contexts.size();
+    int startExclusionIndex = context.getContextIndex()*numExclusionTiles/numContexts;
+    int endExclusionIndex = (context.getContextIndex()+1)*numExclusionTiles/numContexts;
+    defines["FIRST_EXCLUSION_TILE"] = context.intToString(startExclusionIndex);
+    defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
    if ((localDataSize/4)%2 == 0)
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
    string file;
    if (deviceIsCpu)
        file = OpenCLKernelSources::nonbonded_cpu;
-    else if (context.getSIMDWidth() == 32)
-        file = OpenCLKernelSources::nonbonded_nvidia;
    else
-        file = OpenCLKernelSources::nonbonded_default;
+        file = OpenCLKernelSources::nonbonded;
    cl::Program program = context.createProgram(context.replaceStrings(file, replacements), defines);
    cl::Kernel kernel(program, "computeNonbonded");

@@ -547,19 +595,16 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
    kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
    kernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer());
-    kernel.setArg<cl::Buffer>(index++, exclusionIndices->getDeviceBuffer());
-    kernel.setArg<cl::Buffer>(index++, exclusionRowIndices->getDeviceBuffer());
+    kernel.setArg<cl::Buffer>(index++, exclusionTiles->getDeviceBuffer());
    kernel.setArg<cl_uint>(index++, startTileIndex);
-    kernel.setArg<cl_uint>(index++, startTileIndex+numTiles);
+    kernel.setArg<cl_uint>(index++, numTiles);
    if (useCutoff) {
        kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer());
        kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer());
        index += 2; // The periodic box size arguments are set when the kernel is executed.
        kernel.setArg<cl_uint>(index++, interactingTiles->getSize());
-        kernel.setArg<cl::Buffer>(index++, interactionFlags->getDeviceBuffer());
-    }
-    else {
-        kernel.setArg<cl_uint>(index++, numTiles);
+        kernel.setArg<cl::Buffer>(index++, blockCenter->getDeviceBuffer());
+        kernel.setArg<cl::Buffer>(index++, interactingAtoms->getDeviceBuffer());
    }
    for (int i = 0; i < (int) params.size(); i++) {
        kernel.setArg<cl::Memory>(index++, params[i].getMemory());

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2010 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,6 +35,8 @@
 #include <vector>

 namespace OpenMM {
+    
+class OpenCLSort;

 /**
 * This class provides a generic interface for calculating nonbonded interactions.  It does this in two
@@ -120,12 +122,6 @@ public:
    bool getUsePeriodic() {
        return usePeriodic;
    }
-    /**
-     * Get whether there is one force buffer per atom block.
-     */
-    bool getForceBufferPerAtomBlock() {
-        return forceBufferPerAtomBlock;
-    }
    /**
     * Get the number of work groups used for computing nonbonded forces.
     */
@@ -193,10 +189,10 @@ public:
        return *interactingTiles;
    }
    /**
-     * Get the array containing flags for tiles with interactions.
+     * Get the array containing the atoms in each tile with interactions.
     */
-    OpenCLArray& getInteractionFlags() {
-        return *interactionFlags;
+    OpenCLArray& getInteractingAtoms() {
+        return *interactingAtoms;
    }
    /**
     * Get the array containing exclusion flags.
@@ -204,6 +200,12 @@ public:
    OpenCLArray& getExclusions() {
        return *exclusions;
    }
+    /**
+     * Get the array containing tiles with exclusions.
+     */
+    OpenCLArray& getExclusionTiles() {
+        return *exclusionTiles;
+    }
    /**
     * Get the array containing the index into the exclusion array for each tile.
     */
@@ -229,9 +231,17 @@ public:
        return numTiles;
    }
    /**
-     * Set the range of tiles that should be processed by this context.
+     * Set whether to add padding to the cutoff distance when building the neighbor list.
+     * This increases the size of the neighbor list (and thus the cost of computing interactions),
+     * but also means we don't need to rebuild it every time step.  The default value is true,
+     * since usually this improves performance.  For very expensive interactions, however,
+     * it may be better to set this to false.
+     */
+    void setUsePadding(bool padding);
+    /**
+     * Set the range of atom blocks and tiles that should be processed by this context.
     */
-    void setTileRange(int startTileIndex, int numTiles);
+    void setAtomBlockRange(double startFraction, double endFraction);
    /**
     * Create a Kernel for evaluating a nonbonded interaction.  Cutoffs and periodic boundary conditions
     * are assumed to be the same as those for the default interaction Kernel, since this kernel will use
@@ -245,28 +255,36 @@ public:
     */
    cl::Kernel createInteractionKernel(const std::string& source, const std::vector<ParameterInfo>& params, const std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const;
 private:
-    static int findExclusionIndex(int x, int y, const std::vector<cl_uint>& exclusionIndices, const std::vector<cl_uint>& exclusionRowIndices);
+    class BlockSortTrait;
    OpenCLContext& context;
    cl::Kernel forceKernel;
    cl::Kernel findBlockBoundsKernel;
+    cl::Kernel sortBoxDataKernel;
    cl::Kernel findInteractingBlocksKernel;
    cl::Kernel findInteractionsWithinBlocksKernel;
+    OpenCLArray* exclusionTiles;
    OpenCLArray* exclusions;
    OpenCLArray* exclusionIndices;
    OpenCLArray* exclusionRowIndices;
    OpenCLArray* interactingTiles;
-    OpenCLArray* interactionFlags;
+    OpenCLArray* interactingAtoms;
    OpenCLArray* interactionCount;
    OpenCLArray* blockCenter;
    OpenCLArray* blockBoundingBox;
+    OpenCLArray* sortedBlocks;
+    OpenCLArray* sortedBlockCenter;
+    OpenCLArray* sortedBlockBoundingBox;
+    OpenCLArray* oldPositions;
+    OpenCLArray* rebuildNeighborList;
+    OpenCLSort* blockSorter;
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
    std::vector<ParameterInfo> arguments;
    std::string kernelSource;
    std::map<std::string, std::string> kernelDefines;
    double cutoff;
-    bool useCutoff, usePeriodic, forceBufferPerAtomBlock, deviceIsCpu, anyExclusions;
-    int numForceBuffers, startTileIndex, numTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup;
+    bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding;
+    int numForceBuffers, startTileIndex, numTiles, startBlockIndex, numBlocks, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup;
 };

 /**

--- a/platforms/opencl/src/OpenCLParallelKernels.cpp
+++ b/platforms/opencl/src/OpenCLParallelKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -108,7 +108,7 @@ private:
 };

 OpenCLParallelCalcForcesAndEnergyKernel::OpenCLParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, OpenCLPlatform::PlatformData& data) :
-        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextTiles(data.contexts.size()), contextForces(NULL),
+        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), contextForces(NULL),
        pinnedPositionBuffer(NULL), pinnedPositionMemory(NULL), pinnedForceBuffer(NULL), pinnedForceMemory(NULL) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
        kernels.push_back(Kernel(new OpenCLCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
@@ -126,6 +126,8 @@ OpenCLParallelCalcForcesAndEnergyKernel::~OpenCLParallelCalcForcesAndEnergyKerne
 void OpenCLParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system);
+    for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
+        contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
 }

 void OpenCLParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
@@ -172,30 +174,26 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
                numAtoms*(data.contexts.size()-1)*elementSize, pinnedForceMemory);
        cl.reduceBuffer(*contextForces, data.contexts.size());
        
-        // Balance work between the contexts by transferring a few nonbonded tiles from the context that
+        // Balance work between the contexts by transferring a little nonbonded work from the context that
        // finished last to the one that finished first.
        
        int firstIndex = 0, lastIndex = 0;
-        int totalTiles = 0;
        for (int i = 0; i < (int) completionTimes.size(); i++) {
            if (completionTimes[i] < completionTimes[firstIndex])
                firstIndex = i;
            if (completionTimes[i] > completionTimes[lastIndex])
                lastIndex = i;
-            contextTiles[i] = data.contexts[i]->getNonbondedUtilities().getNumTiles();
-            totalTiles += contextTiles[i];
        }
-        int tilesToTransfer = totalTiles/1000;
-        if (tilesToTransfer < 1)
-            tilesToTransfer = 1;
-        if (tilesToTransfer > contextTiles[lastIndex])
-            tilesToTransfer = contextTiles[lastIndex];
-        contextTiles[firstIndex] += tilesToTransfer;
-        contextTiles[lastIndex] -= tilesToTransfer;
-        int startIndex = 0;
-        for (int i = 0; i < (int) contextTiles.size(); i++) {
-            data.contexts[i]->getNonbondedUtilities().setTileRange(startIndex, contextTiles[i]);
-            startIndex += contextTiles[i];
+        double fractionToTransfer = min(0.001, contextNonbondedFractions[lastIndex]);
+        contextNonbondedFractions[firstIndex] += fractionToTransfer;
+        contextNonbondedFractions[lastIndex] -= fractionToTransfer;
+        double startFraction = 0.0;
+        for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) {
+            double endFraction = startFraction+contextNonbondedFractions[i];
+            if (i == contextNonbondedFractions.size()-1)
+                endFraction = 1.0; // Avoid roundoff error
+            data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
+            startFraction = endFraction;
        }
    }
    return energy;

--- a/platforms/opencl/src/OpenCLParallelKernels.h
+++ b/platforms/opencl/src/OpenCLParallelKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011 Stanford University and the Authors.           *
+ * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -80,7 +80,7 @@ private:
    OpenCLPlatform::PlatformData& data;
    std::vector<Kernel> kernels;
    std::vector<long long> completionTimes;
-    std::vector<int> contextTiles;
+    std::vector<double> contextNonbondedFractions;
    OpenCLArray* contextForces;
    cl::Buffer* pinnedPositionBuffer;
    cl::Buffer* pinnedForceBuffer;