Merge pull request #609 from peastman/nb

Rewrote the neighbor list generation kernel to be simpler and (slightly) faster

Merge pull request #609 from peastman/nb
Rewrote the neighbor list generation kernel to be simpler and (slightly) faster
80be998e · peastman · f9e24fea · 68984320 · 80be998e · 80be998e
Commit 80be998e authored Sep 09, 2014 by peastman
4 changed files
--- a/platforms/cuda/src/kernels/findInteractingBlocks.cu
+++ b/platforms/cuda/src/kernels/findInteractingBlocks.cu
 #define GROUP_SIZE 256
-#define BUFFER_GROUPS 2
+#define BUFFER_SIZE 256
-#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
-#define WARP_SIZE 32
-#define INVALID 0xFFFF
 /**
 * Find a bounding box for the atoms in each block.
@@ -70,220 +67,27 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
    }
 }
-/**
- * Perform a parallel prefix sum over an array.  The input values are all assumed to be 0 or 1.
- */
-__device__ void prefixSum(short* sum, ushort2* temp) {
-#if __CUDA_ARCH__ >= 300
-    const int indexInWarp = threadIdx.x%WARP_SIZE;
-    const int warpMask = (2<<indexInWarp)-1;
-    for (int base = 0; base < BUFFER_SIZE; base += blockDim.x)
-        temp[base+threadIdx.x].x = __popc(__ballot(sum[base+threadIdx.x])&warpMask);
-    __syncthreads();
-    if (threadIdx.x < BUFFER_SIZE/WARP_SIZE) {
-        int multiWarpSum = temp[(threadIdx.x+1)*WARP_SIZE-1].x;
-        for (int offset = 1; offset < BUFFER_SIZE/WARP_SIZE; offset *= 2) {
-            short n = __shfl_up(multiWarpSum, offset, WARP_SIZE);
-            if (indexInWarp >= offset)
-                multiWarpSum += n;
-        }
-        temp[threadIdx.x].y = multiWarpSum;
-    }
-    __syncthreads();
-    for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
-        sum[i] = temp[i].x+(i < WARP_SIZE ? 0 : temp[i/WARP_SIZE-1].y);
-    __syncthreads();
-#else
-    for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
-        temp[i].x = sum[i];
-    __syncthreads();
-    int whichBuffer = 0;
-    for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
-        if (whichBuffer == 0)
-            for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
-                temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
-        else
-            for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
-                temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
-        whichBuffer = 1-whichBuffer;
-        __syncthreads();
-    }
-    if (whichBuffer == 0)
-        for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
-            sum[i] = temp[i].x;
-    else
-        for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
-            sum[i] = temp[i].y;
-    __syncthreads();
-#endif
-}
-/**
- * This is called by findBlocksWithInteractions(). It compacts the list of blocks, identifies interactions
- * in them, and writes the result to global memory.
- */
-__device__ void storeInteractionData(int x, unsigned short* buffer, short* sum, ushort2* temp, int* atoms, int& numAtoms,
-            int& baseIndex, unsigned int* interactionCount, int* interactingTiles, unsigned int* interactingAtoms, real4 periodicBoxSize,
-            real4 invPeriodicBoxSize, const real4* posq, real3* posBuffer, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
-    const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
-                                     0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
-                                     0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
-    if (threadIdx.x < TILE_SIZE) {
-        real3 pos = trimTo3(posq[x*TILE_SIZE+threadIdx.x]);
-        posBuffer[threadIdx.x] = pos;
-#ifdef USE_PERIODIC
-        if (singlePeriodicCopy) {
-            // The box is small enough that we can just translate all the atoms into a single periodic
-            // box, then skip having to apply periodic boundary conditions later.
-            pos.x -= floor((pos.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-            pos.y -= floor((pos.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-            pos.z -= floor((pos.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-            posBuffer[threadIdx.x] = pos;
-        }
-#endif
-    }
-    // The buffer is full, so we need to compact it and write out results.  Start by doing a parallel prefix sum.
-    for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
-        sum[i] = (buffer[i] == INVALID ? 0 : 1);
-    __syncthreads();
-    prefixSum(sum, temp);
-    int numValid = sum[BUFFER_SIZE-1];
-    // Compact the buffer.
-    for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
-        if (buffer[i] != INVALID)
-            temp[sum[i]-1].x = buffer[i];
-    __syncthreads();
-    for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
-        buffer[i] = temp[i].x;
-    __syncthreads();
-    // Loop over the tiles and find specific interactions in them.
-    const int indexInWarp = threadIdx.x%WARP_SIZE;
-    for (int base = 0; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
-        for (int i = threadIdx.x/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
-            // Check each atom in block Y for interactions.
-            real3 pos = trimTo3(posq[buffer[base+i]*TILE_SIZE+indexInWarp]);
-#ifdef USE_PERIODIC
-            if (singlePeriodicCopy) {
-                pos.x -= floor((pos.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                pos.y -= floor((pos.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                pos.z -= floor((pos.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-            }
-#endif
-            bool interacts = false;
-#ifdef USE_PERIODIC
-            if (!singlePeriodicCopy) {
-                for (int j = 0; j < TILE_SIZE; j++) {
-                    real3 delta = pos-posBuffer[j];
-                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-                    interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
-                }
-            }
-            else {
-#endif
-                for (int j = 0; j < TILE_SIZE; j++) {
-                    real3 delta = pos-posBuffer[j];
-                    interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
-                }
-#ifdef USE_PERIODIC
-            }
-#endif
-            sum[i*WARP_SIZE+indexInWarp] = (interacts ? 1 : 0);
-        }
-        for (int i = numValid-base+threadIdx.x/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
-            sum[i*WARP_SIZE+indexInWarp] = 0;
-        // Compact the list of atoms.
-        __syncthreads();
-        prefixSum(sum, temp);
-        for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
-            if (sum[i] != (i == 0 ? 0 : sum[i-1]))
-                atoms[numAtoms+sum[i]-1] = buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp;
-        // Store them to global memory.
-        int atomsToStore = numAtoms+sum[BUFFER_SIZE-1];
-        bool storePartialTile = (finish && base >= numValid-BUFFER_SIZE/WARP_SIZE);
-        int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
-        if (tilesToStore > 0) {
-            if (threadIdx.x == 0)
-                baseIndex = atomicAdd(interactionCount, tilesToStore);
-            __syncthreads();
-            if (threadIdx.x == 0)
-                numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
-            if (baseIndex+tilesToStore <= maxTiles) {
-                if (threadIdx.x < tilesToStore)
-                    interactingTiles[baseIndex+threadIdx.x] = x;
-                for (int i = threadIdx.x; i < tilesToStore*TILE_SIZE; i += blockDim.x)
-                    interactingAtoms[baseIndex*TILE_SIZE+i] = (i < atomsToStore ? atoms[i] : NUM_ATOMS);
-            }
-        }
-        else {
-            __syncthreads();
-            if (threadIdx.x == 0)
-                numAtoms += sum[BUFFER_SIZE-1];
-        }
-        __syncthreads();
-        if (threadIdx.x < numAtoms && !storePartialTile)
-            atoms[threadIdx.x] = atoms[tilesToStore*TILE_SIZE+threadIdx.x];
-    }
-    if (numValid == 0 && numAtoms > 0 && finish) {
-        // We didn't have any more tiles to process, but there were some atoms left over from a
-        // previous call to this function.  Save them now.
-        if (threadIdx.x == 0)
-            baseIndex = atomicAdd(interactionCount, 1);
-        __syncthreads();
-        if (baseIndex < maxTiles) {
-            if (threadIdx.x == 0)
-                interactingTiles[baseIndex] = x;
-            if (threadIdx.x < TILE_SIZE)
-                interactingAtoms[baseIndex*TILE_SIZE+threadIdx.x] = (threadIdx.x < numAtoms ? atoms[threadIdx.x] : NUM_ATOMS);
-        }
-    }
-    // Reset the buffer for processing more tiles.
-    for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
-        buffer[i] = INVALID;
-    __syncthreads();
-}
 /**
 * Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
 * atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
 *
 * STAGE 1:
 *
- * A coarse grain atomblock against interacting atomblock neighbourlist is constructed. 
+ * A coarse grained atom block against interacting atom block neighbour list is constructed. 
 *
- * Each threadblock first loads in some block X of interest. Each thread within the threadblock then loads 
+ * Each warp first loads in some block X of interest. Each thread within the warp then loads 
- * in a different atomblock Y. If Y has exclusions with X, then Y is not processed.  If the bounding boxes 
+ * in a different atom block Y. If Y has exclusions with X, then Y is not processed.  If the bounding boxes 
- * of the two atomblocks are within the cutoff distance, then the two atomblocks are considered to be
+ * of the two atom blocks are within the cutoff distance, then the two atom blocks are considered to be
- * interacting and Y is added to the buffer for X. If during any given iteration an atomblock (or thread) 
+ * interacting and Y is added to the buffer for X.
- * finds BUFFER_GROUP interacting blocks, the entire buffer is sent for compaction by storeInteractionData().
 *
 * STAGE 2:
 *
- * A fine grain atomblock against interacting atoms neighbourlist is constructed.
+ * A fine grained atom block against interacting atoms neighbour list is constructed.
 *
- * The input is an atomblock list detailing the interactions with other atomblocks. The list of interacting 
+ * The warp loops over atom blocks Y that were found to (possibly) interact with atom block X.  Each thread
- * atom blocks are initially stored in the buffer array in shared memory. buffer is then compacted using 
+ * in the warp loops over the 32 atoms in X and compares their positions to one particular atom from block Y.
- * prefixSum. Afterwards, each threadblock processes one contiguous atomblock X. Each warp in a threadblock 
+ * If it finds one closer than the cutoff distance, the atom is added to the list of atoms interacting with block X.
- * processes a block Y to find the atoms that interact with any given atom in X. Once BUFFER_SIZE/WARP_SIZE 
+ * This continues until the buffer fills up, at which point the results are written to global memory.
- * (eg. 16) atomblocks have been processed for a given X, the list of interacting atoms in these 16 blocks 
- * are subsequently compacted. The process repeats until all atomblocks that interact with X are computed.
 *
 * [in] periodicBoxSize        - size of the rectangular periodic box
 * [in] invPeriodicBoxSize     - inverse of the periodic box
@@ -317,87 +121,167 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
        unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const real4* __restrict__ sortedBlockBoundingBox,
        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, real4* __restrict__ oldPositions,
        const int* __restrict__ rebuildNeighborList) {
-    __shared__ unsigned short buffer[BUFFER_SIZE];
-    __shared__ short sum[BUFFER_SIZE];
-    __shared__ ushort2 temp[BUFFER_SIZE];
-    __shared__ int atoms[BUFFER_SIZE+TILE_SIZE];
-    __shared__ real3 posBuffer[TILE_SIZE];
-    __shared__ int exclusionsForX[MAX_EXCLUSIONS];
-    __shared__ int bufferFull;
-    __shared__ int globalIndex;
-    __shared__ int numAtoms;
    if (rebuildNeighborList[0] == 0)
        return; // The neighbor list doesn't need to be rebuilt.
-    int valuesInBuffer = 0;
+    const int indexInWarp = threadIdx.x%32;
-    if (threadIdx.x == 0)
+    const int warpStart = threadIdx.x-indexInWarp;
-        bufferFull = false;
+    const int totalWarps = blockDim.x*gridDim.x/32;
-    for (int i = 0; i < BUFFER_GROUPS; ++i)
+    const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/32;
-        buffer[i*GROUP_SIZE+threadIdx.x] = INVALID;
+    const int warpMask = (1<<indexInWarp)-1;
-    __syncthreads();
+    __shared__ int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
+    __shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
-    // Loop over blocks sorted by size.
+    __shared__ real3 posBuffer[GROUP_SIZE];
+    __shared__ volatile int workgroupTileIndex[GROUP_SIZE/32];
-    for (int i = startBlockIndex+blockIdx.x; i < startBlockIndex+numBlocks; i += gridDim.x) {
+    int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
-        if (threadIdx.x == blockDim.x-1)
+    int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
-            numAtoms = 0;
+    volatile int& tileStartIndex = workgroupTileIndex[warpStart/32];
-        real2 sortedKey = sortedBlocks[i];
+    // Loop over blocks.
+    for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
+        // Load data for this block.  Note that all threads in a warp are processing the same block.
+        real2 sortedKey = sortedBlocks[block1];
        int x = (int) sortedKey.y;
-        real4 blockCenterX = sortedBlockCenter[i];
+        real4 blockCenterX = sortedBlockCenter[block1];
-        real4 blockSizeX = sortedBlockBoundingBox[i];
+        real4 blockSizeX = sortedBlockBoundingBox[block1];
+        int neighborsInBuffer = 0;
+        real3 pos1 = trimTo3(posq[x*TILE_SIZE+indexInWarp]);
+#ifdef USE_PERIODIC
+        const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
+                                         0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
+                                         0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
+        if (singlePeriodicCopy) {
+            // The box is small enough that we can just translate all the atoms into a single periodic
+            // box, then skip having to apply periodic boundary conditions later.
+            pos1.x -= floor((pos1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+            pos1.y -= floor((pos1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+            pos1.z -= floor((pos1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+        }
+#endif
+        posBuffer[threadIdx.x] = pos1;
        // Load exclusion data for block x.
        const int exclusionStart = exclusionRowIndices[x];
        const int exclusionEnd = exclusionRowIndices[x+1];
        const int numExclusions = exclusionEnd-exclusionStart;
-        for (int j = threadIdx.x; j < numExclusions; j += blockDim.x)
+        for (int j = indexInWarp; j < numExclusions; j += 32)
            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
+        if (MAX_EXCLUSIONS > 32)
            __syncthreads();
-        // Compare it to other blocks after this one in sorted order.
+        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
+        // other blocks in parallel.
+        for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) {
+            int block2 = block2Base+indexInWarp;
+            bool includeBlock2 = (block2 < NUM_BLOCKS);
+            if (includeBlock2) {
+                real4 blockCenterY = (block2 < NUM_BLOCKS ? sortedBlockCenter[block2] : make_real4(0));
+                real4 blockSizeY = (block2 < NUM_BLOCKS ? sortedBlockBoundingBox[block2] : make_real4(0));
+                real4 blockDelta = blockCenterX-blockCenterY;
+#ifdef USE_PERIODIC
+                blockDelta.x -= floor(blockDelta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                blockDelta.y -= floor(blockDelta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                blockDelta.z -= floor(blockDelta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
+                blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
+                blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
+                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
+                if (includeBlock2) {
+                    unsigned short y = (unsigned short) sortedBlocks[block2].y;
+                    for (int k = 0; k < numExclusions; k++)
+                        includeBlock2 &= (exclusionsForX[k] != y);
+                }
+            }
+            // Loop over any blocks we identified as potentially containing neighbors.
+            int includeBlockFlags = __ballot(includeBlock2);
+            while (includeBlockFlags != 0) {
+                int i = __ffs(includeBlockFlags)-1;
+                includeBlockFlags &= includeBlockFlags-1;
+                unsigned short y = (unsigned short) sortedBlocks[block2Base+i].y;
-        for (int base = i+1; base < NUM_BLOCKS; base += blockDim.x) {
+                // Check each atom in block Y for interactions.
-            int j = base+threadIdx.x;
-            real2 sortedKey2 = (j < NUM_BLOCKS ? sortedBlocks[j] : make_real2(0));
+                int start = y*TILE_SIZE;
-            real4 blockCenterY = (j < NUM_BLOCKS ? sortedBlockCenter[j] : make_real4(0));
+                int atom2 = start+indexInWarp;
-            real4 blockSizeY = (j < NUM_BLOCKS ? sortedBlockBoundingBox[j] : make_real4(0));
+                real3 pos2 = trimTo3(posq[atom2]);
-            unsigned short y = (unsigned short) sortedKey2.y;
-            real4 delta = blockCenterX-blockCenterY;
 #ifdef USE_PERIODIC
+                if (singlePeriodicCopy) {
+                    pos2.x -= floor((pos2.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    pos2.y -= floor((pos2.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    pos2.z -= floor((pos2.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                }
+#endif
+                bool interacts = false;
+                if (atom2 < NUM_ATOMS) {
+#ifdef USE_PERIODIC
+                    if (!singlePeriodicCopy) {
+                        for (int j = 0; j < TILE_SIZE; j++) {
+                            real3 delta = pos2-posBuffer[warpStart+j];
                            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
+                        }
+                    }
+                    else {
 #endif
-            delta.x = max(0.0f, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
+                        for (int j = 0; j < TILE_SIZE; j++) {
-            delta.y = max(0.0f, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
+                            real3 delta = pos2-posBuffer[warpStart+j];
-            delta.z = max(0.0f, fabs(delta.z)-blockSizeX.z-blockSizeY.z);
+                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
-            bool hasExclusions = false;
+                        }
-            for (int k = 0; k < numExclusions; k++)
+#ifdef USE_PERIODIC
-                hasExclusions |= (exclusionsForX[k] == y);
+                    }
-            if (j < NUM_BLOCKS && delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED && !hasExclusions) {
+#endif
-                // Add this tile to the buffer.
+                }
-                int bufferIndex = valuesInBuffer*GROUP_SIZE+threadIdx.x;
-                buffer[bufferIndex] = y;
-                valuesInBuffer++;
-                // cuda-memcheck --tool racecheck will throw errors about this as 
+                // Add any interacting atoms to the buffer.
-                // RAW/WAW/WAR race condition errors. But this is safe in all instances
-                if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
+                int includeAtomFlags = __ballot(interacts);
-                    bufferFull = true;
+                if (interacts)
+                    buffer[neighborsInBuffer+__popc(includeAtomFlags&warpMask)] = atom2;
+                neighborsInBuffer += __popc(includeAtomFlags);
+                if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
+                    // Store the new tiles to memory.
+                    int tilesToStore = neighborsInBuffer/TILE_SIZE;
+                    if (indexInWarp == 0)
+                        tileStartIndex = atomicAdd(interactionCount, tilesToStore);
+                    int newTileStartIndex = tileStartIndex;
+                    if (newTileStartIndex+tilesToStore < maxTiles) {
+                        if (indexInWarp < tilesToStore)
+                            interactingTiles[newTileStartIndex+indexInWarp] = x;
+                        for (int j = 0; j < tilesToStore; j++)
+                            interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
                    }
-            __syncthreads();
+                    buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
-            if (bufferFull) {
+                    neighborsInBuffer -= TILE_SIZE*tilesToStore;
-                storeInteractionData(x, buffer, sum, temp, atoms, numAtoms, globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, false);
+                }
-                valuesInBuffer = 0;
+            }
-                if (threadIdx.x == 0)
+        }
-                    bufferFull = false;
+        // If we have a partially filled buffer,  store it to memory.
+        if (neighborsInBuffer > 0) {
+            int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
+            if (indexInWarp == 0)
+                tileStartIndex = atomicAdd(interactionCount, tilesToStore);
+            int newTileStartIndex = tileStartIndex;
+            if (newTileStartIndex+tilesToStore < maxTiles) {
+                if (indexInWarp < tilesToStore)
+                    interactingTiles[newTileStartIndex+indexInWarp] = x;
+                for (int j = 0; j < tilesToStore; j++)
+                    interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = (indexInWarp+j*TILE_SIZE < neighborsInBuffer ? buffer[indexInWarp+j*TILE_SIZE] : NUM_ATOMS);
            }
-            __syncthreads();
        }
-        storeInteractionData(x, buffer, sum, temp, atoms, numAtoms, globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, true);
    }
    // Record the positions the neighbor list is based on.

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -471,21 +471,25 @@ cl::Program OpenCLContext::createProgram(const string source, const map<string,
    if (useDoublePrecision) {
        src << "typedef double real;\n";
        src << "typedef double2 real2;\n";
+        src << "typedef double3 real3;\n";
        src << "typedef double4 real4;\n";
    }
    else {
        src << "typedef float real;\n";
        src << "typedef float2 real2;\n";
+        src << "typedef float3 real3;\n";
        src << "typedef float4 real4;\n";
    }
    if (useDoublePrecision || useMixedPrecision) {
        src << "typedef double mixed;\n";
        src << "typedef double2 mixed2;\n";
+        src << "typedef double3 mixed3;\n";
        src << "typedef double4 mixed4;\n";
    }
    else {
        src << "typedef float mixed;\n";
        src << "typedef float2 mixed2;\n";
+        src << "typedef float3 mixed3;\n";
        src << "typedef float4 mixed4;\n";
    }
    for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -319,7 +319,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
        defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
        defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2");
        string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks);
-        int groupSize = (deviceIsCpu ? 32 : 128);
+        int groupSize = (deviceIsCpu || context.getSIMDWidth() < 32 ? 32 : 256);
        while (true) {
            defines["GROUP_SIZE"] = context.intToString(groupSize);
            cl::Program interactingBlocksProgram = context.createProgram(file, defines);

--- a/platforms/opencl/src/kernels/findInteractingBlocks.cl
+++ b/platforms/opencl/src/kernels/findInteractingBlocks.cl
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
-#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
+#define BUFFER_SIZE 256
-#define WARP_SIZE 32
-#define INVALID 0xFFFF
 /**
 * Find a bounding box for the atoms in each block.
@@ -67,92 +65,115 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
    }
 }
-/**
+__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, __global unsigned int* restrict interactionCount,
- * Perform a parallel prefix sum over an array.  The input values are all assumed to be 0 or 1.
+        __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex,
- */
+        unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
-void prefixSum(__local short* sum, __local ushort2* temp) {
+        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
-    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+        __global const int* restrict rebuildNeighborList) {
-        temp[i].x = sum[i];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    int whichBuffer = 0;
-    for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
-        if (whichBuffer == 0)
-            for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
-                temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
-        else
-            for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
-                temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
-        whichBuffer = 1-whichBuffer;
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (whichBuffer == 0)
-        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
-            sum[i] = temp[i].x;
-    else
-        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
-            sum[i] = temp[i].y;
-    barrier(CLK_LOCAL_MEM_FENCE);
-}
-/**
+    if (rebuildNeighborList[0] == 0)
- * This is called by findBlocksWithInteractions().  It compacts the list of blocks, identifies interactions
+        return; // The neighbor list doesn't need to be rebuilt.
- * in them, and writes the result to global memory.
- */
+    const int indexInWarp = get_local_id(0)%32;
-void storeInteractionData(int x, __local unsigned short* buffer, __local short* sum, __local ushort2* temp, __local int* atoms, __local int* numAtoms,
+    const int warpStart = get_local_id(0)-indexInWarp;
-            __local int* baseIndex, __global unsigned int* interactionCount, __global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize,
+    const int totalWarps = get_global_size(0)/32;
-            real4 invPeriodicBoxSize, __global const real4* posq, __local real4* posBuffer, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
+    const int warpIndex = get_global_id(0)/32;
+    const int warpMask = (1<<indexInWarp)-1;
+    __local int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
+    __local int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
+    __local real3 posBuffer[GROUP_SIZE];
+    __local volatile int workgroupTileIndex[GROUP_SIZE/32];
+    __local bool includeBlockFlags[GROUP_SIZE];
+    __local short2 atomCountBuffer[GROUP_SIZE];
+    __local int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
+    __local int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
+    __local volatile int* tileStartIndex = workgroupTileIndex+(warpStart/32);
+    // Loop over blocks.
+    for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
+        // Load data for this block.  Note that all threads in a warp are processing the same block.
+        real2 sortedKey = sortedBlocks[block1];
+        int x = (int) sortedKey.y;
+        real4 blockCenterX = sortedBlockCenter[block1];
+        real4 blockSizeX = sortedBlockBoundingBox[block1];
+        int neighborsInBuffer = 0;
+        real3 pos1 = posq[x*TILE_SIZE+indexInWarp].xyz;
+#ifdef USE_PERIODIC
        const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
-    if (get_local_id(0) < TILE_SIZE) {
-        real4 pos = posq[x*TILE_SIZE+get_local_id(0)];
-#ifdef USE_PERIODIC
        if (singlePeriodicCopy) {
            // The box is small enough that we can just translate all the atoms into a single periodic
            // box, then skip having to apply periodic boundary conditions later.
-            pos.xyz -= floor((pos.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+            pos1.xyz -= floor((pos1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
        }
 #endif
-        posBuffer[get_local_id(0)] = pos;
+        posBuffer[get_local_id(0)] = pos1;
-    }
-    // The buffer is full, so we need to compact it and write out results.  Start by doing a parallel prefix sum.
+        // Load exclusion data for block x.
+        const int exclusionStart = exclusionRowIndices[x];
+        const int exclusionEnd = exclusionRowIndices[x+1];
+        const int numExclusions = exclusionEnd-exclusionStart;
+        for (int j = indexInWarp; j < numExclusions; j += 32)
+            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
+        if (MAX_EXCLUSIONS > 32)
            barrier(CLK_LOCAL_MEM_FENCE);
-    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+        else
-        sum[i] = (buffer[i] == INVALID ? 0 : 1);
+            SYNC_WARPS;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    prefixSum(sum, temp);
+        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
-    int numValid = sum[BUFFER_SIZE-1];
+        // other blocks in parallel.
-    // Compact the buffer.
+        for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) {
+            int block2 = block2Base+indexInWarp;
+            bool includeBlock2 = (block2 < NUM_BLOCKS);
+            if (includeBlock2) {
+                real4 blockCenterY = (block2 < NUM_BLOCKS ? sortedBlockCenter[block2] : (real4) (0));
+                real4 blockSizeY = (block2 < NUM_BLOCKS ? sortedBlockBoundingBox[block2] : (real4) (0));
+                real4 blockDelta = blockCenterX-blockCenterY;
+#ifdef USE_PERIODIC
+                blockDelta.xyz -= floor(blockDelta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
+                blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
+                blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
+                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
+                if (includeBlock2) {
+                    unsigned short y = (unsigned short) sortedBlocks[block2].y;
+                    for (int k = 0; k < numExclusions; k++)
+                        includeBlock2 &= (exclusionsForX[k] != y);
+                }
+            }
-    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+            // Loop over any blocks we identified as potentially containing neighbors.
-        if (buffer[i] != INVALID)
-            temp[sum[i]-1].x = buffer[i];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
-        buffer[i] = temp[i].x;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    // Loop over the tiles and find specific interactions in them.
+            includeBlockFlags[get_local_id(0)] = includeBlock2;
+            SYNC_WARPS;
+            for (int i = 0; i < TILE_SIZE; i++) {
+                while (i < TILE_SIZE && !includeBlockFlags[warpStart+i])
+                    i++;
+                if (i < TILE_SIZE) {
+                    unsigned short y = (unsigned short) sortedBlocks[block2Base+i].y;
-    const int indexInWarp = get_local_id(0)%WARP_SIZE;
-    for (int base = 0; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
-        for (int i = get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
                    // Check each atom in block Y for interactions.
-            real4 pos = posq[buffer[base+i]*TILE_SIZE+indexInWarp];
+                    int start = y*TILE_SIZE;
+                    int atom2 = start+indexInWarp;
+                    real3 pos2 = posq[atom2].xyz;
 #ifdef USE_PERIODIC
                    if (singlePeriodicCopy)
-                pos.xyz -= floor((pos.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                        pos2.xyz -= floor((pos2.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
 #endif
                    bool interacts = false;
+                    if (atom2 < NUM_ATOMS) {
 #ifdef USE_PERIODIC
                        if (!singlePeriodicCopy) {
                            for (int j = 0; j < TILE_SIZE; j++) {
-                    real4 delta = pos-posBuffer[j];
+                                real3 delta = pos2-posBuffer[warpStart+j];
                                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
                                interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                            }
@@ -160,170 +181,69 @@ void storeInteractionData(int x, __local unsigned short* buffer, __local short*
                        else {
 #endif
                            for (int j = 0; j < TILE_SIZE; j++) {
-                    real4 delta = pos-posBuffer[j];
+                                real3 delta = pos2-posBuffer[warpStart+j];
                                interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                            }
 #ifdef USE_PERIODIC
                        }
 #endif
-            sum[i*WARP_SIZE+indexInWarp] = (interacts ? 1 : 0);
                    }
-        for (int i = numValid-base+get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
-            sum[i*WARP_SIZE+indexInWarp] = 0;
-        // Compact the list of atoms.
+                    // Do a prefix sum to compact the list of atoms.
-        barrier(CLK_LOCAL_MEM_FENCE);
+                    atomCountBuffer[get_local_id(0)].x = (interacts ? 1 : 0);
-        prefixSum(sum, temp);
+                    SYNC_WARPS;
-        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+                    int whichBuffer = 0;
-            if (sum[i] != (i == 0 ? 0 : sum[i-1]))
+                    for (int offset = 1; offset < TILE_SIZE; offset *= 2) {
-                atoms[*numAtoms+sum[i]-1] = buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp;
+                        if (whichBuffer == 0)
+                            atomCountBuffer[get_local_id(0)].y = (indexInWarp < offset ? atomCountBuffer[get_local_id(0)].x : atomCountBuffer[get_local_id(0)].x+atomCountBuffer[get_local_id(0)-offset].x);
+                        else
+                            atomCountBuffer[get_local_id(0)].x = (indexInWarp < offset ? atomCountBuffer[get_local_id(0)].y : atomCountBuffer[get_local_id(0)].y+atomCountBuffer[get_local_id(0)-offset].y);
+                        whichBuffer = 1-whichBuffer;
+                        SYNC_WARPS;
+                    }
-        // Store them to global memory.
+                    // Add any interacting atoms to the buffer.
-        int atomsToStore = *numAtoms+sum[BUFFER_SIZE-1];
+                    if (interacts)
-        bool storePartialTile = (finish && base >= numValid-BUFFER_SIZE/WARP_SIZE);
+                        buffer[neighborsInBuffer+atomCountBuffer[get_local_id(0)].y-1] = atom2;
-        int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
+                    neighborsInBuffer += atomCountBuffer[warpStart+TILE_SIZE-1].y;
-        if (tilesToStore > 0) {
+                    if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
-            if (get_local_id(0) == 0)
+                        // Store the new tiles to memory.
-                *baseIndex = atom_add(interactionCount, tilesToStore);
-            barrier(CLK_LOCAL_MEM_FENCE);
+                        int tilesToStore = neighborsInBuffer/TILE_SIZE;
-            if (get_local_id(0) == 0)
+                        if (indexInWarp == 0)
-                *numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
+                            *tileStartIndex = atom_add(interactionCount, tilesToStore);
-            if (*baseIndex+tilesToStore <= maxTiles) {
+                        SYNC_WARPS;
-                if (get_local_id(0) < tilesToStore)
+                        int newTileStartIndex = *tileStartIndex;
-                    interactingTiles[*baseIndex+get_local_id(0)] = x;
+                        if (newTileStartIndex+tilesToStore < maxTiles) {
-                for (int i = get_local_id(0); i < tilesToStore*TILE_SIZE; i += get_local_size(0))
+                            if (indexInWarp < tilesToStore)
-                    interactingAtoms[*baseIndex*TILE_SIZE+i] = (i < atomsToStore ? atoms[i] : NUM_ATOMS);
+                                interactingTiles[newTileStartIndex+indexInWarp] = x;
-            }
+                            for (int j = 0; j < tilesToStore; j++)
+                                interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
                        }
-        else {
+                        buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
-            barrier(CLK_LOCAL_MEM_FENCE);
+                        neighborsInBuffer -= TILE_SIZE*tilesToStore;
-            if (get_local_id(0) == 0)
-                *numAtoms += sum[BUFFER_SIZE-1];
                   }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if (get_local_id(0) < *numAtoms && !storePartialTile)
-            atoms[get_local_id(0)] = atoms[tilesToStore*TILE_SIZE+get_local_id(0)];
                }
-    if (numValid == 0 && *numAtoms > 0 && finish) {
-        // We didn't have any more tiles to process, but there were some atoms left over from a
-        // previous call to this function.  Save them now.
-        if (get_local_id(0) == 0)
-            *baseIndex = atom_add(interactionCount, 1);
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if (*baseIndex < maxTiles) {
-            if (get_local_id(0) == 0)
-                interactingTiles[*baseIndex] = x;
-            if (get_local_id(0) < TILE_SIZE)
-                interactingAtoms[*baseIndex*TILE_SIZE+get_local_id(0)] = (get_local_id(0) < *numAtoms ? atoms[get_local_id(0)] : NUM_ATOMS);
            }
        }
-    // Reset the buffer for processing more tiles.
+        // If we have a partially filled buffer,  store it to memory.
-    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
-        buffer[i] = INVALID;
-    barrier(CLK_LOCAL_MEM_FENCE);
-}
-/**
- * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
- * mark them as non-interacting.
- */
-__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, __global unsigned int* restrict interactionCount,
-        __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex,
-        unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
-        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
-        __global const int* restrict rebuildNeighborList) {
-    __local unsigned short buffer[BUFFER_SIZE];
-    __local short sum[BUFFER_SIZE];
-    __local ushort2 temp[BUFFER_SIZE];
-    __local int atoms[BUFFER_SIZE+TILE_SIZE];
-    __local real4 posBuffer[TILE_SIZE];
-    __local int exclusionsForX[MAX_EXCLUSIONS];
-    __local int bufferFull;
-    __local int globalIndex;
-    __local int numAtoms;
-#ifdef AMD_ATOMIC_WORK_AROUND
-    // Do a byte write to force all memory accesses to interactionCount to use the complete path.
-    // This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
-    // The IF actually causes the write to never be executed, its presence is all that is needed.
-    // AMD APP SDK 2.4 has this problem.
-    if (get_global_id(0) == get_local_id(0)+1)
-        ((__global char*)interactionCount)[sizeof(unsigned int)+1] = 0;
-#endif
-    if (rebuildNeighborList[0] == 0)
-        return; // The neighbor list doesn't need to be rebuilt.
-    int valuesInBuffer = 0;
-    if (get_local_id(0) == 0)
-        bufferFull = false;
-    for (int i = 0; i < BUFFER_GROUPS; ++i)
-        buffer[i*GROUP_SIZE+get_local_id(0)] = INVALID;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    // Loop over blocks sorted by size.
-    for (int i = startBlockIndex+get_group_id(0); i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
-        if (get_local_id(0) == get_local_size(0)-1)
-            numAtoms = 0;
-        real2 sortedKey = sortedBlocks[i];
-        int x = (int) sortedKey.y;
-        real4 blockCenterX = sortedBlockCenter[i];
-        real4 blockSizeX = sortedBlockBoundingBox[i];
-        // Load exclusion data for block x.
-        const int exclusionStart = exclusionRowIndices[x];
+        if (neighborsInBuffer > 0) {
-        const int exclusionEnd = exclusionRowIndices[x+1];
+            int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
-        const int numExclusions = exclusionEnd-exclusionStart;
+            if (indexInWarp == 0)
-        for (int j = get_local_id(0); j < numExclusions; j += get_local_size(0))
+                *tileStartIndex = atom_add(interactionCount, tilesToStore);
-            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
+            SYNC_WARPS;
-        barrier(CLK_LOCAL_MEM_FENCE);
+            int newTileStartIndex = *tileStartIndex;
+            if (newTileStartIndex+tilesToStore < maxTiles) {
-        // Compare it to other blocks after this one in sorted order.
+                if (indexInWarp < tilesToStore)
+                    interactingTiles[newTileStartIndex+indexInWarp] = x;
-        for (int base = i+1; base < NUM_BLOCKS; base += get_local_size(0)) {
+                for (int j = 0; j < tilesToStore; j++)
-            int j = base+get_local_id(0);
+                    interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = (indexInWarp+j*TILE_SIZE < neighborsInBuffer ? buffer[indexInWarp+j*TILE_SIZE] : NUM_ATOMS);
-            real2 sortedKey2 = (j < NUM_BLOCKS ? sortedBlocks[j] : (real2) 0);
-            real4 blockCenterY = (j < NUM_BLOCKS ? sortedBlockCenter[j] : (real4) 0);
-            real4 blockSizeY = (j < NUM_BLOCKS ? sortedBlockBoundingBox[j] : (real4) 0);
-            unsigned short y = (unsigned short) sortedKey2.y;
-            real4 delta = blockCenterX-blockCenterY;
-#ifdef USE_PERIODIC
-            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-            delta.x = max((real) 0, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
-            delta.y = max((real) 0, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
-            delta.z = max((real) 0, fabs(delta.z)-blockSizeX.z-blockSizeY.z);
-            bool hasExclusions = false;
-            for (int k = 0; k < numExclusions; k++)
-                hasExclusions |= (exclusionsForX[k] == y);
-            if (j < NUM_BLOCKS && delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED && !hasExclusions) {
-                // Add this tile to the buffer.
-                int bufferIndex = valuesInBuffer*GROUP_SIZE+get_local_id(0);
-                buffer[bufferIndex] = y;
-                valuesInBuffer++;
-                if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
-                    bufferFull = true;
            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (bufferFull) {
-                storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, false);
-                valuesInBuffer = 0;
-                if (get_local_id(0) == 0)
-                    bufferFull = false;
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
        }
-        storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, true);
    }
    // Record the positions the neighbor list is based on.