Merged 5.1Optimizations branch back to trunk

93c467b2 · Peter Eastman · f6d4557d · f6d4557d · 93c467b2 · 93c467b2
Commit 93c467b2 authored Mar 22, 2013 by Peter Eastman
20 changed files
--- a/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#endif
-#define TILE_SIZE 32
-#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
-typedef struct { // Per-work-item local-memory record used by computeBornSum.
-    real x, y, z; // copied from posq[].x/.y/.z
-    real q; // copied from posq[].w
-    float radius, scaledRadius; // copied from global_params[].x and global_params[].y
-    real bornSum; // per-tile accumulator; zeroed for each off-diagonal tile and then added to global_bornSum
-} AtomData1;
-/**
- * Compute the Born sum.
- *
- * Work-items are handled in groups of TILE_SIZE lanes (called "warps" in this code). Each warp
- * walks a contiguous range [pos, end) of tiles; a tile is identified by the block indices (x, y)
- * and pairs the atoms x*TILE_SIZE+lane with the atoms y*TILE_SIZE+lane.
- *
- * Each lane accumulates the sum for its own atom of block x in the private variable bornSum.
- * For off-diagonal tiles (x != y) the contributions to the atoms of block y are accumulated
- * in localData[].bornSum.
- *
- * With USE_CUTOFF, tile coordinates are read from tiles[pos] as long as interactionCount[0]
- * does not exceed maxTiles; otherwise (and always without USE_CUTOFF) they are derived from pos
- * and the kernel iterates over NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles (with USE_CUTOFF) or numTiles
- * tiles (without).
- *
- * Results are written in one of two ways:
- *  - SUPPORTS_64_BIT_ATOMICS: values are scaled by 0x100000000, converted to long and added to
- *    global_bornSum with atom_add.
- *  - otherwise: values are added to the section of global_bornSum that starts at
- *    get_group_id(0)*PADDED_NUM_ATOMS, and reservedBlocks is used so that two warps of the same
- *    work-group do not write the same block at the same time.
- *
- * @param global_bornSum      accumulation buffer for the result (long or real, see above)
- * @param posq                per-atom real4; xyz is used as the position, w is stored as q
- * @param global_params       per-atom float2; x is used as radius, y as scaledRadius
- * @param tiles               (USE_CUTOFF) list of (x, y) tile indices
- * @param interactionCount    (USE_CUTOFF) element 0 is read as the number of tiles
- * @param periodicBoxSize     (USE_CUTOFF) box lengths used to wrap delta when USE_PERIODIC is defined
- * @param invPeriodicBoxSize  (USE_CUTOFF) multiplied with delta before floor() in the same wrapping
- * @param maxTiles            (USE_CUTOFF) if interactionCount[0] exceeds this, the tile list is ignored
- * @param interactionFlags    (USE_CUTOFF) per-tile bitmask; bit j selects column j in the subset path
- * @param numTiles            (no USE_CUTOFF) number of tiles to distribute over the warps
- * @param exclusionIndices    searched for y within the range given by exclusionRowIndices
- * @param exclusionRowIndices entries [x] and [x+1] bound the range of exclusionIndices for row x
- */
-__kernel void computeBornSum(
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict global_bornSum,
-#else
-        __global real* restrict global_bornSum,
-#endif
-        __global const real4* restrict posq, __global const float2* restrict global_params,
-#ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
-#else
-        unsigned int numTiles,
-#endif
-        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices) {
-    unsigned int totalWarps = get_global_size(0)/TILE_SIZE; // number of TILE_SIZE-wide groups in the whole launch
-    unsigned int warp = get_global_id(0)/TILE_SIZE; // index of the group this work-item belongs to
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
-    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-#else
-    unsigned int pos = warp*numTiles/totalWarps;
-    unsigned int end = (warp+1)*numTiles/totalWarps;
-#endif
-    unsigned int lasty = 0xFFFFFFFF; // y of the tile handled in the previous iteration; 0xFFFFFFFF before the first
-    __local AtomData1 localData[FORCE_WORK_GROUP_SIZE];
-    __local real tempBuffer[FORCE_WORK_GROUP_SIZE];
-    __local int2 reservedBlocks[WARPS_PER_GROUP];
-    __local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks; // reuses the reservedBlocks storage, two unsigned ints per warp
-    __local int exclusionIndex[WARPS_PER_GROUP];
-    do {
-        // Extract the coordinates of this tile
-        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // lane index within the warp
-        const unsigned int tbx = get_local_id(0) - tgx; // local id of the warp's first lane
-        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE; // index of the warp within the work-group
-        unsigned int x, y;
-        real bornSum = 0.0f;
-        if (pos < end) {
-#ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
-                ushort2 tileIndices = tiles[pos];
-                x = tileIndices.x;
-                y = tileIndices.y;
-            }
-            else
-#endif
-            {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                }
-            }
-            unsigned int atom1 = x*TILE_SIZE + tgx;
-            real4 posq1 = posq[atom1];
-            float2 params1 = global_params[atom1];
-            if (pos >= end)
-                ; // This warp is done. NOTE(review): unreachable, the enclosing if already requires pos < end.
-            else if (x == y) {
-                // This tile is on the diagonal.
-                localData[get_local_id(0)].x = posq1.x;
-                localData[get_local_id(0)].y = posq1.y;
-                localData[get_local_id(0)].z = posq1.z;
-                localData[get_local_id(0)].q = posq1.w;
-                localData[get_local_id(0)].radius = params1.x;
-                localData[get_local_id(0)].scaledRadius = params1.y;
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    real4 delta = (real4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0);
-#ifdef USE_PERIODIC
-                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
-#else
-                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
-#endif
-                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
-                        float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
-                        real rScaledRadiusJ = r+params2.y;
-                        if ((j != tgx) && (params1.x < rScaledRadiusJ)) { // j != tgx skips the atom's pairing with itself
-                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
-                            real u_ij = RECIP(rScaledRadiusJ);
-                            real l_ij2 = l_ij*l_ij;
-                            real u_ij2 = u_ij*u_ij;
-                            real ratio = LOG(u_ij * RECIP(l_ij));
-                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
-                            if (params1.x < params2.y-r)
-                                bornSum += 2.0f*(RECIP(params1.x)-l_ij);
-                        }
-                    }
-                }
-            }
-            else {
-                // This is an off-diagonal tile.
-                if (lasty != y) { // reload block y's atoms only when y differs from the previous tile's y
-                    unsigned int j = y*TILE_SIZE + tgx;
-                    real4 tempPosq = posq[j];
-                    localData[get_local_id(0)].x = tempPosq.x;
-                    localData[get_local_id(0)].y = tempPosq.y;
-                    localData[get_local_id(0)].z = tempPosq.z;
-                    localData[get_local_id(0)].q = tempPosq.w;
-                    float2 tempParams = global_params[j];
-                    localData[get_local_id(0)].radius = tempParams.x;
-                    localData[get_local_id(0)].scaledRadius = tempParams.y;
-                }
-                localData[get_local_id(0)].bornSum = 0.0f; // reset the per-tile accumulator for block y's atoms
-#ifdef USE_CUTOFF
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-                bool computeSubset = false;
-                if (flags != 0xFFFFFFFF) {
-                    if (tgx < 2)
-                        exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-                    if (tgx == 0)
-                        exclusionIndex[localGroupIndex] = -1;
-                    for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-                        if (exclusionIndices[i] == y)
-                            exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-                    computeSubset = (exclusionIndex[localGroupIndex] == -1); // subset path only when y was not found in row x's exclusion range
-                }
-                if (computeSubset) {
-                    if (flags == 0) {
-                        // No interactions in this tile.
-                    }
-                    else {
-                        // Compute only a subset of the interactions in this tile.
-                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                            if ((flags&(1<<j)) != 0) {
-                                real4 delta = (real4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0);
-#ifdef USE_PERIODIC
-                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                                tempBuffer[get_local_id(0)] = 0.0f;
-#ifdef USE_CUTOFF
-                                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
-#else
-                                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
-#endif
-                                    real invR = RSQRT(r2);
-                                    real r = RECIP(invR);
-                                    float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
-                                    real rScaledRadiusJ = r+params2.y;
-                                    if (params1.x < rScaledRadiusJ) {
-                                        real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
-                                        real u_ij = RECIP(rScaledRadiusJ);
-                                        real l_ij2 = l_ij*l_ij;
-                                        real u_ij2 = u_ij*u_ij;
-                                        real ratio = LOG(u_ij * RECIP(l_ij));
-                                        bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                                         (params2.y*params2.y*invR)*(l_ij2-u_ij2));
-                                        if (params1.x < params2.y-r)
-                                            bornSum += 2.0f*(RECIP(params1.x)-l_ij);
-                                    }
-                                    real rScaledRadiusI = r+params1.y;
-                                    if (params2.x < rScaledRadiusI) {
-                                        real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
-                                        real u_ij = RECIP(rScaledRadiusI);
-                                        real l_ij2 = l_ij*l_ij;
-                                        real u_ij2 = u_ij*u_ij;
-                                        real ratio = LOG(u_ij * RECIP(l_ij));
-                                        real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                                         (params1.y*params1.y*invR)*(l_ij2-u_ij2));
-                                        if (params2.x < params1.y-r)
-                                            term += 2.0f*(RECIP(params2.x)-l_ij);
-                                        tempBuffer[get_local_id(0)] = term;
-                                    }
-                                }
-                                // Sum the Born-sum contributions to atom j over the lanes of this warp.
-                                if (tgx % 4 == 0) // NOTE(review): no barrier separates the tempBuffer writes above from these cross-lane reads; presumably relies on the lanes of a warp running in lockstep - confirm
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]+tempBuffer[get_local_id(0)+2]+tempBuffer[get_local_id(0)+3];
-                                if (tgx == 0)
-                                    localData[tbx+j].bornSum += tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
-                            }
-                        }
-                    }
-                }
-                else
-#endif
-                {
-                    // Compute the full set of interactions in this tile.
-                    unsigned int tj = tgx; // each lane starts at a different column of the tile
-                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
-#ifdef USE_PERIODIC
-                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                        if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
-#else
-                        if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
-#endif
-                            real invR = RSQRT(r2);
-                            real r = RECIP(invR);
-                            float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
-                            real rScaledRadiusJ = r+params2.y;
-                            if (params1.x < rScaledRadiusJ) {
-                                real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
-                                real u_ij = RECIP(rScaledRadiusJ);
-                                real l_ij2 = l_ij*l_ij;
-                                real u_ij2 = u_ij*u_ij;
-                                real ratio = LOG(u_ij * RECIP(l_ij));
-                                bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                                 (params2.y*params2.y*invR)*(l_ij2-u_ij2));
-                                if (params1.x < params2.y-r)
-                                    bornSum += 2.0f*(RECIP(params1.x)-l_ij);
-                            }
-                            real rScaledRadiusI = r+params1.y;
-                            if (params2.x < rScaledRadiusI) {
-                                real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
-                                real u_ij = RECIP(rScaledRadiusI);
-                                real l_ij2 = l_ij*l_ij;
-                                real u_ij2 = u_ij*u_ij;
-                                real ratio = LOG(u_ij * RECIP(l_ij));
-                                real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                                 (params1.y*params1.y*invR)*(l_ij2-u_ij2));
-                                if (params2.x < params1.y-r)
-                                    term += 2.0f*(RECIP(params2.x)-l_ij);
-                                localData[tbx+tj].bornSum += term;
-                            }
-                        }
-                        tj = (tj + 1) & (TILE_SIZE - 1); // advance with wrap-around, so the lanes of a warp are on distinct columns at every step
-                    }
-                }
-            }
-        }
-        // Write results.  We need to coordinate between warps to make sure no two of them
-        // ever try to write to the same piece of memory at the same time.
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        if (pos < end) {
-            const unsigned int offset = x*TILE_SIZE + tgx;
-            atom_add(&global_bornSum[offset], (long) (bornSum*0x100000000)); // value is scaled by 2^32 and added as a 64-bit integer
-        }
-        if (pos < end && x != y) {
-            const unsigned int offset = y*TILE_SIZE + tgx;
-            atom_add(&global_bornSum[offset], (long) (localData[get_local_id(0)].bornSum*0x100000000));
-        }
-#else
-        int writeX = (pos < end ? x : -1); // -1 means this warp has nothing to write for block x
-        int writeY = (pos < end && x != y ? y : -1); // -1 means no separate block y to write
-        if (tgx == 0)
-            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
-        bool done = false;
-        int doneIndex = 0;
-        int checkIndex = 0;
-        while (true) {
-            // See if any warp still needs to write its data.
-            bool allDone = true;
-            barrier(CLK_LOCAL_MEM_FENCE);
-            while (doneIndex < WARPS_PER_GROUP && allDone) {
-                if (reservedBlocks[doneIndex].x != -1)
-                    allDone = false;
-                else
-                    doneIndex++;
-            }
-            if (allDone)
-                break;
-            if (!done) {
-                // See whether this warp can write its data.  This requires that no previous warp
-                // is trying to write to the same block of the buffer.
-                bool canWrite = (writeX != -1);
-                while (checkIndex < localGroupIndex && canWrite) {
-                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
-                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
-                        canWrite = false;
-                    else
-                        checkIndex++;
-                }
-                if (canWrite) {
-                    // Write the data to global memory, then mark this warp as done.
-                    if (writeX > -1) {
-                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; // each work-group adds into its own PADDED_NUM_ATOMS-long section
-                        global_bornSum[offset] += bornSum;
-                    }
-                    if (writeY > -1) {
-                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                        global_bornSum[offset] += localData[get_local_id(0)].bornSum;
-                    }
-                    done = true;
-                    if (tgx == 0)
-                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
-                }
-            }
-        }
-#endif
-        lasty = y;
-        pos++;
-    } while (pos < end);
-}
-typedef struct { // Per-work-item local-memory record used by computeGBSAForce1.
-    real x, y, z; // copied from posq[].x/.y/.z
-    real q; // copied from posq[].w
-    real fx, fy, fz, fw; // per-tile accumulators; fx/fy/fz are added to forceBuffers, fw to global_bornForce
-    real bornRadius; // copied from global_bornRadii[]
-} AtomData2;
-/**
- * First part of computing the GBSA interaction.
- *
- * Work-items are handled in groups of TILE_SIZE lanes (called "warps" in this code). Each warp
- * walks a contiguous range [pos, end) of tiles identified by block indices (x, y); tile
- * coordinates are obtained the same way as in computeBornSum (from tiles[pos] when USE_CUTOFF
- * is defined and interactionCount[0] <= maxTiles, otherwise computed from pos).
- *
- * For every atom pair that passes the bounds (and, with USE_CUTOFF, cutoff) tests the kernel
- * evaluates
- *     alpha2_ij  = bornRadius1*bornRadius2
- *     D_ij       = r2/(4*alpha2_ij)
- *     tempEnergy = PREFACTOR*q1*q2 / sqrt(r2 + alpha2_ij*exp(-D_ij))
- * and accumulates:
- *  - energy: 0.5*tempEnergy on diagonal tiles, tempEnergy on off-diagonal tiles;
- *  - force.xyz / localData[].fx,fy,fz: delta*dEdR, subtracted for the lane's own atom and added
- *    for the partner atom;
- *  - force.w / localData[].fw: dGpol_dalpha2_ij multiplied by the other atom's Born radius.
- *
- * Results are written in one of two ways:
- *  - SUPPORTS_64_BIT_ATOMICS: values are scaled by 0x100000000, converted to long and added
- *    with atom_add; forceBuffers is addressed as three consecutive PADDED_NUM_ATOMS-long
- *    ranges holding x, y and z.
- *  - otherwise: values are added to the section of forceBuffers / global_bornForce that starts
- *    at get_group_id(0)*PADDED_NUM_ATOMS, and reservedBlocks is used so that two warps of the
- *    same work-group do not write the same block at the same time.
- * The accumulated energy is added to energyBuffer[get_global_id(0)] at the end.
- *
- * @param forceBuffers        force accumulation buffer (long or real4, see above)
- * @param global_bornForce    accumulation buffer for the force.w / fw terms
- * @param energyBuffer        one energy slot per work-item
- * @param posq                per-atom real4; xyz is used as the position, w multiplies PREFACTOR as q
- * @param global_bornRadii    per-atom Born radius
- * @param tiles               (USE_CUTOFF) list of (x, y) tile indices
- * @param interactionCount    (USE_CUTOFF) element 0 is read as the number of tiles
- * @param periodicBoxSize     (USE_CUTOFF) box lengths used to wrap delta when USE_PERIODIC is defined
- * @param invPeriodicBoxSize  (USE_CUTOFF) multiplied with delta before floor() in the same wrapping
- * @param maxTiles            (USE_CUTOFF) if interactionCount[0] exceeds this, the tile list is ignored
- * @param interactionFlags    (USE_CUTOFF) per-tile bitmask; bit j selects column j in the subset path
- * @param numTiles            (no USE_CUTOFF) number of tiles to distribute over the warps
- * @param exclusionIndices    searched for y within the range given by exclusionRowIndices
- * @param exclusionRowIndices entries [x] and [x+1] bound the range of exclusionIndices for row x
- */
-__kernel void computeGBSAForce1(
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict forceBuffers, __global long* restrict global_bornForce,
-#else
-        __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
-#endif
-        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
-#ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
-#else
-        unsigned int numTiles,
-#endif
-        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices) {
-    unsigned int totalWarps = get_global_size(0)/TILE_SIZE; // number of TILE_SIZE-wide groups in the whole launch
-    unsigned int warp = get_global_id(0)/TILE_SIZE; // index of the group this work-item belongs to
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
-    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-#else
-    unsigned int pos = warp*numTiles/totalWarps;
-    unsigned int end = (warp+1)*numTiles/totalWarps;
-#endif
-    real energy = 0.0f; // per-work-item energy accumulator, added to energyBuffer at the end
-    unsigned int lasty = 0xFFFFFFFF; // y of the tile handled in the previous iteration; 0xFFFFFFFF before the first
-    __local AtomData2 localData[FORCE_WORK_GROUP_SIZE];
-    __local real4 tempBuffer[FORCE_WORK_GROUP_SIZE];
-    __local int2 reservedBlocks[WARPS_PER_GROUP];
-    __local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks; // reuses the reservedBlocks storage, two unsigned ints per warp
-    __local int exclusionIndex[WARPS_PER_GROUP];
-    do {
-        // Extract the coordinates of this tile
-        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // lane index within the warp
-        const unsigned int tbx = get_local_id(0) - tgx; // local id of the warp's first lane
-        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE; // index of the warp within the work-group
-        unsigned int x, y;
-        real4 force = 0.0f; // xyz is added to forceBuffers, w to global_bornForce, for atom x*TILE_SIZE+tgx
-        if (pos < end) {
-#ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
-                ushort2 tileIndices = tiles[pos];
-                x = tileIndices.x;
-                y = tileIndices.y;
-            }
-            else
-#endif
-            {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                }
-            }
-            unsigned int atom1 = x*TILE_SIZE + tgx;
-            real4 posq1 = posq[atom1];
-            real bornRadius1 = global_bornRadii[atom1];
-            if (x == y) {
-                // This tile is on the diagonal.
-                localData[get_local_id(0)].x = posq1.x;
-                localData[get_local_id(0)].y = posq1.y;
-                localData[get_local_id(0)].z = posq1.z;
-                localData[get_local_id(0)].q = posq1.w;
-                localData[get_local_id(0)].bornRadius = bornRadius1;
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
-                        real4 posq2 = (real4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
-                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                        if (r2 < CUTOFF_SQUARED) {
-#endif
-                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
-                        real bornRadius2 = localData[tbx+j].bornRadius;
-                        real alpha2_ij = bornRadius1*bornRadius2;
-                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
-                        real expTerm = EXP(-D_ij);
-                        real denominator2 = r2 + alpha2_ij*expTerm;
-                        real denominator = SQRT(denominator2);
-                        real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
-                        real Gpol = tempEnergy*RECIP(denominator2);
-                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
-                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
-                        force.w += dGpol_dalpha2_ij*bornRadius2;
-                        energy += 0.5f*tempEnergy; // half weight on diagonal tiles; the off-diagonal paths add the full tempEnergy
-                        delta.xyz *= dEdR;
-                        force.xyz -= delta.xyz;
-#ifdef USE_CUTOFF
-                        }
-#endif
-                    }
-                }
-            }
-            else {
-                // This is an off-diagonal tile.
-                if (lasty != y) { // reload block y's atoms only when y differs from the previous tile's y
-                    unsigned int j = y*TILE_SIZE + tgx;
-                    real4 tempPosq = posq[j];
-                    localData[get_local_id(0)].x = tempPosq.x;
-                    localData[get_local_id(0)].y = tempPosq.y;
-                    localData[get_local_id(0)].z = tempPosq.z;
-                    localData[get_local_id(0)].q = tempPosq.w;
-                    localData[get_local_id(0)].bornRadius = global_bornRadii[j];
-                }
-                localData[get_local_id(0)].fx = 0.0f; // reset the per-tile accumulators for block y's atoms
-                localData[get_local_id(0)].fy = 0.0f;
-                localData[get_local_id(0)].fz = 0.0f;
-                localData[get_local_id(0)].fw = 0.0f;
-#ifdef USE_CUTOFF
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-                bool computeSubset = false;
-#ifdef USE_APPLE_WORKAROUND
-                computeSubset = (flags == 0); // Workaround for a compiler bug in Apple's OpenCL on Lion
-#else
-                if (flags != 0xFFFFFFFF) {
-                    if (tgx < 2)
-                        exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-                    if (tgx == 0)
-                        exclusionIndex[localGroupIndex] = -1;
-                    for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-                        if (exclusionIndices[i] == y)
-                            exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-                    computeSubset = (exclusionIndex[localGroupIndex] == -1); // subset path only when y was not found in row x's exclusion range
-                }
-#endif
-                if (computeSubset) {
-                    if (flags == 0) {
-                        // No interactions in this tile.
-                    }
-                    else {
-                        // Compute only a subset of the interactions in this tile.
-                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                            if ((flags&(1<<j)) != 0) {
-                                real4 posq2 = (real4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
-                                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                                if (r2 < CUTOFF_SQUARED) {
-#endif
-                                real invR = RSQRT(r2);
-                                real r = RECIP(invR);
-                                real bornRadius2 = localData[tbx+j].bornRadius;
-                                real alpha2_ij = bornRadius1*bornRadius2;
-                                real D_ij = r2*RECIP(4.0f*alpha2_ij);
-                                real expTerm = EXP(-D_ij);
-                                real denominator2 = r2 + alpha2_ij*expTerm;
-                                real denominator = SQRT(denominator2);
-                                real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
-                                real Gpol = tempEnergy*RECIP(denominator2);
-                                real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
-                                real dEdR = Gpol*(1.0f - 0.25f*expTerm);
-#ifdef USE_CUTOFF
-                                if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) { // NOTE(review): the r2 test cannot be true here, the enclosing if already requires r2 < CUTOFF_SQUARED
-#else
-                                if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS) {
-#endif
-                                    dEdR = 0.0f;
-                                    dGpol_dalpha2_ij = 0.0f;
-                                    tempEnergy = 0.0f;
-                                }
-                                energy += tempEnergy;
-                                force.w += dGpol_dalpha2_ij*bornRadius2;
-                                delta.xyz *= dEdR;
-                                force.xyz -= delta.xyz;
-                                tempBuffer[get_local_id(0)] = (real4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1); // this lane's contribution to atom j: xyz force and fw term
-#ifdef USE_CUTOFF
-                                }
-                                else
-                                    tempBuffer[get_local_id(0)] = (real4) 0;
-#endif
-                                // Sum the forces on atom j.
-                                if (tgx % 4 == 0) // NOTE(review): no barrier separates the tempBuffer writes above from these cross-lane reads; presumably relies on the lanes of a warp running in lockstep - confirm
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]+tempBuffer[get_local_id(0)+2]+tempBuffer[get_local_id(0)+3];
-                                if (tgx == 0) {
-                                    real4 sum = tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
-                                    localData[tbx+j].fx += sum.x;
-                                    localData[tbx+j].fy += sum.y;
-                                    localData[tbx+j].fz += sum.z;
-                                    localData[tbx+j].fw += sum.w;
-                                }
-                            }
-                        }
-                    }
-                }
-                else
-#endif
-                {
-                    // Compute the full set of interactions in this tile.
-                    unsigned int tj = tgx; // each lane starts at a different column of the tile
-                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
-                            real4 posq2 = (real4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
-                            real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                            real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                            if (r2 < CUTOFF_SQUARED) {
-#endif
-                            real invR = RSQRT(r2);
-                            real r = RECIP(invR);
-                            real bornRadius2 = localData[tbx+tj].bornRadius;
-                            real alpha2_ij = bornRadius1*bornRadius2;
-                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
-                            real expTerm = EXP(-D_ij);
-                            real denominator2 = r2 + alpha2_ij*expTerm;
-                            real denominator = SQRT(denominator2);
-                            real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
-                            real Gpol = tempEnergy*RECIP(denominator2);
-                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
-                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
-                            force.w += dGpol_dalpha2_ij*bornRadius2;
-                            energy += tempEnergy;
-                            delta.xyz *= dEdR;
-                            force.xyz -= delta.xyz;
-                            localData[tbx+tj].fx += delta.x;
-                            localData[tbx+tj].fy += delta.y;
-                            localData[tbx+tj].fz += delta.z;
-                            localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
-#ifdef USE_CUTOFF
-                            }
-#endif
-                        }
-                        tj = (tj + 1) & (TILE_SIZE - 1); // advance with wrap-around, so the lanes of a warp are on distinct columns at every step
-                    }
-                }
-            }
-        }
-        // Write results.  We need to coordinate between warps to make sure no two of them
-        // ever try to write to the same piece of memory at the same time.
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        if (pos < end) {
-            const unsigned int offset = x*TILE_SIZE + tgx;
-            atom_add(&forceBuffers[offset], (long) (force.x*0x100000000)); // values are scaled by 2^32 and added as 64-bit integers
-            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
-            atom_add(&global_bornForce[offset], (long) (force.w*0x100000000));
-        }
-        if (pos < end && x != y) {
-            const unsigned int offset = y*TILE_SIZE + tgx;
-            atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0x100000000));
-            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
-            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
-            atom_add(&global_bornForce[offset], (long) (localData[get_local_id(0)].fw*0x100000000));
-        }
-#else
-        int writeX = (pos < end ? x : -1); // -1 means this warp has nothing to write for block x
-        int writeY = (pos < end && x != y ? y : -1); // -1 means no separate block y to write
-        if (tgx == 0)
-            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
-        bool done = false;
-        int doneIndex = 0;
-        int checkIndex = 0;
-        while (true) {
-            // See if any warp still needs to write its data.
-            bool allDone = true;
-            barrier(CLK_LOCAL_MEM_FENCE);
-            while (doneIndex < WARPS_PER_GROUP && allDone) {
-                if (reservedBlocks[doneIndex].x != -1)
-                    allDone = false;
-                else
-                    doneIndex++;
-            }
-            if (allDone)
-                break;
-            if (!done) {
-                // See whether this warp can write its data.  This requires that no previous warp
-                // is trying to write to the same block of the buffer.
-                bool canWrite = (writeX != -1);
-                while (checkIndex < localGroupIndex && canWrite) {
-                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
-                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
-                        canWrite = false;
-                    else
-                        checkIndex++;
-                }
-                if (canWrite) {
-                    // Write the data to global memory, then mark this warp as done.
-                    if (writeX > -1) {
-                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; // each work-group adds into its own PADDED_NUM_ATOMS-long section
-                        forceBuffers[offset].xyz += force.xyz;
-                        global_bornForce[offset] += force.w;
-                    }
-                    if (writeY > -1) {
-                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                        forceBuffers[offset] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
-                        global_bornForce[offset] += localData[get_local_id(0)].fw;
-                    }
-                    done = true;
-                    if (tgx == 0)
-                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
-                }
-            }
-        }
-#endif
-        lasty = y;
-        pos++;
-    } while (pos < end);
-    energyBuffer[get_global_id(0)] += energy;
-}
--- a/platforms/opencl/src/kernels/langevin.cl
+++ b/platforms/opencl/src/kernels/langevin.cl
@@ -15,7 +15,7 @@ __kernel void integrateLangevinPart1(__global mixed4* restrict velm, __global co
    while (index < NUM_ATOMS) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
-            mixed sqrtInvMass = sqrt(velocity.w);
+            mixed sqrtInvMass = SQRT(velocity.w);
            velocity.x = vscale*velocity.x + fscale*velocity.w*force[index].x + noisescale*sqrtInvMass*random[randomIndex].x;
            velocity.y = vscale*velocity.y + fscale*velocity.w*force[index].y + noisescale*sqrtInvMass*random[randomIndex].y;
            velocity.z = vscale*velocity.z + fscale*velocity.w*force[index].z + noisescale*sqrtInvMass*random[randomIndex].z;
@@ -96,8 +96,8 @@ __kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed ta
    if (get_global_id(0) == 0) {
        // Select the new step size.
-        mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
+        mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
-        mixed newStepSize = sqrt(errorTol/totalError);
+        mixed newStepSize = SQRT(errorTol/totalError);
        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)
            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
@@ -109,9 +109,9 @@ __kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed ta
        // Recalculate the integration parameters.
-        mixed vscale = exp(-newStepSize/tau);
+        mixed vscale = EXP(-newStepSize/tau);
        mixed fscale = (1-vscale)*tau;
-        mixed noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
+        mixed noisescale = SQRT(2*kT/tau)*SQRT(0.5f*(1-vscale*vscale)*tau);
        params[VelScale] = vscale;
        params[ForceScale] = fscale;
        params[NoiseScale] = noisescale;

--- a/platforms/opencl/src/kernels/nonbonded.cl
+++ b/platforms/opencl/src/kernels/nonbonded.cl
+#ifdef SUPPORTS_64_BIT_ATOMICS
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#endif
+#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
+typedef struct {
+    real x, y, z; // computeNonbonded fills these from the xyz components of a posq entry
+    real q; // filled from the w component of the same posq entry
+    real fx, fy, fz; // force accumulated for this entry while a tile is processed; reset to 0 when an off-diagonal tile is loaded
+    ATOM_PARAMETER_DATA
+#ifndef PARAMETER_SIZE_IS_EVEN
+    real padding; // filler member, not referenced by name in the kernel below. NOTE(review): presumably evens out the struct size - confirm
+#endif
+} AtomData; // computeNonbonded keeps one element per work-item in its __local localData array
+/**
+ * Compute nonbonded interactions.
+ *
+ * Work is divided among groups of TILE_SIZE consecutive work-items (called "warps" in the code below).
+ * A tile pairs the TILE_SIZE atoms of block x (one "atom1" per work-item, held in private variables)
+ * with TILE_SIZE "atom2" entries staged in the __local array localData.  Two passes are made:
+ *
+ *   1. Tiles listed in exclusionTiles.  When USE_EXCLUSIONS is defined, bit flags read from the
+ *      exclusions array decide, together with the NUM_ATOMS bound, which pairs are flagged as excluded.
+ *   2. All remaining tiles.  When USE_CUTOFF is defined and interactionCount[0] <= maxTiles they are
+ *      read from tiles/interactingAtoms; otherwise tile coordinates are derived from the running tile
+ *      index, and tiles already handled in pass 1 are skipped.
+ *
+ * The per-pair energy and force expressions and the per-atom parameter handling are supplied by
+ * macros defined outside this part of the file.
+ *
+ * Outputs:
+ *   forceBuffers - with SUPPORTS_64_BIT_ATOMICS: longs updated through atom_add, holding each force
+ *                  component multiplied by 0x100000000, with x, y and z stored PADDED_NUM_ATOMS
+ *                  elements apart.  Otherwise: real4 values updated non-atomically at index
+ *                  atom + warp*PADDED_NUM_ATOMS.
+ *   energyBuffer - each work-item adds its accumulated energy to energyBuffer[get_global_id(0)].
+ */
+__kernel void computeNonbonded(
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* restrict forceBuffers,
+#else
+        __global real4* restrict forceBuffers,
+#endif
+        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
+        __global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
+#ifdef USE_CUTOFF
+        , __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms
+#endif
+        PARAMETER_ARGUMENTS) {
+    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE; // number of TILE_SIZE-wide groups in the whole launch
+    const unsigned int warp = get_global_id(0)/TILE_SIZE; // index of the group this work-item belongs to
+    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // lane within the group; the mask assumes TILE_SIZE is a power of two
+    const unsigned int tbx = get_local_id(0) - tgx; // local index of lane 0 of this group, used as the base into localData
+    real energy = 0;
+    __local AtomData localData[FORCE_WORK_GROUP_SIZE];
+    // First loop: process tiles that contain exclusions.
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; // this group's share of the exclusion-tile range
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
+        real4 force = 0; // force on this lane's atom1, accumulated over the tile
+        unsigned int atom1 = x*TILE_SIZE + tgx;
+        real4 posq1 = posq[atom1];
+        LOAD_ATOM1_PARAMETERS
+#ifdef USE_EXCLUSIONS
+        unsigned int excl = exclusions[pos*TILE_SIZE+tgx]; // one flag word per atom1 of this tile; a set bit means the pair is NOT excluded (see isExcluded below)
+#endif
+        const bool hasExclusions = true; // not read in the code visible here; presumably consumed by the interaction macro - confirm
+        if (x == y) {
+            // This tile is on the diagonal.
+            const unsigned int localAtomIndex = get_local_id(0);
+            localData[localAtomIndex].x = posq1.x; // atom1 and atom2 come from the same block, so reuse the data already loaded
+            localData[localAtomIndex].y = posq1.y;
+            localData[localAtomIndex].z = posq1.z;
+            localData[localAtomIndex].q = posq1.w;
+            LOAD_LOCAL_PARAMETERS_FROM_1
+            SYNC_WARPS;
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                int atom2 = tbx+j; // index into localData at this point
+                real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; // subtract the nearest whole number of box lengths per axis
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                real invR = RSQRT(r2);
+                real r = RECIP(invR);
+                LOAD_ATOM2_PARAMETERS
+                atom2 = y*TILE_SIZE+j; // from here on atom2 is the global atom index
+#ifdef USE_SYMMETRIC
+                real dEdR = 0;
+#else
+                real4 dEdR1 = (real4) 0;
+                real4 dEdR2 = (real4) 0;
+#endif
+#ifdef USE_EXCLUSIONS
+                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1)); // out-of-range atoms or a cleared low flag bit
+#endif
+                real tempEnergy = 0;
+                COMPUTE_INTERACTION
+                energy += 0.5f*tempEnergy; // halved: on a diagonal tile each pair is visited once from each of its two atoms
+#ifdef USE_SYMMETRIC
+                force.xyz -= delta.xyz*dEdR;
+#else
+                force.xyz -= dEdR1.xyz;
+#endif
+#ifdef USE_EXCLUSIONS
+                excl >>= 1; // move the flag for the next atom2 into bit 0
+#endif
+                SYNC_WARPS;
+            }
+        }
+        else {
+            // This is an off-diagonal tile.
+            const unsigned int localAtomIndex = get_local_id(0);
+            unsigned int j = y*TILE_SIZE + tgx; // global index of the atom2 this lane stages into local memory
+            real4 tempPosq = posq[j];
+            localData[localAtomIndex].x = tempPosq.x;
+            localData[localAtomIndex].y = tempPosq.y;
+            localData[localAtomIndex].z = tempPosq.z;
+            localData[localAtomIndex].q = tempPosq.w;
+            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+            localData[localAtomIndex].fx = 0;
+            localData[localAtomIndex].fy = 0;
+            localData[localAtomIndex].fz = 0;
+            SYNC_WARPS;
+#ifdef USE_EXCLUSIONS
+            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); // rotate right by tgx so bit 0 lines up with the first atom2 this lane visits (tj starts at tgx)
+#endif
+            unsigned int tj = tgx; // each lane starts at a different atom2 and wraps around, so lanes hold distinct tj in any given iteration
+            for (j = 0; j < TILE_SIZE; j++) {
+                int atom2 = tbx+tj; // index into localData at this point
+                real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; // subtract the nearest whole number of box lengths per axis
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                if (r2 < CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = RECIP(invR);
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+tj; // from here on atom2 is the global atom index
+#ifdef USE_SYMMETRIC
+                    real dEdR = 0;
+#else
+                    real4 dEdR1 = (real4) 0;
+                    real4 dEdR2 = (real4) 0;
+#endif
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1)); // out-of-range atoms or a cleared low flag bit
+#endif
+                    real tempEnergy = 0;
+                    COMPUTE_INTERACTION
+                    energy += tempEnergy; // full weight: an off-diagonal pair is visited only once
+#ifdef USE_SYMMETRIC
+                    delta.xyz *= dEdR;
+                    force.xyz -= delta.xyz;
+                    localData[tbx+tj].fx += delta.x; // opposite contribution for atom2, kept in local memory until the tile is finished
+                    localData[tbx+tj].fy += delta.y;
+                    localData[tbx+tj].fz += delta.z;
+#else
+                    force.xyz -= dEdR1.xyz;
+                    localData[tbx+tj].fx += dEdR2.x; // atom2 contribution, kept in local memory until the tile is finished
+                    localData[tbx+tj].fy += dEdR2.y;
+                    localData[tbx+tj].fz += dEdR2.z;
+#endif
+#ifdef USE_CUTOFF
+                }
+#endif
+#ifdef USE_EXCLUSIONS
+                excl >>= 1; // move the flag for the next atom2 into bit 0
+#endif
+                tj = (tj + 1) & (TILE_SIZE - 1); // advance to the next atom2, wrapping at TILE_SIZE
+                SYNC_WARPS;
+            }
+        }
+        // Write results.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        unsigned int offset = x*TILE_SIZE + tgx;
+        atom_add(&forceBuffers[offset], (long) (force.x*0x100000000)); // scale by 2^32 and truncate to a 64-bit integer before the atomic add
+        atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+        atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+        if (x != y) { // diagonal tiles accumulate nothing in localData's force fields
+            offset = y*TILE_SIZE + tgx;
+            atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0x100000000));
+            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
+            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
+        }
+#else
+        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS; // non-atomic path: every group writes into its own PADDED_NUM_ATOMS-sized region
+        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
+        forceBuffers[offset1].xyz += force.xyz;
+        if (x != y)
+            forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
+#endif
+    }
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps); // if the count exceeds maxTiles, split the startTileIndex/numTileIndices range instead of the tile list
+    int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
+#else
+    const unsigned int numTiles = numTileIndices;
+    int pos = startTileIndex+warp*numTiles/totalWarps;
+    int end = startTileIndex+(warp+1)*numTiles/totalWarps;
+#endif
+    int skipBase = 0; // offset into exclusionTiles of the next batch to cache in skipTiles
+    int currentSkipIndex = tbx; // cursor into this group's slice of skipTiles
+    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
+    __local int skipTiles[FORCE_WORK_GROUP_SIZE];
+    skipTiles[get_local_id(0)] = -1; // -1 is below any non-negative pos, so the first check of the refill loop below loads a batch
+    while (pos < end) {
+        const bool hasExclusions = false; // not read in the code visible here; presumably consumed by the interaction macro - confirm
+        real4 force = 0; // force on this lane's atom1, accumulated over the tile
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y; // here the y component is used as a flag; y itself stays unassigned and atom2 indices come from interactingAtoms
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); // recover the row of the triangular tile enumeration from the linear index pos
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+            // Skip over tiles that have exclusions, since they were already processed.
+            SYNC_WARPS;
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) { // the last cached entry is already behind pos, so fetch the next TILE_SIZE exclusion tiles
+                SYNC_WARPS;
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; // linear index of that tile, the inverse of the x/y formulas above
+                }
+                else
+                    skipTiles[get_local_id(0)] = end; // past the list: store end so the entry is never below pos while the loop runs
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
+                SYNC_WARPS;
+            }
+            while (skipTiles[currentSkipIndex] < pos) // advance the cursor to the first cached index that is not below pos
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos); // a match means the first loop already handled this tile
+        }
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+            const unsigned int localAtomIndex = get_local_id(0);
+#ifdef USE_CUTOFF
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx); // atom2 index: from the interactingAtoms list, or block y when enumerating
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
+#endif
+            atomIndices[get_local_id(0)] = j; // remembered so any lane can map a localData slot back to a global atom index
+            if (j < PADDED_NUM_ATOMS) { // entries at or beyond PADDED_NUM_ATOMS are not loaded; localData keeps its previous contents for that lane
+                real4 tempPosq = posq[j];
+                localData[localAtomIndex].x = tempPosq.x;
+                localData[localAtomIndex].y = tempPosq.y;
+                localData[localAtomIndex].z = tempPosq.z;
+                localData[localAtomIndex].q = tempPosq.w;
+                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                localData[localAtomIndex].fx = 0;
+                localData[localAtomIndex].fy = 0;
+                localData[localAtomIndex].fz = 0;
+            }
+            SYNC_WARPS;
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; // move atom1 to the periodic image nearest the center of block x
+                localData[localAtomIndex].x -= floor((localData[localAtomIndex].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; // same translation for the staged atom2
+                localData[localAtomIndex].y -= floor((localData[localAtomIndex].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                localData[localAtomIndex].z -= floor((localData[localAtomIndex].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                SYNC_WARPS;
+                unsigned int tj = tgx; // each lane starts at a different atom2 and wraps around
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj; // index into localData at this point
+                    real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); // no per-pair periodic wrap on this path
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    if (r2 < CUTOFF_SQUARED) {
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj]; // from here on atom2 is the global atom index
+#ifdef USE_SYMMETRIC
+                        real dEdR = 0;
+#else
+                        real4 dEdR1 = (real4) 0;
+                        real4 dEdR2 = (real4) 0;
+#endif
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS); // only out-of-range atoms; no flag bits are consulted in this loop
+#endif
+                        real tempEnergy = 0;
+                        COMPUTE_INTERACTION
+                        energy += tempEnergy;
+#ifdef USE_SYMMETRIC
+                        delta.xyz *= dEdR;
+                        force.xyz -= delta.xyz;
+                        localData[tbx+tj].fx += delta.x; // opposite contribution for atom2, kept in local memory until the tile is finished
+                        localData[tbx+tj].fy += delta.y;
+                        localData[tbx+tj].fz += delta.z;
+#else
+                        force.xyz -= dEdR1.xyz;
+                        localData[tbx+tj].fx += dEdR2.x; // atom2 contribution, kept in local memory until the tile is finished
+                        localData[tbx+tj].fy += dEdR2.y;
+                        localData[tbx+tj].fz += dEdR2.z;
+#endif
+                    }
+                    tj = (tj + 1) & (TILE_SIZE - 1); // advance to the next atom2, wrapping at TILE_SIZE
+                    SYNC_WARPS;
+                }
+            }
+            else
+#endif
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.
+                unsigned int tj = tgx; // each lane starts at a different atom2 and wraps around
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj; // index into localData at this point
+                    real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; // subtract the nearest whole number of box lengths per axis
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj]; // from here on atom2 is the global atom index
+#ifdef USE_SYMMETRIC
+                        real dEdR = 0;
+#else
+                        real4 dEdR1 = (real4) 0;
+                        real4 dEdR2 = (real4) 0;
+#endif
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS); // only out-of-range atoms; no flag bits are consulted in this loop
+#endif
+                        real tempEnergy = 0;
+                        COMPUTE_INTERACTION
+                        energy += tempEnergy;
+#ifdef USE_SYMMETRIC
+                        delta.xyz *= dEdR;
+                        force.xyz -= delta.xyz;
+                        localData[tbx+tj].fx += delta.x; // opposite contribution for atom2, kept in local memory until the tile is finished
+                        localData[tbx+tj].fy += delta.y;
+                        localData[tbx+tj].fz += delta.z;
+#else
+                        force.xyz -= dEdR1.xyz;
+                        localData[tbx+tj].fx += dEdR2.x; // atom2 contribution, kept in local memory until the tile is finished
+                        localData[tbx+tj].fy += dEdR2.y;
+                        localData[tbx+tj].fz += dEdR2.z;
+#endif
+#ifdef USE_CUTOFF
+                    }
+#endif
+                    tj = (tj + 1) & (TILE_SIZE - 1); // advance to the next atom2, wrapping at TILE_SIZE
+                    SYNC_WARPS;
+                }
+            }
+            // Write results.
+#ifdef USE_CUTOFF
+            unsigned int atom2 = atomIndices[get_local_id(0)]; // global index of the atom2 this lane staged
+#else
+            unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+#ifdef SUPPORTS_64_BIT_ATOMICS
+            atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); // scale by 2^32 and truncate to a 64-bit integer before the atomic add
+            atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+            atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+            if (atom2 < PADDED_NUM_ATOMS) { // same bound as the load above: lanes that staged nothing write nothing
+                atom_add(&forceBuffers[atom2], (long) (localData[get_local_id(0)].fx*0x100000000));
+                atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
+                atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
+            }
+#else
+            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS; // non-atomic path: every group writes into its own PADDED_NUM_ATOMS-sized region
+            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
+            forceBuffers[offset1].xyz += force.xyz;
+            if (atom2 < PADDED_NUM_ATOMS) // same bound as the load above: lanes that staged nothing write nothing
+                forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
+#endif
+        }
+        pos++;
+    }
+    energyBuffer[get_global_id(0)] += energy; // one accumulator slot per work-item
+}
--- a/platforms/opencl/src/kernels/nonbonded_cpu.cl
+++ b/platforms/opencl/src/kernels/nonbonded_cpu.cl
-#define TILE_SIZE 32
+#ifdef SUPPORTS_64_BIT_ATOMICS
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#endif
 typedef struct {
    real x, y, z;
@@ -11,89 +13,54 @@ typedef struct {
 * Compute nonbonded interactions.
 */
-__kernel void computeNonbonded(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
+__kernel void computeNonbonded(
-        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
+#ifdef SUPPORTS_64_BIT_ATOMICS
-        unsigned int startTileIndex, unsigned int endTileIndex,
+        __global long* restrict forceBuffers,
-#ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
 #else
-        unsigned int numTiles
+        __global real4* restrict forceBuffers,
 #endif
-        PARAMETER_ARGUMENTS) {
+        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
+        __global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
 #ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
+        , __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms
-    unsigned int pos = (numTiles > maxTiles ? startTileIndex+get_group_id(0)*(endTileIndex-startTileIndex)/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0));
-    unsigned int end = (numTiles > maxTiles ? startTileIndex+(get_group_id(0)+1)*(endTileIndex-startTileIndex)/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0));
-#else
-    unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
-    unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
 #endif
+        PARAMETER_ARGUMENTS) {
    real energy = 0;
-    unsigned int lasty = 0xFFFFFFFF;
    __local AtomData localData[TILE_SIZE];
-    while (pos < end) {
+    // First loop: process tiles that contain exclusions.
-        // Extract the coordinates of this tile
-        unsigned int x, y;
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
-#ifdef USE_CUTOFF
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
-        if (numTiles <= maxTiles) {
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-            ushort2 tileIndices = tiles[pos];
+        const ushort2 tileIndices = exclusionTiles[pos];
-            x = tileIndices.x;
+        const unsigned int x = tileIndices.x;
-            y = tileIndices.y;
+        const unsigned int y = tileIndices.y;
-        }
-        else
-#endif
-        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
-        }
-        // Locate the exclusion data for this tile.
-#ifdef USE_EXCLUSIONS
+        // Load the data for this tile.
-        unsigned int exclusionStart = exclusionRowIndices[x];
-        unsigned int exclusionEnd = exclusionRowIndices[x+1];
-        int exclusionIndex = -1;
-        for (int i = exclusionStart; i < exclusionEnd; i++)
-            if (exclusionIndices[i] == y) {
-                exclusionIndex = i*TILE_SIZE;
-                break;
-            }
-        bool hasExclusions = (exclusionIndex > -1);
-#endif
-        // Load the data for this tile if we don't already have it cached.
-        if (lasty != y) {
+        for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
-            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
+            unsigned int j = y*TILE_SIZE + localAtomIndex;
-                unsigned int j = y*TILE_SIZE + localAtomIndex;
+            real4 tempPosq = posq[j];
-                real4 tempPosq = posq[j];
+            localData[localAtomIndex].x = tempPosq.x;
-                localData[localAtomIndex].x = tempPosq.x;
+            localData[localAtomIndex].y = tempPosq.y;
-                localData[localAtomIndex].y = tempPosq.y;
+            localData[localAtomIndex].z = tempPosq.z;
-                localData[localAtomIndex].z = tempPosq.z;
+            localData[localAtomIndex].q = tempPosq.w;
-                localData[localAtomIndex].q = tempPosq.w;
+            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-            }
        }
+        const bool hasExclusions = true;
        if (x == y) {
            // This tile is on the diagonal.
            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
 #ifdef USE_EXCLUSIONS
-                unsigned int excl = exclusions[exclusionIndex+tgx];
+                unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
 #endif
                unsigned int atom1 = x*TILE_SIZE+tgx;
                real4 force = 0;
                real4 posq1 = posq[atom1];
                LOAD_ATOM1_PARAMETERS
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1);
-#endif
                    real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
@@ -103,35 +70,46 @@ __kernel void computeNonbonded(__global real4* restrict forceBuffers, __global r
 #ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
 #endif
-                    real invR = RSQRT(r2);
+                        real invR = RSQRT(r2);
-                    real r = RECIP(invR);
+                        real r = RECIP(invR);
-                    unsigned int atom2 = j;
+                        unsigned int atom2 = j;
-                    LOAD_ATOM2_PARAMETERS
+                        LOAD_ATOM2_PARAMETERS
-                    atom2 = y*TILE_SIZE+j;
+                        atom2 = y*TILE_SIZE+j;
 #ifdef USE_SYMMETRIC
-                    real dEdR = 0;
+                        real dEdR = 0;
 #else
-                    real4 dEdR1 = (real4) 0;
+                        real4 dEdR1 = (real4) 0;
-                    real4 dEdR2 = (real4) 0;
+                        real4 dEdR2 = (real4) 0;
 #endif
-                    real tempEnergy = 0;
+#ifdef USE_EXCLUSIONS
-                    COMPUTE_INTERACTION
+                        bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
-                    energy += 0.5f*tempEnergy;
+#endif
+                        real tempEnergy = 0;
+                        COMPUTE_INTERACTION
+                        energy += 0.5f*tempEnergy;
 #ifdef USE_SYMMETRIC
-                    force.xyz -= delta.xyz*dEdR;
+                        force.xyz -= delta.xyz*dEdR;
 #else
-                    force.xyz -= dEdR1.xyz;
+                        force.xyz -= dEdR1.xyz;
 #endif
 #ifdef USE_CUTOFF
                    }
 #endif
+#ifdef USE_EXCLUSIONS
                    excl >>= 1;
+#endif
                }
                // Write results.
-                unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
+                atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+                atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+#else
+                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
+#endif
            }
        }
        else {
@@ -142,82 +120,244 @@ __kernel void computeNonbonded(__global real4* restrict forceBuffers, __global r
                localData[tgx].fy = 0;
                localData[tgx].fz = 0;
            }
+            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                unsigned int atom1 = x*TILE_SIZE+tgx;
+                real4 force = 0;
+                real4 posq1 = posq[atom1];
+                LOAD_ATOM1_PARAMETERS
+#ifdef USE_EXCLUSIONS
+                unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
+#endif
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                    real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                    real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
-            unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
+                    if (r2 < CUTOFF_SQUARED) {
-            unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF);
+#endif
-            if (!hasExclusions && (flags1 != 0xFFFFFFFF || flags2 != 0xFFFFFFFF)) {
+                        real invR = RSQRT(r2);
-                // Compute only a subset of the interactions in this tile.
+                        real r = RECIP(invR);
+                        unsigned int atom2 = j;
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = y*TILE_SIZE+j;
+#ifdef USE_SYMMETRIC
+                        real dEdR = 0;
+#else
+                        real4 dEdR1 = (real4) 0;
+                        real4 dEdR2 = (real4) 0;
+#endif
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
+#endif
+                        real tempEnergy = 0;
+                        COMPUTE_INTERACTION
+                        energy += tempEnergy;
+#ifdef USE_SYMMETRIC
+                        delta.xyz *= dEdR;
+                        force.xyz -= delta.xyz;
+                        localData[j].fx += delta.x;
+                        localData[j].fy += delta.y;
+                        localData[j].fz += delta.z;
+#else
+                        force.xyz -= dEdR1.xyz;
+                        localData[j].fx += dEdR2.x;
+                        localData[j].fy += dEdR2.y;
+                        localData[j].fz += dEdR2.z;
+#endif
+#ifdef USE_CUTOFF
+                    }
+#endif
+#ifdef USE_EXCLUSIONS
+                    excl >>= 1;
+#endif
+                }
-                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+               // Write results for atom1.
-                    if ((flags2&(1<<tgx)) != 0) {
-                        unsigned int atom1 = x*TILE_SIZE+tgx;
+#ifdef SUPPORTS_64_BIT_ATOMICS
-                        real4 force = 0;
+                atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
-                        real4 posq1 = posq[atom1];
+                atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-                        LOAD_ATOM1_PARAMETERS
+                atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
-                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#else
-                            if ((flags1&(1<<j)) != 0) {
+                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                                bool isExcluded = false;
+                forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
-                                real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+#endif
-                                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+            }
+            // Write results.
+            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                unsigned int offset = y*TILE_SIZE + tgx;
+                atom_add(&forceBuffers[offset], (long) (localData[tgx].fx*0x100000000));
+                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000));
+                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000));
+#else
+                unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                real4 f = forceBuffers[offset];
+                f.x += localData[tgx].fx;
+                f.y += localData[tgx].fy;
+                f.z += localData[tgx].fz;
+                forceBuffers[offset] = f;
+#endif
+            }
+        }
+    }
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+#ifdef USE_CUTOFF
+    const unsigned int numTiles = interactionCount[0];
+    int pos = (numTiles > maxTiles ? startTileIndex+get_group_id(0)*numTileIndices/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0));
+    int end = (numTiles > maxTiles ? startTileIndex+(get_group_id(0)+1)*numTileIndices/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0));
+#else
+    const unsigned int numTiles = numTileIndices;
+    int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
+    int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
+#endif
+    int nextToSkip = -1;
+    int currentSkipIndex = 0;
+    __local int atomIndices[TILE_SIZE];
+    while (pos < end) {
+        const bool hasExclusions = false;
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+            // Skip over tiles that have exclusions, since they were already processed.
+            while (nextToSkip < pos) {
+                if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[currentSkipIndex++];
+                    nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                }
+                else
+                    nextToSkip = end;
+            }
+            includeTile = (nextToSkip != pos);
+        }
+        if (includeTile) {
+            // Load the data for this tile.
+            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
+#ifdef USE_CUTOFF
+                unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+localAtomIndex] : y*TILE_SIZE+localAtomIndex);
+#else
+                unsigned int j = y*TILE_SIZE+localAtomIndex;
+#endif
+                atomIndices[localAtomIndex] = j;
+                if (j < PADDED_NUM_ATOMS) {
+                    real4 tempPosq = posq[j];
+                    localData[localAtomIndex].x = tempPosq.x;
+                    localData[localAtomIndex].y = tempPosq.y;
+                    localData[localAtomIndex].z = tempPosq.z;
+                    localData[localAtomIndex].q = tempPosq.w;
+                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                    localData[localAtomIndex].fx = 0;
+                    localData[localAtomIndex].fy = 0;
+                    localData[localAtomIndex].fz = 0;
+                }
+            }
 #ifdef USE_PERIODIC
-                                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+            if (singlePeriodicCopy) {
-#endif
+                // The box is small enough that we can just translate all the atoms into a single periodic
-                                real r2 = dot(delta.xyz, delta.xyz);
+                // box, then skip having to apply periodic boundary conditions later.
-                                if (r2 < CUTOFF_SQUARED) {
-                                    real invR = RSQRT(r2);
+                real4 blockCenterX = blockCenter[x];
-                                    real r = RECIP(invR);
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
-                                    unsigned int atom2 = j;
+                    localData[tgx].x -= floor((localData[tgx].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                                    LOAD_ATOM2_PARAMETERS
+                    localData[tgx].y -= floor((localData[tgx].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                                    atom2 = y*TILE_SIZE+j;
+                    localData[tgx].z -= floor((localData[tgx].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                }
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                    unsigned int atom1 = x*TILE_SIZE+tgx;
+                    real4 force = 0;
+                    real4 posq1 = posq[atom1];
+                    posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                    LOAD_ATOM1_PARAMETERS
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real r2 = dot(delta.xyz, delta.xyz);
+                        if (r2 < CUTOFF_SQUARED) {
+                            real invR = RSQRT(r2);
+                            real r = RECIP(invR);
+                            unsigned int atom2 = j;
+                            LOAD_ATOM2_PARAMETERS
+                            atom2 = atomIndices[j];
 #ifdef USE_SYMMETRIC
-                                    real dEdR = 0;
+                            real dEdR = 0;
 #else
-                                    real4 dEdR1 = (real4) 0;
+                            real4 dEdR1 = (real4) 0;
-                                    real4 dEdR2 = (real4) 0;
+                            real4 dEdR2 = (real4) 0;
+#endif
+#ifdef USE_EXCLUSIONS
+                            bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
 #endif
-                                    real tempEnergy = 0;
+                            real tempEnergy = 0;
-                                    COMPUTE_INTERACTION
+                            COMPUTE_INTERACTION
-                                    energy += tempEnergy;
+                            energy += tempEnergy;
 #ifdef USE_SYMMETRIC
-                                    delta.xyz *= dEdR;
+                            delta.xyz *= dEdR;
-                                    force.xyz -= delta.xyz;
+                            force.xyz -= delta.xyz;
-                                    localData[j].fx += delta.x;
+                            localData[j].fx += delta.x;
-                                    localData[j].fy += delta.y;
+                            localData[j].fy += delta.y;
-                                    localData[j].fz += delta.z;
+                            localData[j].fz += delta.z;
 #else
-                                    force.xyz -= dEdR1.xyz;
+                            force.xyz -= dEdR1.xyz;
-                                    localData[j].fx += dEdR2.x;
+                            localData[j].fx += dEdR2.x;
-                                    localData[j].fy += dEdR2.y;
+                            localData[j].fy += dEdR2.y;
-                                    localData[j].fz += dEdR2.z;
+                            localData[j].fz += dEdR2.z;
 #endif
-                                }
-                            }
                        }
+                    }
-                        // Write results for atom1.
+                   // Write results for atom1.
-                        unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+#ifdef SUPPORTS_64_BIT_ATOMICS
-                        forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
+                    atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
-                    }
+                    atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+                    atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+#else
+                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
+#endif
                }
            }
            else
 #endif
            {
-                // Compute the full set of interactions in this tile.
+                // We need to apply periodic boundary conditions separately for each interaction.
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real4 force = 0;
                    real4 posq1 = posq[atom1];
                    LOAD_ATOM1_PARAMETERS
-#ifdef USE_EXCLUSIONS
-                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
-#endif
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                        bool isExcluded = !(excl & 0x1);
-#endif
                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
@@ -227,59 +367,77 @@ __kernel void computeNonbonded(__global real4* restrict forceBuffers, __global r
 #ifdef USE_CUTOFF
                        if (r2 < CUTOFF_SQUARED) {
 #endif
-                        real invR = RSQRT(r2);
+                            real invR = RSQRT(r2);
-                        real r = RECIP(invR);
+                            real r = RECIP(invR);
-                        unsigned int atom2 = j;
+                            unsigned int atom2 = j;
-                        LOAD_ATOM2_PARAMETERS
+                            LOAD_ATOM2_PARAMETERS
-                        atom2 = y*TILE_SIZE+j;
+                            atom2 = atomIndices[j];
 #ifdef USE_SYMMETRIC
-                        real dEdR = 0;
+                            real dEdR = 0;
 #else
-                        real4 dEdR1 = (real4) 0;
+                            real4 dEdR1 = (real4) 0;
-                        real4 dEdR2 = (real4) 0;
+                            real4 dEdR2 = (real4) 0;
 #endif
-                        real tempEnergy = 0;
+#ifdef USE_EXCLUSIONS
-                        COMPUTE_INTERACTION
+                            bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
-                        energy += tempEnergy;
+#endif
+                            real tempEnergy = 0;
+                            COMPUTE_INTERACTION
+                            energy += tempEnergy;
 #ifdef USE_SYMMETRIC
-                        delta.xyz *= dEdR;
+                            delta.xyz *= dEdR;
-                        force.xyz -= delta.xyz;
+                            force.xyz -= delta.xyz;
-                        localData[j].fx += delta.x;
+                            localData[j].fx += delta.x;
-                        localData[j].fy += delta.y;
+                            localData[j].fy += delta.y;
-                        localData[j].fz += delta.z;
+                            localData[j].fz += delta.z;
 #else
-                        force.xyz -= dEdR1.xyz;
+                            force.xyz -= dEdR1.xyz;
-                        localData[j].fx += dEdR2.x;
+                            localData[j].fx += dEdR2.x;
-                        localData[j].fy += dEdR2.y;
+                            localData[j].fy += dEdR2.y;
-                        localData[j].fz += dEdR2.z;
+                            localData[j].fz += dEdR2.z;
 #endif
 #ifdef USE_CUTOFF
                        }
-#endif
-#ifdef USE_EXCLUSIONS
-                        excl >>= 1;
 #endif
                    }
-                   // Write results for atom1.
+                    // Write results for atom1.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                    atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
+                    atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+                    atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+#else
                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                    forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
+#endif
                }
            }
            // Write results.
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
-                unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+#ifdef USE_CUTOFF
-                real4 f = forceBuffers[offset];
+                unsigned int atom2 = atomIndices[tgx];
-                f.x += localData[tgx].fx;
+#else
-                f.y += localData[tgx].fy;
+                unsigned int atom2 = y*TILE_SIZE + tgx;
-                f.z += localData[tgx].fz;
+#endif
-                forceBuffers[offset] = f;
+                if (atom2 < PADDED_NUM_ATOMS) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                    atom_add(&forceBuffers[atom2], (long) (localData[tgx].fx*0x100000000));
+                    atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000));
+                    atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000));
+#else
+                    unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    real4 f = forceBuffers[offset];
+                    f.x += localData[tgx].fx;
+                    f.y += localData[tgx].fy;
+                    f.z += localData[tgx].fz;
+                    forceBuffers[offset] = f;
+#endif
+                }
            }
        }
-        lasty = y;
        pos++;
    }
    energyBuffer[get_global_id(0)] += energy;

--- a/platforms/opencl/src/kernels/nonbonded_default.cl
+++ b/platforms/opencl/src/kernels/nonbonded_default.cl
-#ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#endif
-#define TILE_SIZE 32
-// Three packed real components. float3 is not used because OpenCL aligns it
-// to 4 DWORDs, so every array element would carry one DWORD of padding; that
-// wastes space and causes LDS bank conflicts because the element stride is
-// then no longer an odd number of DWORDs.
-typedef struct {
-    real x, y, z; // computeNonbonded accumulates force components for block-y atoms here (localForce[])
-} UnalignedReal3;
-typedef struct {
-    real x, y, z; // filled from posq.x/.y/.z when computeNonbonded loads a tile
-    real q; // filled from posq.w
-    real fx, fy, fz; // scratch: work-items with local id >= TILE_SIZE store their force here for the others to add
-    ATOM_PARAMETER_DATA // extra per-atom fields injected by a macro that is not defined in this listing
-#ifndef PARAMETER_SIZE_IS_EVEN
-    real padding; // present only when PARAMETER_SIZE_IS_EVEN is not defined
-#endif
-} AtomData;
-/**
- * Compute nonbonded interactions.
- *
- * Each work-group processes a contiguous range [pos, end) of tiles, where a tile pairs
- * the TILE_SIZE atoms of block x with the TILE_SIZE atoms of block y. Within a tile,
- * every work-item owns atom x*TILE_SIZE+tgx and visits TILE_SIZE/2 atoms of block y;
- * work-items with local id >= TILE_SIZE start at a different offset than the others, and
- * their partial results are combined through local memory before being written out.
- * NOTE(review): the localForce[tgx+TILE_SIZE] reads suggest FORCE_WORK_GROUP_SIZE is
- * expected to be 2*TILE_SIZE - confirm against the host code.
- *
- * Forces are accumulated into forceBuffers either with atom_add on fixed-point longs
- * (value scaled by 0x100000000) when SUPPORTS_64_BIT_ATOMICS is defined, or with a
- * read-modify-write of real4 entries otherwise. The work-item's energy total is added
- * to energyBuffer[get_global_id(0)].
- *
- * The LOAD_*, COMPUTE_INTERACTION and PARAMETER_ARGUMENTS macros are not defined in
- * this listing; COMPUTE_INTERACTION presumably reads r, r2, invR, delta and isExcluded
- * and writes tempEnergy and dEdR (or dEdR1/dEdR2) - TODO confirm.
- *
- * @param forceBuffers         force accumulation buffer (long or real4, see above)
- * @param energyBuffer         energy accumulation buffer, indexed by global work-item id
- * @param posq                 per-atom real4; xyz is used as the position, w is copied into q
- * @param exclusions           exclusion bit masks, one unsigned int per block-x atom of a stored
- *                             tile; a cleared bit marks the pair as excluded
- * @param exclusionIndices     block-y index of each stored exclusion entry
- * @param exclusionRowIndices  entries x and x+1 bound the part of exclusionIndices searched for block x
- * @param startTileIndex       first tile index of the range divided among work-groups
- * @param endTileIndex         end of that range; only read in USE_CUTOFF builds when numTiles > maxTiles
- * @param tiles                (USE_CUTOFF) block pairs (x, y), read when numTiles <= maxTiles
- * @param interactionCount     (USE_CUTOFF) element 0 supplies numTiles
- * @param periodicBoxSize      (USE_CUTOFF) box lengths used to wrap delta under USE_PERIODIC
- * @param invPeriodicBoxSize   (USE_CUTOFF) multiplied with delta in the same wrap
- * @param maxTiles             (USE_CUTOFF) threshold above which tiles[] is ignored and tile
- *                             coordinates are computed from pos instead
- * @param interactionFlags     (USE_CUTOFF) not referenced in this kernel body
- * @param numTiles             (no cutoff) number of tiles divided among work-groups
- */
-__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
-void computeNonbonded(
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict forceBuffers,
-#else
-        __global real4* restrict forceBuffers,
-#endif
-        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
-        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
-        unsigned int startTileIndex, unsigned int endTileIndex,
-#ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
-#else
-        unsigned int numTiles
-#endif
-        PARAMETER_ARGUMENTS) {
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0]; // tile count is read from device memory in cutoff builds
-    unsigned int pos = (numTiles > maxTiles ? startTileIndex+get_group_id(0)*(endTileIndex-startTileIndex)/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0));
-    unsigned int end = (numTiles > maxTiles ? startTileIndex+(get_group_id(0)+1)*(endTileIndex-startTileIndex)/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0));
-#else
-    unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
-    unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
-#endif
-    real energy = 0; // per-work-item total, added to energyBuffer at the end
-    unsigned int lasty = 0xFFFFFFFF; // y of the previous tile; lets an off-diagonal tile skip reloading localData
-    __local AtomData localData[TILE_SIZE];
-    __local UnalignedReal3 localForce[FORCE_WORK_GROUP_SIZE]; // one slot per work-item for forces on block-y atoms
-#ifdef USE_EXCLUSIONS
-    __local unsigned int exclusionRange[2];
-    __local int exclusionIndex[1];
-#endif
-    while (pos < end) {
-        // Extract the coordinates of this tile.
-        unsigned int x, y;
-#ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
-            ushort2 tileIndices = tiles[pos];
-            x = tileIndices.x;
-            y = tileIndices.y;
-        }
-        else
-#endif
-        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
-        }
-        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); // work-items past the first TILE_SIZE start half a tile further on
-        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the tile
-        unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1); // local id rounded down to a multiple of TILE_SIZE
-        unsigned int atom1 = x*TILE_SIZE + tgx;
-        real4 force = 0;
-        real4 posq1 = posq[atom1];
-        LOAD_ATOM1_PARAMETERS
-        // Locate the exclusion data for this tile (exclusionIndex[0] stays -1 if none is stored).
-#ifdef USE_EXCLUSIONS
-        if (get_local_id(0) < 2)
-            exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)];
-        if (get_local_id(0) == 0)
-            exclusionIndex[0] = -1;
-        barrier(CLK_LOCAL_MEM_FENCE);
-        for (int i = exclusionRange[0]+get_local_id(0); i < exclusionRange[1]; i += FORCE_WORK_GROUP_SIZE)
-            if (exclusionIndices[i] == y)
-                exclusionIndex[0] = i*TILE_SIZE; // offset of this tile's masks in exclusions[]
-        barrier(CLK_LOCAL_MEM_FENCE);
-        bool hasExclusions = (exclusionIndex[0] > -1);
-#endif
-        if (x == y) {
-            // This tile is on the diagonal.
-            if (get_local_id(0) < TILE_SIZE) {
-                const unsigned int localAtomIndex = tgx;
-                localData[localAtomIndex].x = posq1.x;
-                localData[localAtomIndex].y = posq1.y;
-                localData[localAtomIndex].z = posq1.z;
-                localData[localAtomIndex].q = posq1.w;
-                LOAD_LOCAL_PARAMETERS_FROM_1
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef USE_EXCLUSIONS
-            unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom; // NOTE(review): read without testing hasExclusions - presumably diagonal tiles always have an entry; confirm
-#endif
-            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
-#ifdef USE_EXCLUSIONS
-                bool isExcluded = !(excl & 0x1);
-#endif
-                unsigned int atom2 = baseLocalAtom+j;
-                real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                real invR = RSQRT(r2);
-                real r = RECIP(invR);
-                LOAD_ATOM2_PARAMETERS
-                atom2 = y*TILE_SIZE+baseLocalAtom+j; // switch from the local index to the global atom index
-#ifdef USE_SYMMETRIC
-                real dEdR = 0;
-#else
-                real4 dEdR1 = (real4) 0;
-                real4 dEdR2 = (real4) 0;
-#endif
-                real tempEnergy = 0;
-                COMPUTE_INTERACTION
-                energy += 0.5f*tempEnergy; // halved on diagonal tiles; presumably each pair is visited from both atoms - TODO confirm
-#ifdef USE_SYMMETRIC
-                force.xyz -= delta.xyz*dEdR;
-#else
-                force.xyz -= dEdR1.xyz;
-#endif
-                excl >>= 1; // NOTE(review): not wrapped in USE_EXCLUSIONS although excl is declared under it - confirm the macro is always defined
-            }
-            // Sum the forces and write results.
-            if (get_local_id(0) >= TILE_SIZE) {
-                localData[tgx].fx = force.x;
-                localData[tgx].fy = force.y;
-                localData[tgx].fz = force.z;
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (get_local_id(0) < TILE_SIZE) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
-                const unsigned int offset = x*TILE_SIZE + tgx;
-                atom_add(&forceBuffers[offset], (long) ((force.x + localData[tgx].fx)*0x100000000));
-                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].fy)*0x100000000));
-                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].fz)*0x100000000));
-#else
-                force.x += localData[tgx].fx;
-                force.y += localData[tgx].fy;
-                force.z += localData[tgx].fz;
-#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-                unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
-#else
-                unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-#endif
-                // Cheaper to load/store real4 than real3.
-                real4 sum = forceBuffers[offset];
-                sum.xyz += force.xyz;
-                forceBuffers[offset] = sum;
-#endif
-            }
-            // barrier not required here as localData[*].temp is not accessed before encountering another barrier.
-        }
-        else {
-            // This is an off-diagonal tile; block y is reloaded only if it differs from the previous tile's y.
-            if (lasty != y && get_local_id(0) < TILE_SIZE) {
-                const unsigned int localAtomIndex = tgx;
-                unsigned int j = y*TILE_SIZE + tgx;
-                real4 tempPosq = posq[j];
-                localData[localAtomIndex].x = tempPosq.x;
-                localData[localAtomIndex].y = tempPosq.y;
-                localData[localAtomIndex].z = tempPosq.z;
-                localData[localAtomIndex].q = tempPosq.w;
-                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-            }
-            localForce[get_local_id(0)].x = 0;
-            localForce[get_local_id(0)].y = 0;
-            localForce[get_local_id(0)].z = 0;
-            barrier(CLK_LOCAL_MEM_FENCE);
-            // Compute the full set of interactions in this tile.
-            unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1); // starting block-y atom, staggered per work-item
-#ifdef USE_EXCLUSIONS
-            unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
-            excl = (excl >> tj) | (excl << (TILE_SIZE - tj)); // rotate the mask right by tj so bit 0 lines up with atom tj
-#endif
-            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
-#ifdef USE_EXCLUSIONS
-                bool isExcluded = !(excl & 0x1);
-#endif
-                real4 posq2 = (real4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
-                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                real invR = RSQRT(r2);
-                real r = RECIP(invR);
-                int atom2 = tj;
-                LOAD_ATOM2_PARAMETERS
-                atom2 = y*TILE_SIZE+tj; // switch from the local index to the global atom index
-#ifdef USE_SYMMETRIC
-                real dEdR = 0;
-#else
-                real4 dEdR1 = (real4) 0;
-                real4 dEdR2 = (real4) 0;
-#endif
-                real tempEnergy = 0;
-                COMPUTE_INTERACTION
-                energy += tempEnergy;
-#ifdef USE_SYMMETRIC
-                delta.xyz *= dEdR;
-                force.xyz -= delta.xyz;
-                localForce[tj+localForceOffset].x += delta.x;
-                localForce[tj+localForceOffset].y += delta.y;
-                localForce[tj+localForceOffset].z += delta.z;
-#else
-                force.xyz -= dEdR1.xyz;
-                localForce[tj+localForceOffset].x += dEdR2.x;
-                localForce[tj+localForceOffset].y += dEdR2.y;
-                localForce[tj+localForceOffset].z += dEdR2.z;
-#endif
-                barrier(CLK_LOCAL_MEM_FENCE); // presumably orders the localForce updates between iterations - confirm
-#ifdef USE_EXCLUSIONS
-                excl >>= 1;
-#endif
-                tj = (tj+1) & (TILE_SIZE-1); // advance to the next block-y atom, wrapping within the tile
-            }
-            // Sum the forces and write results.
-            if (get_local_id(0) >= TILE_SIZE) {
-                localData[tgx].fx = force.x;
-                localData[tgx].fy = force.y;
-                localData[tgx].fz = force.z;
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (get_local_id(0) < TILE_SIZE) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
-                const unsigned int offset1 = x*TILE_SIZE + tgx;
-                const unsigned int offset2 = y*TILE_SIZE + tgx;
-                atom_add(&forceBuffers[offset1], (long) ((force.x + localData[tgx].fx)*0x100000000));
-                atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].fy)*0x100000000));
-                atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].fz)*0x100000000));
-                atom_add(&forceBuffers[offset2], (long) ((localForce[tgx].x + localForce[tgx+TILE_SIZE].x)*0x100000000));
-                atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((localForce[tgx].y + localForce[tgx+TILE_SIZE].y)*0x100000000));
-                atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((localForce[tgx].z + localForce[tgx+TILE_SIZE].z)*0x100000000));
-#else
-#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-                const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
-                const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
-#else
-                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-#endif
-                // Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
-                real4 sum1 = forceBuffers[offset1];
-                real4 sum2 = forceBuffers[offset2];
-                sum1.x += localData[tgx].fx + force.x;
-                sum1.y += localData[tgx].fy + force.y;
-                sum1.z += localData[tgx].fz + force.z;
-                sum2.x += localForce[tgx].x + localForce[tgx+TILE_SIZE].x;
-                sum2.y += localForce[tgx].y + localForce[tgx+TILE_SIZE].y;
-                sum2.z += localForce[tgx].z + localForce[tgx+TILE_SIZE].z;
-                forceBuffers[offset1] = sum1;
-                forceBuffers[offset2] = sum2;
-#endif
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-        lasty = y;
-        pos++;
-    }
-    energyBuffer[get_global_id(0)] += energy;
-}
--- a/platforms/opencl/src/kernels/nonbonded_nvidia.cl
+++ b/platforms/opencl/src/kernels/nonbonded_nvidia.cl
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#endif
-#define TILE_SIZE 32 // Number of atoms along each side of a tile; also the number of work items that cooperate on one tile.
-#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE) // Number of TILE_SIZE-wide groups of work items per work group.
-typedef struct {
-    real x, y, z; // Position of the staged atom (copied from posq .xyz by the kernel).
-    real q; // Copied from posq .w by the kernel.
-    real fx, fy, fz; // Force accumulated on the staged atom while an off-diagonal tile is processed.
-    ATOM_PARAMETER_DATA // Extra per-atom fields supplied by a macro defined outside this section.
-#ifndef PARAMETER_SIZE_IS_EVEN
-    real padding; // Present only when PARAMETER_SIZE_IS_EVEN is not defined. NOTE(review): presumably keeps the struct size even -- confirm against the host code.
-#endif
-} AtomData;
-/**
- * Compute nonbonded interactions.
- *
- * Work is split into tiles of TILE_SIZE x TILE_SIZE atom pairs.  Every run of TILE_SIZE
- * consecutive work items (a "warp" below) walks the tile positions pos..end-1.  Within a tile,
- * work item tgx owns atom1 = x*TILE_SIZE+tgx and loops over the TILE_SIZE atoms of block y,
- * which are staged in localData.
- *
- * forceBuffers: with SUPPORTS_64_BIT_ATOMICS, three consecutive planes of PADDED_NUM_ATOMS
- *     longs (x, y, z) updated with atom_add in 2^32 fixed point; otherwise one real4 per atom
- *     per work group, indexed with get_group_id(0)*PADDED_NUM_ATOMS.
- * energyBuffer: one entry per work item; this work item's accumulated energy is added to it.
- * posq: per-atom real4; xyz is used as the position and w is copied into the q field.
- * exclusions: one unsigned int mask per atom1 for each tile that has exclusion data; a clear
- *     bit marks the corresponding pair as excluded (see isExcluded below).
- * exclusionIndices / exclusionRowIndices: entries exclusionRowIndices[x] up to (not including)
- *     exclusionRowIndices[x+1] of exclusionIndices are searched for y to find the tile's masks.
- * startTileIndex / endTileIndex: tile range used when tile coordinates are enumerated rather
- *     than read from the tiles list (see the pos/end setup below).
- * tiles, interactionCount, maxTiles, interactionFlags (USE_CUTOFF only): interactionCount[0]
- *     is the number of listed tiles; if it exceeds maxTiles the list is ignored and tile
- *     coordinates are computed from pos instead.  interactionFlags[pos] selects which atoms of
- *     block y are processed; 0xFFFFFFFF means the whole tile.
- * periodicBoxSize / invPeriodicBoxSize (USE_CUTOFF only): used for the minimum-image wrap of
- *     delta when USE_PERIODIC is defined.
- * numTiles (without USE_CUTOFF): number of tiles divided among the warps.
- *
- * PARAMETER_ARGUMENTS, LOAD_ATOM1_PARAMETERS, LOAD_ATOM2_PARAMETERS, LOAD_LOCAL_PARAMETERS_FROM_1,
- * LOAD_LOCAL_PARAMETERS_FROM_GLOBAL and COMPUTE_INTERACTION are macros defined outside this
- * section; COMPUTE_INTERACTION is expected to fill tempEnergy and dEdR (or dEdR1/dEdR2).
- *
- * NOTE(review): the local-memory reductions below contain no barrier between a store by one
- * work item and a load by another in the same warp -- this appears to rely on the TILE_SIZE
- * work items of a warp executing in lockstep; confirm for the target device.
- * NOTE(review): in the non-atomic write path a barrier sits inside a loop nested in the
- * do/while over tiles; warps of one work group whose tile counts differ would reach it a
- * different number of times -- confirm this is safe on the target device.
- */
-__kernel void computeNonbonded(
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict forceBuffers,
-#else
-        __global real4* restrict forceBuffers,
-#endif
-        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
-        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
-        unsigned int startTileIndex, unsigned int endTileIndex,
-#ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
-#else
-        unsigned int numTiles
-#endif
-        PARAMETER_ARGUMENTS) {
-    unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
-    unsigned int warp = get_global_id(0)/TILE_SIZE;
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
-    unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*(endTileIndex-startTileIndex)/totalWarps : warp*numTiles/totalWarps); // If the tile list overflowed, fall back to splitting the full tile range.
-    unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*(endTileIndex-startTileIndex)/totalWarps : (warp+1)*numTiles/totalWarps);
-#else
-    unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
-    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
-#endif
-    real energy = 0;
-    __local AtomData localData[FORCE_WORK_GROUP_SIZE];
-    __local real tempBuffer[3*FORCE_WORK_GROUP_SIZE]; // Three reals (x, y, z) of scratch per work item, used by the partial-tile reduction.
-    __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
-    __local int exclusionIndex[WARPS_PER_GROUP];
-    __local int2* reservedBlocks = (__local int2*) exclusionRange; // Aliases exclusionRange; only used by the non-atomic write-out path at the bottom of the loop.
-    do {
-        // Extract the coordinates of this tile
-        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // Index of this work item within its warp.
-        const unsigned int tbx = get_local_id(0) - tgx; // Local id of the first work item in this warp.
-        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
-        unsigned int x, y;
-        real4 force = 0;
-        if (pos < end) {
-#ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
-                ushort2 tileIndices = tiles[pos];
-                x = tileIndices.x;
-                y = tileIndices.y;
-            }
-            else
-#endif
-            {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); // Solve pos = x + y*NUM_BLOCKS - y*(y+1)/2 for the row y (tiles with x >= y are numbered row by row).
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                }
-            }
-            unsigned int atom1 = x*TILE_SIZE + tgx;
-            real4 posq1 = posq[atom1];
-            LOAD_ATOM1_PARAMETERS
-            // Locate the exclusion data for this tile.
-#ifdef USE_EXCLUSIONS
-            if (tgx < 2)
-                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx]; // Work items 0 and 1 fetch the start and end of row x's index range.
-            if (tgx == 0)
-                exclusionIndex[localGroupIndex] = -1;
-            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-                if (exclusionIndices[i] == y)
-                    exclusionIndex[localGroupIndex] = i*TILE_SIZE; // Offset of this tile's first mask in exclusions.
-            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
-#else
-            bool hasExclusions = false;
-#endif
-            if (pos >= end)
-                ; // This warp is done.  NOTE(review): unreachable -- the enclosing block already requires pos < end.
-            else if (x == y) {
-                // This tile is on the diagonal.
-                const unsigned int localAtomIndex = get_local_id(0);
-                localData[localAtomIndex].x = posq1.x;
-                localData[localAtomIndex].y = posq1.y;
-                localData[localAtomIndex].z = posq1.z;
-                localData[localAtomIndex].q = posq1.w;
-                LOAD_LOCAL_PARAMETERS_FROM_1
-#ifdef USE_EXCLUSIONS
-                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
-#endif
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1);
-#endif
-                    int atom2 = tbx+j; // Local-memory index of the second atom; replaced by its global index below.
-                    real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                    real invR = RSQRT(r2);
-                    real r = RECIP(invR);
-                    LOAD_ATOM2_PARAMETERS
-                    atom2 = y*TILE_SIZE+j;
-#ifdef USE_SYMMETRIC
-                    real dEdR = 0;
-#else
-                    real4 dEdR1 = (real4) 0;
-                    real4 dEdR2 = (real4) 0;
-#endif
-                    real tempEnergy = 0;
-                    COMPUTE_INTERACTION
-                    energy += 0.5f*tempEnergy; // In a diagonal tile each pair is visited from both of its atoms, so count half each time.
-#ifdef USE_SYMMETRIC
-                    force.xyz -= delta.xyz*dEdR;
-#else
-                    force.xyz -= dEdR1.xyz;
-#endif
-#ifdef USE_EXCLUSIONS
-                    excl >>= 1;
-#endif
-                }
-            }
-            else {
-                // This is an off-diagonal tile.
-                const unsigned int localAtomIndex = get_local_id(0);
-                unsigned int j = y*TILE_SIZE + tgx;
-                real4 tempPosq = posq[j];
-                localData[localAtomIndex].x = tempPosq.x;
-                localData[localAtomIndex].y = tempPosq.y;
-                localData[localAtomIndex].z = tempPosq.z;
-                localData[localAtomIndex].q = tempPosq.w;
-                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-                localData[localAtomIndex].fx = 0;
-                localData[localAtomIndex].fy = 0;
-                localData[localAtomIndex].fz = 0;
-#ifdef USE_CUTOFF
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-                if (!hasExclusions && flags != 0xFFFFFFFF) {
-                    if (flags == 0) {
-                        // No interactions in this tile.
-                    }
-                    else {
-                        // Compute only a subset of the interactions in this tile.
-                        for (j = 0; j < TILE_SIZE; j++) {
-                            if ((flags&(1<<j)) != 0) {
-                                bool isExcluded = false;
-                                int atom2 = tbx+j; // Every work item of the warp handles the same atom2 in this step.
-                                int bufferIndex = 3*get_local_id(0);
-#ifdef USE_SYMMETRIC
-                                real dEdR = 0;
-#else
-                                real4 dEdR1 = (real4) 0;
-                                real4 dEdR2 = (real4) 0;
-#endif
-                                real tempEnergy = 0;
-                                real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-                                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                                if (r2 < CUTOFF_SQUARED) {
-#endif
-                                    real invR = RSQRT(r2);
-                                    real r = RECIP(invR);
-                                    LOAD_ATOM2_PARAMETERS
-                                    atom2 = y*TILE_SIZE+j;
-                                    COMPUTE_INTERACTION
-                                    energy += tempEnergy;
-#ifdef USE_CUTOFF
-                                }
-#endif
-#ifdef USE_SYMMETRIC
-                                delta.xyz *= dEdR;
-                                force.xyz -= delta.xyz;
-                                tempBuffer[bufferIndex] = delta.x;
-                                tempBuffer[bufferIndex+1] = delta.y;
-                                tempBuffer[bufferIndex+2] = delta.z;
-#else
-                                force.xyz -= dEdR1.xyz;
-                                tempBuffer[bufferIndex] = dEdR2.x;
-                                tempBuffer[bufferIndex+1] = dEdR2.y;
-                                tempBuffer[bufferIndex+2] = dEdR2.z;
-#endif
-                                // Sum the forces on atom2.  Two stages over the warp's 32 entries of tempBuffer, with no barrier in between.
-                                if (tgx % 4 == 0) { // Stage 1: every fourth work item adds the entries of the next three work items to its own.
-                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
-                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
-                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
-                                }
-                                if (tgx == 0) { // Stage 2: work item 0 adds the eight partial sums (stride 12 = 4 work items * 3 components).
-                                    localData[tbx+j].fx += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
-                                    localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
-                                    localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
-                                }
-                            }
-                        }
-                    }
-                }
-                else
-#endif
-                {
-                    // Compute the full set of interactions in this tile.
-#ifdef USE_EXCLUSIONS
-                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
-                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); // Rotate right by tgx so that bit 0 matches the first tj visited (tj starts at tgx).
-#endif
-                    unsigned int tj = tgx;
-                    for (j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                        bool isExcluded = !(excl & 0x1);
-#endif
-                        int atom2 = tbx+tj;
-                        real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                        if (r2 < CUTOFF_SQUARED) {
-#endif
-                            real invR = RSQRT(r2);
-                            real r = RECIP(invR);
-                            LOAD_ATOM2_PARAMETERS
-                            atom2 = y*TILE_SIZE+tj;
-#ifdef USE_SYMMETRIC
-                            real dEdR = 0;
-#else
-                            real4 dEdR1 = (real4) 0;
-                            real4 dEdR2 = (real4) 0;
-#endif
-                            real tempEnergy = 0;
-                            COMPUTE_INTERACTION
-                            energy += tempEnergy;
-#ifdef USE_SYMMETRIC
-                            delta.xyz *= dEdR;
-                            force.xyz -= delta.xyz;
-                            localData[tbx+tj].fx += delta.x;
-                            localData[tbx+tj].fy += delta.y;
-                            localData[tbx+tj].fz += delta.z;
-#else
-                            force.xyz -= dEdR1.xyz;
-                            localData[tbx+tj].fx += dEdR2.x;
-                            localData[tbx+tj].fy += dEdR2.y;
-                            localData[tbx+tj].fz += dEdR2.z;
-#endif
-#ifdef USE_CUTOFF
-                        }
-#endif
-#ifdef USE_EXCLUSIONS
-                        excl >>= 1;
-#endif
-                        tj = (tj + 1) & (TILE_SIZE - 1); // Advance with wraparound; since tj started at tgx, the work items of a warp address distinct tj values at each step.
-                    }
-                }
-            }
-        }
-        // Write results.  We need to coordinate between warps to make sure no two of them
-        // ever try to write to the same piece of memory at the same time.
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        if (pos < end) {
-            const unsigned int offset = x*TILE_SIZE + tgx;
-            atom_add(&forceBuffers[offset], (long) (force.x*0x100000000)); // Fixed point: scale by 2^32 and accumulate as a 64-bit integer.
-            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
-        }
-        if (pos < end && x != y) {
-            const unsigned int offset = y*TILE_SIZE + tgx;
-            atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0x100000000));
-            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
-            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
-        }
-#else
-        int writeX = (pos < end ? x : -1); // -1 means this warp has nothing to write for that block.
-        int writeY = (pos < end && x != y ? y : -1);
-        if (tgx == 0)
-            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY); // Publish which atom blocks this warp still needs to write.
-        bool done = false;
-        int doneIndex = 0;
-        int checkIndex = 0;
-        while (true) {
-            // See if any warp still needs to write its data.
-            bool allDone = true;
-            barrier(CLK_LOCAL_MEM_FENCE);
-            while (doneIndex < WARPS_PER_GROUP && allDone) {
-                if (reservedBlocks[doneIndex].x != -1)
-                    allDone = false;
-                else
-                    doneIndex++;
-            }
-            if (allDone)
-                break;
-            if (!done) {
-                // See whether this warp can write its data.  This requires that no previous warp
-                // is trying to write to the same block of the buffer.
-                bool canWrite = (writeX != -1);
-                while (checkIndex < localGroupIndex && canWrite) {
-                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
-                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
-                        canWrite = false;
-                    else
-                        checkIndex++;
-                }
-                if (canWrite) {
-                    // Write the data to global memory, then mark this warp as done.
-                    if (writeX > -1) {
-                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                        forceBuffers[offset].xyz += force.xyz;
-                    }
-                    if (writeY > -1) {
-                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                        forceBuffers[offset] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
-                    }
-                    done = true;
-                    if (tgx == 0)
-                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
-                }
-            }
-        }
-#endif
-        pos++;
-    } while (pos < end);
-    energyBuffer[get_global_id(0)] += energy;
-}
--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
@@ -15,6 +15,7 @@ __kernel void updateBsplines(__global const real4* restrict posq, __global real4
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);
        pmeAtomGridIndex[i] = (int2) (i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
+#ifndef SUPPORTS_64_BIT_ATOMICS
        data[PME_ORDER-1] = 0.0f;
        data[1] = dr;
        data[0] = 1.0f-dr;
@@ -33,6 +34,7 @@ __kernel void updateBsplines(__global const real4* restrict posq, __global real4
            data[j].w = pos.w; // Storing the charge here improves cache coherency in the charge spreading kernel
            pmeBsplineTheta[i+j*NUM_ATOMS] = data[j];
        }
+#endif
    }
 }
@@ -80,56 +82,66 @@ __kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global co
 #ifdef SUPPORTS_64_BIT_ATOMICS
 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
+__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
-__kernel __attribute__((reqd_work_group_size(BUFFER_SIZE, 1, 1)))
-void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
        __global long* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-    int ix = get_local_id(0)/(PME_ORDER*PME_ORDER);
+    const real4 scale = 1/(real) (PME_ORDER-1);
-    int remainder = get_local_id(0)-ix*PME_ORDER*PME_ORDER;
+    real4 data[PME_ORDER];
-    int iy = remainder/PME_ORDER;
-    int iz = remainder-iy*PME_ORDER;
+    // Process the atoms in spatially sorted order.  This improves efficiency when writing
-    __local real4 theta[PME_ORDER];
+    // the grid values.
-    __local real charge[BUFFER_SIZE];
-    __local int basex[BUFFER_SIZE];
+    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
-    __local int basey[BUFFER_SIZE];
+        int atom = pmeAtomGridIndex[i].x;
-    __local int basez[BUFFER_SIZE];
+        real4 pos = posq[atom];
-    if (ix < PME_ORDER) {
+        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
-        for (int baseIndex = get_group_id(0)*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += get_num_groups(0)*BUFFER_SIZE) {
+        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
-            // Load the next block of atoms into the buffers.
+        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+                           (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
+                           (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
+        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
+                                 ((int) t.y) % GRID_SIZE_Y,
+                                 ((int) t.z) % GRID_SIZE_Z, 0);
-            if (get_local_id(0) < BUFFER_SIZE) {
+        // Since we need the full set of thetas, it's faster to compute them here than load them
-                int atomIndex = baseIndex+get_local_id(0);
+        // from global memory.
-                if (atomIndex < NUM_ATOMS) {
-                    real4 pos = posq[atomIndex];
+        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
-                    charge[get_local_id(0)] = pos.w;
+        data[PME_ORDER-1] = 0.0f;
-                    pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        data[1] = dr;
-                    pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+        data[0] = 1.0f-dr;
-                    pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+        for (int j = 3; j < PME_ORDER; j++) {
-                    basex[get_local_id(0)] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X);
+            real div = RECIP(j-1.0f);
-                    basey[get_local_id(0)] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y);
+            data[j-1] = div*dr*data[j-2];
-                    basez[get_local_id(0)] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
+            for (int k = 1; k < (j-1); k++)
-                }
+                data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
-            }
+            data[0] = div*(- dr+1.0f)*data[0];
-            barrier(CLK_LOCAL_MEM_FENCE);
+        }
-            int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex);
+        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
-            for (int index = 0; index < lastIndex; index++) {
+        for (int j = 1; j < (PME_ORDER-1); j++)
-                int atomIndex = index+baseIndex;
+            data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
-                if (get_local_id(0) < PME_ORDER)
+        data[0] = scale*(-dr+1.0f)*data[0];
-                    theta[get_local_id(0)] = pmeBsplineTheta[atomIndex+get_local_id(0)*NUM_ATOMS];
-                barrier(CLK_LOCAL_MEM_FENCE);
+        // Spread the charge from this atom onto each grid point.
-                real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
-                int x = basex[index]+ix;
+        for (int ix = 0; ix < PME_ORDER; ix++) {
-                int y = basey[index]+iy;
+            int xindex = gridIndex.x+ix;
-                int z = basez[index]+iz;
+            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
-                x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+            for (int iy = 0; iy < PME_ORDER; iy++) {
-                y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                int yindex = gridIndex.y+iy;
-                z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int zindex = gridIndex.z+iz;
+                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
+                    real add = pos.w*data[ix].x*data[iy].y*data[iz].z;
 #ifdef USE_DOUBLE_PRECISION
-                atom_add(&pmeGrid[2*(x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z)], (long) (add*0x100000000));
+                    atom_add(&pmeGrid[2*index], (long) (add*0x100000000));
 #else
-                atom_add(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], (long) (add*0x100000000));
+                    atom_add(&pmeGrid[index], (long) (add*0x100000000));
 #endif
+                }
            }
        }
    }
@@ -149,6 +161,75 @@ __kernel void finishSpreadCharge(__global long* restrict pmeGrid) {
        realGrid[index] = realValue;
    }
 }
+#elif defined(DEVICE_IS_CPU)
+__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
+        __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+    const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0); // This work item owns the grid planes firstx <= x < lastx.
+    const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
+    if (firstx == lastx) // No grid planes were assigned to this work item.
+        return;
+    const real4 scale = 1/(real) (PME_ORDER-1);
+    real4 data[PME_ORDER];
+    // Every work item scans all atoms and deposits charge only onto the x planes it owns, so
+    // no two work items ever update the same grid point.
+    for (int i = 0; i < NUM_ATOMS; i++) {
+        int atom = i; // Atoms are visited in index order; the sorted lookup pmeAtomGridIndex[i].x is disabled here.
+        real4 pos = posq[atom];
+        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+                           (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
+                           (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
+        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
+                                 ((int) t.y) % GRID_SIZE_Y,
+                                 ((int) t.z) % GRID_SIZE_Z, 0);
+        // Spread the charge from this atom onto each grid point.
+        bool hasComputedThetas = false;
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int xindex = gridIndex.x+ix;
+            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0); // Wrap around the periodic grid.
+            if (xindex < firstx || xindex >= lastx)
+                continue;
+            if (!hasComputedThetas) { // Computed lazily, at most once per atom, and only if the atom touches a plane owned by this work item.
+                hasComputedThetas = true;
+                // Since we need the full set of thetas, it's faster to compute them here than load them
+                // from global memory.
+                real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f); // Fractional offset of the atom within its grid cell.
+                data[PME_ORDER-1] = 0.0f;
+                data[1] = dr;
+                data[0] = 1.0f-dr;
+                for (int j = 3; j < PME_ORDER; j++) {
+                    real div = RECIP(j-1.0f);
+                    data[j-1] = div*dr*data[j-2];
+                    for (int k = 1; k < (j-1); k++)
+                        data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
+                    data[0] = div*(- dr+1.0f)*data[0];
+                }
+                data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
+                for (int j = 1; j < (PME_ORDER-1); j++)
+                    data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
+                data[0] = scale*(-dr+1.0f)*data[0];
+            }
+            for (int iy = 0; iy < PME_ORDER; iy++) {
+                int yindex = gridIndex.y+iy;
+                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int zindex = gridIndex.z+iz;
+                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
+                    pmeGrid[index].x += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z; // Accumulates into the .x component only; no atomics are used in this variant.
+                }
+            }
+        }
+    }
+}
 #else
 __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
        __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
@@ -239,11 +320,16 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r
 }
 __kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid,
-        real4 periodicBoxSize, real4 invPeriodicBoxSize, __local real4* restrict bsplinesCache) {
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, __global int2* restrict pmeAtomGridIndex) {
    const real4 scale = 1/(real) (PME_ORDER-1);
-    __local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
+    real4 data[PME_ORDER];
-    __local real4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
+    real4 ddata[PME_ORDER];
-    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
+    // Process the atoms in spatially sorted order.  This improves cache performance when loading
+    // the grid values.
+    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
+        int atom = pmeAtomGridIndex[i].x;
        real4 force = 0.0f;
        real4 pos = posq[atom];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
@@ -293,29 +379,10 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
                    real gridvalue = pmeGrid[index].x;
                    force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
                    force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
-#ifndef MAC_AMD_WORKAROUND
-                    force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
-#endif
-                }
-            }
-        }
-#ifdef MAC_AMD_WORKAROUND
-        for (int ix = 0; ix < PME_ORDER; ix++) {
-            int xindex = gridIndex.x+ix;
-            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
-            for (int iy = 0; iy < PME_ORDER; iy++) {
-                int yindex = gridIndex.y+iy;
-                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
-                for (int iz = 0; iz < PME_ORDER; iz++) {
-                    int zindex = gridIndex.z+iz;
-                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
-                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    real gridvalue = pmeGrid[index].x;
                    force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
                }
            }
        }
-#endif
        real4 totalForce = forceBuffers[atom];
        real q = pos.w*EPSILON_FACTOR;
        totalForce.x -= q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x;

--- a/platforms/opencl/src/kernels/pme_cpu.cl
+++ b/platforms/opencl/src/kernels/pme_cpu.cl
-__kernel void updateBsplines(__global const real4* restrict posq, __global real4* restrict pmeBsplineTheta, __local real4* restrict bsplinesCache, __global int2* restrict pmeAtomGridIndex, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global real4* restrict pmeBsplineDTheta) { // Fills pmeBsplineTheta and pmeBsplineDTheta for every atom; pmeAtomGridIndex is not referenced in this body.
-    const real4 scale = 1.0f/(PME_ORDER-1);
-    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) { // each work item handles atoms i, i+global_size, ...
-        __local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER]; // this work item's PME_ORDER-entry slice of the cache
-        __local real4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER]; // second slice, placed after all of the data slices
-        for (int j = 0; j < PME_ORDER; j++) {
-	    data[j] = 0;
-            ddata[j] = 0;
-        }
-        real4 pos = posq[i];
-        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; // subtract a whole number of box lengths (assumes invPeriodicBoxSize is 1/periodicBoxSize -- confirm with caller)
-        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
-        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
-                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
-                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0); // position rescaled by the grid dimensions
-        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0); // t minus its integer truncation, per axis
-        data[PME_ORDER-1] = 0;
-        data[1] = dr;
-        data[0] = 1.0f-dr;
-        for (int j = 3; j < PME_ORDER; j++) { // recurrence applied for j = 3 .. PME_ORDER-1; x, y and z are carried together as real4 components
-            real div = 1.0f/(j-1.0f);
-            data[j-1] = div*dr*data[j-2];
-            for (int k = 1; k < (j-1); k++)
-                data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
-            data[0] = div*(- dr+1.0f)*data[0];
-        }
-        ddata[0] = -data[0]; // differences are taken from data as it stands before the final recurrence step below
-        for (int j = 1; j < PME_ORDER; j++)
-            ddata[j] = data[j-1]-data[j];
-        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2]; // same step as the loop body above with j = PME_ORDER, using the precomputed scale
-        for (int j = 1; j < (PME_ORDER-1); j++)
-            data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
-        data[0] = scale*(-dr+1.0f)*data[0];
-        for (int j = 0; j < PME_ORDER; j++) { // write out with a stride of NUM_ATOMS between successive j for the same atom
-            pmeBsplineTheta[i+j*NUM_ATOMS] = data[j];
-            pmeBsplineDTheta[i+j*NUM_ATOMS] = ddata[j];
-        }
-    }
-}
-/**
- * This kernel is not actually used when running on a CPU.
- * Its body is empty, so none of its parameters are read or written.
- */
-__kernel void findAtomRangeForGrid(__global const int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const real4* restrict posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-}
-__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange, __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) { // Accumulates charge into pmeGrid[].x; pmeAtomGridIndex and pmeAtomRange are not referenced in this body.
-    const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0); // this work item owns the x slab [firstx, lastx)
-    const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
-    for (int gridIndex = firstx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex < lastx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex++) // clear the owned slab before accumulating
-        pmeGrid[gridIndex] = (real2) 0;
-    for (int atom = 0; atom < NUM_ATOMS; atom++) { // every work item visits every atom and keeps only contributions that land in its slab
-        real4 pos = posq[atom];
-        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
-        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
-        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
-                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
-                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0);
-        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0); // NOTE: dr is not used again in this function
-        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
-                                 ((int) t.y) % GRID_SIZE_Y,
-                                 ((int) t.z) % GRID_SIZE_Z, 0); // first grid point touched by this atom along each axis
-        real atomCharge = pos.w*EPSILON_FACTOR;
-        for (int ix = 0; ix < PME_ORDER; ix++) {
-            int xindex = gridIndex.x+ix;
-            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0); // wrap around the end of the grid
-            if (xindex < firstx || xindex >= lastx) // outside the owned slab: the work item owning that slab writes it instead
-                continue;
-            for (int iy = 0; iy < PME_ORDER; iy++) {
-                int yindex = gridIndex.y+iy;
-                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
-                for(int iz = 0; iz < PME_ORDER; iz++) {
-                    int zindex = gridIndex.z+iz;
-                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
-                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex; // flattened index with z varying fastest
-                    pmeGrid[index].x += atomCharge*pmeBsplineTheta[atom+ix*NUM_ATOMS].x*pmeBsplineTheta[atom+iy*NUM_ATOMS].y*pmeBsplineTheta[atom+iz*NUM_ATOMS].z; // charge times the product of the stored x, y and z weights
-                }
-            }
-        }
-    }
-}
-__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global real* restrict energyBuffer, __global const real* restrict pmeBsplineModuliX,
-        __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 invPeriodicBoxSize, real recipScaleFactor) { // Scales every grid value in place by eterm and adds this work item's energy sum to energyBuffer.
-    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-    real energy = 0;
-    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
-        int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z); // unflatten index into (kx, ky, kz), z varying fastest
-        int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
-        int ky = remainder/GRID_SIZE_Z;
-        int kz = remainder-ky*GRID_SIZE_Z;
-        if (kx == 0 && ky == 0 && kz == 0) // m2, and therefore denom, would be zero here; the grid value at index 0 is left unchanged
-            continue;
-        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X); // indices in the upper part of the range map to negative values
-        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
-        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
-        real mhx = mx*invPeriodicBoxSize.x;
-        real mhy = my*invPeriodicBoxSize.y;
-        real mhz = mz*invPeriodicBoxSize.z;
-        real bx = pmeBsplineModuliX[kx];
-        real by = pmeBsplineModuliY[ky];
-        real bz = pmeBsplineModuliZ[kz];
-        real2 grid = pmeGrid[index];
-        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
-        real denom = m2*bx*by*bz;
-        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
-        pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
-        energy += eterm*(grid.x*grid.x + grid.y*grid.y); // uses the grid value read before it was scaled
-    }
-    energyBuffer[get_global_id(0)] += 0.5f*energy; // one accumulator slot per work item, indexed by global id
-}
-__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict pmeBsplineTheta, __global const real4* restrict pmeBsplineDTheta) { // Reads pmeGrid[].x around each atom and subtracts the resulting force from forceBuffers[atom].
-    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
-        real4 force = 0;
-        real4 pos = posq[atom];
-        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
-        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
-        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
-                           (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
-                           (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0);
-        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
-                                 ((int) t.y) % GRID_SIZE_Y,
-                                 ((int) t.z) % GRID_SIZE_Z, 0); // first grid point visited for this atom along each axis
-        for (int ix = 0; ix < PME_ORDER; ix++) {
-            int xindex = gridIndex.x+ix;
-            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0); // wrap around the end of the grid
-            real tx = pmeBsplineTheta[atom+ix*NUM_ATOMS].x; // tables are read with a stride of NUM_ATOMS between successive entries for one atom
-            real dtx = pmeBsplineDTheta[atom+ix*NUM_ATOMS].x;
-            for (int iy = 0; iy < PME_ORDER; iy++) {
-                int yindex = gridIndex.y+iy;
-                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
-                real ty = pmeBsplineTheta[atom+iy*NUM_ATOMS].y;
-                real dty = pmeBsplineDTheta[atom+iy*NUM_ATOMS].y;
-                for (int iz = 0; iz < PME_ORDER; iz++) {
-                    int zindex = gridIndex.z+iz;
-                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
-                    real tz = pmeBsplineTheta[atom+iz*NUM_ATOMS].z;
-                    real dtz = pmeBsplineDTheta[atom+iz*NUM_ATOMS].z;
-                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    real gridvalue = pmeGrid[index].x;
-                    force.x += dtx*ty*tz*gridvalue; // DTheta entry along the component's own axis, Theta entries along the other two
-                    force.y += tx*dty*tz*gridvalue;
-                    force.z += tx*ty*dtz*gridvalue;
-                }
-            }
-        }
-        real4 totalForce = forceBuffers[atom];
-        real q = pos.w*EPSILON_FACTOR;
-        totalForce.x -= q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x;
-        totalForce.y -= q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y;
-        totalForce.z -= q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z;
-        forceBuffers[atom] = totalForce; // the w component is written back as it was read
-    }
-}
--- a/platforms/opencl/src/kernels/random.cl
+++ b/platforms/opencl/src/kernels/random.cl
@@ -25,7 +25,7 @@ __kernel void generateRandomNumbers(int numValues, __global float4* restrict ran
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x1 = sqrt(-2.0f * log(x1));
+        x1 = SQRT(-2.0f * LOG(x1));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
@@ -50,7 +50,7 @@ __kernel void generateRandomNumbers(int numValues, __global float4* restrict ran
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x3 = sqrt(-2.0f * log(x3));
+        x3 = SQRT(-2.0f * LOG(x3));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
@@ -75,7 +75,7 @@ __kernel void generateRandomNumbers(int numValues, __global float4* restrict ran
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x5 = sqrt(-2.0f * log(x5));
+        x5 = SQRT(-2.0f * LOG(x5));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
@@ -100,7 +100,7 @@ __kernel void generateRandomNumbers(int numValues, __global float4* restrict ran
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x7 = sqrt(-2.0f * log(x7));
+        x7 = SQRT(-2.0f * LOG(x7));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;

--- a/platforms/opencl/src/kernels/settle.cl
+++ b/platforms/opencl/src/kernels/settle.cl
@@ -63,9 +63,9 @@ __kernel void applySettle(int numClusters, mixed tol, __global const real4* rest
        mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
        mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
-        mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
+        mixed axlng = SQRT(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
-        mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
+        mixed aylng = SQRT(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
-        mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
+        mixed azlng = SQRT(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
        mixed trns11 = xaksXd / axlng;
        mixed trns21 = yaksXd / axlng;
        mixed trns31 = zaksXd / axlng;
@@ -91,13 +91,13 @@ __kernel void applySettle(int numClusters, mixed tol, __global const real4* rest
        //                                        --- Step2  A2' ---
        float rc = 0.5*params.y;
-        mixed rb = sqrt(params.x*params.x-rc*rc);
+        mixed rb = SQRT(params.x*params.x-rc*rc);
        mixed ra = rb*(m1+m2)*invTotalMass;
        rb -= ra;
        mixed sinphi = za1d / ra;
-        mixed cosphi = sqrt(1.0f - sinphi*sinphi);
+        mixed cosphi = SQRT(1.0f - sinphi*sinphi);
        mixed sinpsi = (zb1d - zc1d) / (2*rc*cosphi);
-        mixed cospsi = sqrt(1.0f - sinpsi*sinpsi);
+        mixed cospsi = SQRT(1.0f - sinpsi*sinpsi);
        mixed ya2d =   ra*cosphi;
        mixed xb2d = - rc*cospsi;
@@ -105,7 +105,7 @@ __kernel void applySettle(int numClusters, mixed tol, __global const real4* rest
        mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
        mixed xb2d2 = xb2d*xb2d;
        mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
-        mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
+        mixed deltx = 2.0f*xb2d + SQRT(4.0f*xb2d2 - hh2 + params.y*params.y);
        xb2d -= deltx*0.5;
        //                                        --- Step3  al,be,ga ---
@@ -115,11 +115,11 @@ __kernel void applySettle(int numClusters, mixed tol, __global const real4* rest
        mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
        mixed al2be2 = alpha*alpha + beta*beta;
-        mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
+        mixed sintheta = (alpha*gamma - beta*SQRT(al2be2 - gamma*gamma)) / al2be2;
        //                                        --- Step4  A3' ---
-        mixed costheta = sqrt(1.0f - sintheta*sintheta);
+        mixed costheta = SQRT(1.0f - sintheta*sintheta);
        mixed xa3d = - ya2d*sintheta;
        mixed ya3d =   ya2d*costheta;
        mixed za3d = za1d;
@@ -186,9 +186,9 @@ __kernel void constrainVelocities(int numClusters, mixed tol, __global const rea
        mixed4 eAB = apos1-apos0;
        mixed4 eBC = apos2-apos1;
        mixed4 eCA = apos0-apos2;
-        eAB.xyz /= sqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
+        eAB.xyz /= SQRT(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
-        eBC.xyz /= sqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
+        eBC.xyz /= SQRT(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
-        eCA.xyz /= sqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
+        eCA.xyz /= SQRT(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
        mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
        mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
        mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;

--- a/platforms/opencl/src/kernels/sort.cl
+++ b/platforms/opencl/src/kernels/sort.cl
@@ -4,6 +4,47 @@ KEY_TYPE getValue(DATA_TYPE value) {
    return SORT_KEY;
 }
+/**
+ * Sort a list that is short enough to entirely fit in local memory.  This is executed as
+ * a single thread block.
+ *
+ * data        the array to sort in place: it is copied into dataBuffer, sorted there, and copied back
+ * length      the number of elements of data to sort
+ * dataBuffer  local memory workspace; it is indexed from 0 to length-1, so it must hold at least length elements
+ *
+ * Elements are compared by the key that getValue() returns for them.
+ */
+__kernel void sortShortList(__global DATA_TYPE* __restrict__ data, uint length, __local DATA_TYPE* dataBuffer) {
+    // Load the data into local memory.
+    for (int index = get_local_id(0); index < length; index += get_local_size(0))
+        dataBuffer[index] = data[index];
+    barrier(CLK_LOCAL_MEM_FENCE); // every element must be loaded before any work item starts comparing
+    // Perform a bitonic sort in local memory.
+    for (unsigned int k = 2; k < 2*length; k *= 2) { // k runs over powers of two below 2*length, so length need not be a power of two
+        for (unsigned int j = k/2; j > 0; j /= 2) { // j is the XOR distance between the two elements of each pair
+            for (unsigned int i = get_local_id(0); i < length; i += get_local_size(0)) {
+                int ixj = i^j;
+                if (ixj > i && ixj < length) { // handle each pair once, from its lower index, and skip partners past the end of the list
+                    DATA_TYPE value1 = dataBuffer[i];
+                    DATA_TYPE value2 = dataBuffer[ixj];
+                    bool ascending = ((i&k) == 0); // initial direction comes from bit k of i
+                    for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
+                        ascending = ((i&mask) == 0 ? !ascending : ascending); // flipped once for each higher power of two whose bit is clear in i
+                    KEY_TYPE lowKey  = (ascending ? getValue(value1) : getValue(value2));
+                    KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                    if (lowKey > highKey) { // the pair is out of order for this direction, so exchange the two elements
+                        dataBuffer[i] = value2;
+                        dataBuffer[ixj] = value1;
+                    }
+                }
+            }
+            barrier(CLK_LOCAL_MEM_FENCE); // finish this step's exchanges before the next step (or the write-back) reads dataBuffer
+        }
+    }
+    // Write the data back to global memory.
+    for (int index = get_local_id(0); index < length; index += get_local_size(0))
+        data[index] = dataBuffer[index];
+}
 /**
 * Calculate the minimum and maximum value in the array to be sorted.  This kernel
 * is executed as a single work group.

--- a/platforms/opencl/src/kernels/torsionForce.cl
+++ b/platforms/opencl/src/kernels/torsionForce.cl
@@ -11,7 +11,7 @@ if (cosangle > 0.99f || cosangle < -0.99f) {
    real4 cross_prod = cross(cp0, cp1);
    real scale = dot(cp0, cp0)*dot(cp1, cp1);
-    theta = asin(sqrt(dot(cross_prod, cross_prod)/scale));
+    theta = asin(SQRT(dot(cross_prod, cross_prod)/scale));
    if (cosangle < 0)
        theta = PI-theta;
 }
@@ -21,7 +21,7 @@ theta = (dot(v0, cp1) >= 0 ? theta : -theta);
 COMPUTE_FORCE
 real normCross1 = dot(cp0, cp0);
 real normSqrBC = dot(v1, v1);
-real normBC = sqrt(normSqrBC);
+real normBC = SQRT(normSqrBC);
 real normCross2 = dot(cp1, cp1);
 real dp = 1.0f/normSqrBC;
 real4 ff = (real4) ((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2);

--- a/platforms/opencl/src/kernels/verlet.cl
+++ b/platforms/opencl/src/kernels/verlet.cl
@@ -98,8 +98,8 @@ __kernel void selectVerletStepSize(int numAtoms, mixed maxStepSize, mixed errorT
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (get_local_id(0) == 0) {
-        mixed totalError = sqrt(error[0]/(numAtoms*3));
+        mixed totalError = SQRT(error[0]/(numAtoms*3));
-        mixed newStepSize = sqrt(errorTol/totalError);
+        mixed newStepSize = SQRT(errorTol/totalError);
        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)
            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.

--- a/platforms/opencl/src/kernels/virtualSites.cl
+++ b/platforms/opencl/src/kernels/virtualSites.cl
@@ -26,9 +26,16 @@ void storePos(__global real4* restrict posq, __global real4* restrict posqCorrec
 /**
 * Compute the positions of virtual sites
 */
-__kernel void computeVirtualSites(__global real4* restrict posq, __global real4* restrict posqCorrection, __global const int4* restrict avg2Atoms,
+__kernel void computeVirtualSites(__global real4* restrict posq,
-        __global const real2* restrict avg2Weights, __global const int4* restrict avg3Atoms, __global const real4* restrict avg3Weights,
+#ifdef USE_MIXED_PRECISION
+        __global real4* restrict posqCorrection,
+#endif
+        __global const int4* restrict avg2Atoms, __global const real2* restrict avg2Weights,
+        __global const int4* restrict avg3Atoms, __global const real4* restrict avg3Weights,
        __global const int4* restrict outOfPlaneAtoms, __global const real4* restrict outOfPlaneWeights) {
+#ifndef USE_MIXED_PRECISION
+        __global real4* posqCorrection = 0;
+#endif
    // Two particle average sites.
@@ -74,11 +81,17 @@ __kernel void computeVirtualSites(__global real4* restrict posq, __global real4*
 /**
 * Distribute forces from virtual sites to the atoms they are based on.
 */
-__kernel void distributeForces(__global const real4* restrict posq, __global real4* restrict posqCorrection, __global real4* restrict force,
+__kernel void distributeForces(__global const real4* restrict posq, __global real4* restrict force,
+#ifdef USE_MIXED_PRECISION
+        __global real4* restrict posqCorrection,
+#endif
        __global const int4* restrict avg2Atoms, __global const real2* restrict avg2Weights,
        __global const int4* restrict avg3Atoms, __global const real4* restrict avg3Weights,
        __global const int4* restrict outOfPlaneAtoms, __global const real4* restrict outOfPlaneWeights) {
+#ifndef USE_MIXED_PRECISION
+        __global real4* posqCorrection = 0;
+#endif
    // Two particle average sites.
    for (int index = get_global_id(0); index < NUM_2_AVERAGE; index += get_global_size(0)) {

--- a/platforms/opencl/tests/TestOpenCLFFT.cpp
+++ b/platforms/opencl/tests/TestOpenCLFFT.cpp
@@ -59,7 +59,7 @@ void testTransform() {
    context.initialize();
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
-    int xsize = 32, ysize = 25, zsize = 30;
+    int xsize = 28, ysize = 25, zsize = 30;
    vector<Real2> original(xsize*ysize*zsize);
    vector<t_complex> reference(original.size());
    for (int i = 0; i < (int) original.size(); i++) {
@@ -81,8 +81,8 @@ void testTransform() {
    fftpack_init_3d(&plan, xsize, ysize, zsize);
    fftpack_exec_3d(plan, FFTPACK_FORWARD, &reference[0], &reference[0]);
    for (int i = 0; i < (int) result.size(); ++i) {
-        ASSERT_EQUAL_TOL(reference[i].re, result[i].x, 1e-4);
+        ASSERT_EQUAL_TOL(reference[i].re, result[i].x, 1e-3);
-        ASSERT_EQUAL_TOL(reference[i].im, result[i].y, 1e-4);
+        ASSERT_EQUAL_TOL(reference[i].im, result[i].y, 1e-3);
    }
    fftpack_destroy(plan);

--- a/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
@@ -438,7 +438,7 @@ void testLargeSystem() {
    }
    ASSERT_EQUAL_TOL(clState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
 }
+/*
 void testBlockInteractions(bool periodic) {
    const int blockSize = 32;
    const int numBlocks = 100;
@@ -619,13 +619,13 @@ void testBlockInteractions(bool periodic) {
            }
        }
 }
+*/
 void testDispersionCorrection() {
    // Create a box full of identical particles.
    int gridSize = 5;
    int numParticles = gridSize*gridSize*gridSize;
-    double boxSize = gridSize*0.5;
+    double boxSize = gridSize*0.7;
    double cutoff = boxSize/3;
    System system;
    VerletIntegrator integrator(0.01);
@@ -827,8 +827,8 @@ int main(int argc, char* argv[]) {
        testCutoff14();
        testPeriodic();
        testLargeSystem();
-        testBlockInteractions(false);
+//        testBlockInteractions(false);
-        testBlockInteractions(true);
+//        testBlockInteractions(true);
        testDispersionCorrection();
        testChangingParameters();
        testParallelComputation(false);

--- a/platforms/opencl/tests/TestOpenCLSort.cpp
+++ b/platforms/opencl/tests/TestOpenCLSort.cpp
@@ -48,15 +48,15 @@ using namespace std;
 OpenCLPlatform platform;
-struct SortTrait {
+class SortTrait : public OpenCLSort::SortTrait {
-    typedef cl_float DataType;
+    int getDataSize() const {return 4;}
-    typedef cl_float KeyType;
+    int getKeySize() const {return 4;}
-    static const char* clDataType() {return "float";}
+    const char* getDataType() const {return "float";}
-    static const char* clKeyType() {return "float";}
+    const char* getKeyType() const {return "float";}
-    static const char* clMinKey() {return "-MAXFLOAT";}
+    const char* getMinKey() const {return "-MAXFLOAT";}
-    static const char* clMaxKey() {return "MAXFLOAT";}
+    const char* getMaxKey() const {return "MAXFLOAT";}
-    static const char* clMaxValue() {return "MAXFLOAT";}
+    const char* getMaxValue() const {return "MAXFLOAT";}
-    static const char* clSortKey() {return "value";}
+    const char* getSortKey() const {return "value";}
 };
 void verifySorting(vector<float> array) {
@@ -69,7 +69,7 @@ void verifySorting(vector<float> array) {
    context.initialize();
    OpenCLArray data(context, array.size(), sizeof(float), "sortData");
    data.upload(array);
-    OpenCLSort<SortTrait> sort(context, array.size());
+    OpenCLSort sort(context, new SortTrait(), array.size());
    sort.sort(data);
    vector<float> sorted;
    data.download(sorted);
@@ -86,8 +86,7 @@ void verifySorting(vector<float> array) {
    ASSERT(elements1 == elements2);
 }
-void testUniformValues()
+void testUniformValues() {
-{
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
@@ -97,8 +96,7 @@ void testUniformValues()
    verifySorting(array);
 }
-void testLogValues()
+void testLogValues() {
-{
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
@@ -108,12 +106,23 @@ void testLogValues()
    verifySorting(array);
 }
+void testShortList() { // Builds a 500-element array and passes it to verifySorting().
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt); // fixed first argument 0 (presumably the seed), as in the other tests in this file
+    vector<float> array(500); // NOTE(review): 500 is presumably small enough to reach the sortShortList kernel -- confirm against OpenCLSort
+    for (int i = 0; i < (int) array.size(); i++)
+        array[i] = (float) log(genrand_real2(sfmt)); // log of each generated value, narrowed to float
+    verifySorting(array);
+}
 int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
            platform.setPropertyDefaultValue("OpenCLPrecision", string(argv[1]));
        testUniformValues();
        testLogValues();
+        testShortList();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/reference/tests/TestReferenceNonbondedForce.cpp
+++ b/platforms/reference/tests/TestReferenceNonbondedForce.cpp
@@ -356,7 +356,7 @@ void testDispersionCorrection() {
    int gridSize = 5;
    int numParticles = gridSize*gridSize*gridSize;
-    double boxSize = gridSize*0.5;
+    double boxSize = gridSize*0.7;
    double cutoff = boxSize/3;
    ReferencePlatform platform;
    System system;

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -787,8 +787,8 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri
        multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL),
        field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL),
        inducedDipole(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
-        pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeTheta1(NULL), pmeTheta2(NULL), pmeTheta3(NULL),
+        pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL),
-        pmeIgrid(NULL), pmePhi(NULL), pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), gkKernel(NULL) {
+        pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeAtomGridIndex(NULL), sort(NULL), gkKernel(NULL) {
 }
 CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
@@ -835,12 +835,6 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
        delete pmeBsplineModuliY;
    if (pmeBsplineModuliZ != NULL)
        delete pmeBsplineModuliZ;
-    if (pmeTheta1 != NULL)
-        delete pmeTheta1;
-    if (pmeTheta2 != NULL)
-        delete pmeTheta2;
-    if (pmeTheta3 != NULL)
-        delete pmeTheta3;
    if (pmeIgrid != NULL)
        delete pmeIgrid;
    if (pmePhi != NULL)
@@ -851,8 +845,6 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
        delete pmePhip;
    if (pmePhidp != NULL)
        delete pmePhidp;
-    if (pmeAtomRange != NULL)
-        delete pmeAtomRange;
    if (pmeAtomGridIndex != NULL)
        delete pmeAtomGridIndex;
    if (sort != NULL)
@@ -987,6 +979,15 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
            if (find(atoms12.begin(), atoms12.end(), atoms[j]) == atoms12.end())
                polarizationFlagValues.push_back(make_int2(i, atoms[j]));
    }
+    set<pair<int, int> > tilesWithExclusions;
+    for (int atom1 = 0; atom1 < (int) exclusions.size(); ++atom1) {
+        int x = atom1/CudaContext::TileSize;
+        for (int j = 0; j < (int) exclusions[atom1].size(); ++j) {
+            int atom2 = exclusions[atom1][j];
+            int y = atom2/CudaContext::TileSize;
+            tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
+        }
+    }
    // Record other options.
@@ -1024,6 +1025,14 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        defines["DIRECT_POLARIZATION"] = "";
    if (useShuffle)
        defines["USE_SHUFFLE"] = "";
+    defines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
+    int numExclusionTiles = tilesWithExclusions.size();
+    defines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
+    int numContexts = cu.getPlatformData().contexts.size();
+    int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
+    int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
+    defines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
+    defines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
    double alpha = force.getAEwald();
    int gridSizeX, gridSizeY, gridSizeZ;
    if (usePME) {
@@ -1128,17 +1137,20 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        if (force.getPolarizationType() == AmoebaMultipoleForce::Direct)
            pmeDefines["DIRECT_POLARIZATION"] = "";
        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines);
-        pmeUpdateBsplinesKernel = cu.getKernel(module, "updateBsplines");
+        pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
-        pmeAtomRangeKernel = cu.getKernel(module, "findAtomRangeForGrid");
-        pmeZIndexKernel = cu.getKernel(module, "recordZIndex");
        pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles");
        pmeSpreadInducedDipolesKernel = cu.getKernel(module, "gridSpreadInducedDipoles");
+        pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
        pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
        pmeFixedPotentialKernel = cu.getKernel(module, "computeFixedPotentialFromGrid");
        pmeInducedPotentialKernel = cu.getKernel(module, "computeInducedPotentialFromGrid");
        pmeFixedForceKernel = cu.getKernel(module, "computeFixedMultipoleForceAndEnergy");
        pmeInducedForceKernel = cu.getKernel(module, "computeInducedDipoleForceAndEnergy");
        pmeRecordInducedFieldDipolesKernel = cu.getKernel(module, "recordInducedFieldDipoles");
+        cuFuncSetCacheConfig(pmeSpreadFixedMultipolesKernel, CU_FUNC_CACHE_PREFER_L1);
+        cuFuncSetCacheConfig(pmeSpreadInducedDipolesKernel, CU_FUNC_CACHE_PREFER_L1);
+        cuFuncSetCacheConfig(pmeFixedPotentialKernel, CU_FUNC_CACHE_PREFER_L1);
+        cuFuncSetCacheConfig(pmeInducedPotentialKernel, CU_FUNC_CACHE_PREFER_L1);
        // Create required data structures.
@@ -1148,9 +1160,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
        pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
        pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
-        pmeTheta1 = new CudaArray(cu, PmeOrder*numMultipoles, 4*elementSize, "pmeTheta1");
-        pmeTheta2 = new CudaArray(cu, PmeOrder*numMultipoles, 4*elementSize, "pmeTheta2");
-        pmeTheta3 = new CudaArray(cu, PmeOrder*numMultipoles, 4*elementSize, "pmeTheta3");
        pmeIgrid = CudaArray::create<int4>(cu, numMultipoles, "pmeIgrid");
        pmePhi = new CudaArray(cu, 20*numMultipoles, elementSize, "pmePhi");
        pmePhid = new CudaArray(cu, 10*numMultipoles, elementSize, "pmePhid");
@@ -1264,6 +1273,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    // just so that CudaNonbondedUtilities will build the exclusion flags and maintain the neighbor list.
    cu.getNonbondedUtilities().addInteraction(usePME, usePME, true, force.getCutoffDistance(), exclusions, "", force.getForceGroup());
+    cu.getNonbondedUtilities().setUsePadding(false);
    cu.addForce(new ForceInfo(force));
 }
@@ -1272,11 +1282,14 @@ void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() {
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
    // Figure out the covalent flag values to use for each atom pair.
-    vector<unsigned int> exclusionIndices;
+    vector<ushort2> exclusionTiles;
-    vector<unsigned int> exclusionRowIndices;
+    nb.getExclusionTiles().download(exclusionTiles);
-    nb.getExclusionIndices().download(exclusionIndices);
+    map<pair<int, int>, int> exclusionTileMap;
-    nb.getExclusionRowIndices().download(exclusionRowIndices);
+    for (int i = 0; i < (int) exclusionTiles.size(); i++) {
+        ushort2 tile = exclusionTiles[i];
+        exclusionTileMap[make_pair(tile.x, tile.y)] = i;
+    }
    covalentFlags = CudaArray::create<uint2>(cu, nb.getExclusions().getSize(), "covalentFlags");
    vector<uint2> covalentFlagsVec(nb.getExclusions().getSize(), make_uint2(0, 0));
    for (int i = 0; i < (int) covalentFlagValues.size(); i++) {
@@ -1290,19 +1303,19 @@ void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() {
        int f1 = (value == 0 || value == 1 ? 1 : 0);
        int f2 = (value == 0 || value == 2 ? 1 : 0);
        if (x == y) {
-            int index = CudaNonbondedUtilities::findExclusionIndex(x, y, exclusionIndices, exclusionRowIndices);
+            int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
            covalentFlagsVec[index+offset1].x |= f1<<offset2;
            covalentFlagsVec[index+offset1].y |= f2<<offset2;
            covalentFlagsVec[index+offset2].x |= f1<<offset1;
            covalentFlagsVec[index+offset2].y |= f2<<offset1;
        }
        else if (x > y) {
-            int index = CudaNonbondedUtilities::findExclusionIndex(x, y, exclusionIndices, exclusionRowIndices);
+            int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
            covalentFlagsVec[index+offset1].x |= f1<<offset2;
            covalentFlagsVec[index+offset1].y |= f2<<offset2;
        }
        else {
-            int index = CudaNonbondedUtilities::findExclusionIndex(y, x, exclusionIndices, exclusionRowIndices);
+            int index = exclusionTileMap[make_pair(y, x)]*CudaContext::TileSize;
            covalentFlagsVec[index+offset2].x |= f1<<offset1;
            covalentFlagsVec[index+offset2].y |= f2<<offset1;
        }
@@ -1321,16 +1334,16 @@ void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() {
        int y = atom2/CudaContext::TileSize;
        int offset2 = atom2-y*CudaContext::TileSize;
        if (x == y) {
-            int index = CudaNonbondedUtilities::findExclusionIndex(x, y, exclusionIndices, exclusionRowIndices);
+            int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
            polarizationGroupFlagsVec[index+offset1] |= 1<<offset2;
            polarizationGroupFlagsVec[index+offset2] |= 1<<offset1;
        }
        else if (x > y) {
-            int index = CudaNonbondedUtilities::findExclusionIndex(x, y, exclusionIndices, exclusionRowIndices);
+            int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
            polarizationGroupFlagsVec[index+offset1] |= 1<<offset2;
        }
        else {
-            int index = CudaNonbondedUtilities::findExclusionIndex(y, x, exclusionIndices, exclusionRowIndices);
+            int index = exclusionTileMap[make_pair(y, x)]*CudaContext::TileSize;
            polarizationGroupFlagsVec[index+offset2] |= 1<<offset1;
        }
    }
@@ -1364,8 +1377,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        if (gkKernel == NULL) {
            void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
-                &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
+                &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
-                &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
                &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
            cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
            void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
@@ -1375,8 +1387,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        else {
            gkKernel->computeBornRadii();
            void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
-                &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
+                &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
-                &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
                &gkKernel->getBornRadii()->getDevicePointer(), &gkKernel->getField()->getDevicePointer(),
                &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
            cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
@@ -1395,7 +1406,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            cu.clearBuffer(*inducedFieldPolar);
            if (gkKernel == NULL) {
                void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
-                    &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
+                    &nb.getExclusionTiles().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
                    &dampingAndThole->getDevicePointer()};
                cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
            }
@@ -1403,7 +1414,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
                cu.clearBuffer(*gkKernel->getInducedField());
                cu.clearBuffer(*gkKernel->getInducedFieldPolar());
                void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
-                    &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
+                    &nb.getExclusionTiles().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
                    &gkKernel->getInducedField()->getDevicePointer(), &gkKernel->getInducedFieldPolar()->getDevicePointer(),
                    &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
                    &gkKernel->getBornRadii()->getDevicePointer(), &dampingAndThole->getDevicePointer()};
@@ -1431,8 +1442,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        // Compute electrostatic force.
        void* electrostaticsArgs[] = {&cu.getForce().getDevicePointer(), &torque->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
-            &cu.getPosq().getDevicePointer(), &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
+            &cu.getPosq().getDevicePointer(), &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(),
-            &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
+            &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
            &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
        cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
@@ -1443,20 +1454,16 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        // Reciprocal space calculation.
        unsigned int maxTiles = nb.getInteractingTiles().getSize();
-        void* pmeUpdateBsplinesArgs[] = {&cu.getPosq().getDevicePointer(), &pmeIgrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
+        void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
-            &pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(), &pmeTheta3->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+            cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
-            cu.getInvPeriodicBoxSizePointer()};
+        cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms(), cu.ThreadBlockSize, cu.ThreadBlockSize*PmeOrder*PmeOrder*elementSize);
-        cu.executeKernel(pmeUpdateBsplinesKernel, pmeUpdateBsplinesArgs, cu.getNumAtoms(), cu.ThreadBlockSize, cu.ThreadBlockSize*PmeOrder*PmeOrder*elementSize);
        sort->sort(*pmeAtomGridIndex);
-        void* pmeAtomRangeArgs[] = {&pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(),
-            &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
-        cu.executeKernel(pmeAtomRangeKernel, pmeAtomRangeArgs, cu.getNumAtoms());
-        void* pmeZIndexArgs[] = {&pmeAtomGridIndex->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
-        cu.executeKernel(pmeZIndexKernel, pmeZIndexArgs, cu.getNumAtoms());
        void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
-            &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(),
+            &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),  cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
-            &pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(), &pmeTheta3->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
        cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms());
+        void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()};
+        if (cu.getUseDoublePrecision())
+            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize());
        if (cu.getUseDoublePrecision())
            cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
        else
@@ -1469,8 +1476,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        else
            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
        void* pmeFixedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhi->getDevicePointer(), &field->getDevicePointer(),
-            &fieldPolar ->getDevicePointer(), &pmeIgrid->getDevicePointer(), &pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(),
+            &fieldPolar ->getDevicePointer(), &cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(),
-            &pmeTheta3->getDevicePointer(), &labFrameDipoles->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
+            cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
        cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms());
        void* pmeFixedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(),
            &cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
@@ -1480,10 +1487,9 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        // Direct space calculation.
        void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
-            &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
+            &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
-            &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
            &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(),
-            cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(),
+            cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
        cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
        void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
@@ -1492,10 +1498,12 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        // Reciprocal space calculation for the induced dipoles.
+        cu.clearBuffer(*pmeGrid);
        void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
-            &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(),
+            &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
-            &pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(), &pmeTheta3->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
        cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
+        if (cu.getUseDoublePrecision())
+            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize());
        if (cu.getUseDoublePrecision())
            cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
        else
@@ -1506,8 +1514,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        else
            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
        void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
-            &pmePhidp->getDevicePointer(), &pmeIgrid->getDevicePointer(), &pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(),
+            &pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(),
-            &pmeTheta3->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
+            &pmeAtomGridIndex->getDevicePointer()};
        cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
        // Iterate until the dipoles converge.
@@ -1517,12 +1525,15 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            cu.clearBuffer(*inducedField);
            cu.clearBuffer(*inducedFieldPolar);
            void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
-                &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
+                &nb.getExclusionTiles().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
                &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(),
-                cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(),
+                cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
                &dampingAndThole->getDevicePointer()};
            cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
+            cu.clearBuffer(*pmeGrid);
            cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
+            if (cu.getUseDoublePrecision())
+                cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize());
            if (cu.getUseDoublePrecision())
                cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
            else
@@ -1553,10 +1564,10 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        // Compute electrostatic force.
        void* electrostaticsArgs[] = {&cu.getForce().getDevicePointer(), &torque->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
-            &cu.getPosq().getDevicePointer(), &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
+            &cu.getPosq().getDevicePointer(), &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(),
-            &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
+            &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
            &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(),
-            cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(),
+            cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
            &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
        cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
@@ -1811,7 +1822,7 @@ private:
 };
 // Constructor. Forwards name and platform to the CalcAmoebaGeneralizedKirkwoodForceKernel base,
 // keeps references to the CudaContext and System, and starts every pointer member named in the
 // initializer list at NULL. hasInitializedKernels starts out false; computeBornRadii() tests that
 // flag and builds the CUDA module and kernels the first time it is called.
 CudaCalcAmoebaGeneralizedKirkwoodForceKernel::CudaCalcAmoebaGeneralizedKirkwoodForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : 
-           CalcAmoebaGeneralizedKirkwoodForceKernel(name, platform), cu(cu), system(system), params(NULL), bornRadii(NULL), field(NULL),
+           CalcAmoebaGeneralizedKirkwoodForceKernel(name, platform), cu(cu), system(system), hasInitializedKernels(false), params(NULL), bornRadii(NULL), field(NULL),
            inducedField(NULL), inducedFieldPolar(NULL), inducedDipoleS(NULL), inducedDipolePolarS(NULL), bornSum(NULL), bornForce(NULL) {
 }
@@ -1892,9 +1903,8 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& syst
    chainRuleThreads = min(maxThreads, cu.computeThreadBlockSize(chainRuleThreadMemory));
    ediffThreads = min(maxThreads, cu.computeThreadBlockSize(ediffThreadMemory));
-    // Create the kernels.
+    // Set preprocessor macros we will use when we create the kernels.
-    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = cu.intToString(paddedNumAtoms);
    defines["BORN_SUM_THREAD_BLOCK_SIZE"] = cu.intToString(computeBornSumThreads);
@@ -1918,42 +1928,6 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& syst
        defines["PROBE_RADIUS"] = cu.doubleToString(force.getProbeRadius());
        defines["DIELECTRIC_OFFSET"] = cu.doubleToString(0.009);
    }
-    stringstream forceSource;
-    forceSource << CudaKernelSources::vectorOps;
-    forceSource << CudaAmoebaKernelSources::amoebaGk;
-    forceSource << "#define F1\n";
-    forceSource << CudaAmoebaKernelSources::gkPairForce1;
-    forceSource << CudaAmoebaKernelSources::gkPairForce2;
-    forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
-    forceSource << "#undef F1\n";
-    forceSource << "#define F2\n";
-    forceSource << CudaAmoebaKernelSources::gkPairForce1;
-    forceSource << CudaAmoebaKernelSources::gkPairForce2;
-    forceSource << "#undef F2\n";
-    forceSource << "#define T1\n";
-    forceSource << CudaAmoebaKernelSources::gkPairForce1;
-    forceSource << CudaAmoebaKernelSources::gkPairForce2;
-    forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
-    forceSource << "#undef T1\n";
-    forceSource << "#define T2\n";
-    forceSource << CudaAmoebaKernelSources::gkPairForce1;
-    forceSource << CudaAmoebaKernelSources::gkPairForce2;
-    forceSource << "#undef T2\n";
-    forceSource << "#define T3\n";
-    forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
-    forceSource << "#undef T3\n";
-    forceSource << "#define B1\n";
-    forceSource << "#define B2\n";
-    forceSource << CudaAmoebaKernelSources::gkPairForce1;
-    forceSource << CudaAmoebaKernelSources::gkPairForce2;
-    CUmodule module = cu.createModule(forceSource.str(), defines);
-    computeBornSumKernel = cu.getKernel(module, "computeBornSum");
-    reduceBornSumKernel = cu.getKernel(module, "reduceBornSum");
-    gkForceKernel = cu.getKernel(module, "computeGKForces");
-    chainRuleKernel = cu.getKernel(module, "computeChainRuleForce");
-    ediffKernel = cu.getKernel(module, "computeEDiffForce");
-    if (includeSurfaceArea)
-        surfaceAreaKernel = cu.getKernel(module, "computeSurfaceAreaForce");
    cu.addForce(new ForceInfo(force));
 }
@@ -1964,6 +1938,55 @@ double CudaCalcAmoebaGeneralizedKirkwoodForceKernel::execute(ContextImpl& contex
 }
 void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::computeBornRadii() {
+    if (!hasInitializedKernels) {
+        hasInitializedKernels = true;
+        // Create the kernels.
+        int numExclusionTiles = cu.getNonbondedUtilities().getExclusionTiles().getSize();
+        defines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
+        int numContexts = cu.getPlatformData().contexts.size();
+        int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
+        int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
+        defines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
+        defines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
+        stringstream forceSource;
+        forceSource << CudaKernelSources::vectorOps;
+        forceSource << CudaAmoebaKernelSources::amoebaGk;
+        forceSource << "#define F1\n";
+        forceSource << CudaAmoebaKernelSources::gkPairForce1;
+        forceSource << CudaAmoebaKernelSources::gkPairForce2;
+        forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
+        forceSource << "#undef F1\n";
+        forceSource << "#define F2\n";
+        forceSource << CudaAmoebaKernelSources::gkPairForce1;
+        forceSource << CudaAmoebaKernelSources::gkPairForce2;
+        forceSource << "#undef F2\n";
+        forceSource << "#define T1\n";
+        forceSource << CudaAmoebaKernelSources::gkPairForce1;
+        forceSource << CudaAmoebaKernelSources::gkPairForce2;
+        forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
+        forceSource << "#undef T1\n";
+        forceSource << "#define T2\n";
+        forceSource << CudaAmoebaKernelSources::gkPairForce1;
+        forceSource << CudaAmoebaKernelSources::gkPairForce2;
+        forceSource << "#undef T2\n";
+        forceSource << "#define T3\n";
+        forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
+        forceSource << "#undef T3\n";
+        forceSource << "#define B1\n";
+        forceSource << "#define B2\n";
+        forceSource << CudaAmoebaKernelSources::gkPairForce1;
+        forceSource << CudaAmoebaKernelSources::gkPairForce2;
+        CUmodule module = cu.createModule(forceSource.str(), defines);
+        computeBornSumKernel = cu.getKernel(module, "computeBornSum");
+        reduceBornSumKernel = cu.getKernel(module, "reduceBornSum");
+        gkForceKernel = cu.getKernel(module, "computeGKForces");
+        chainRuleKernel = cu.getKernel(module, "computeChainRuleForce");
+        ediffKernel = cu.getKernel(module, "computeEDiffForce");
+        if (includeSurfaceArea)
+            surfaceAreaKernel = cu.getKernel(module, "computeSurfaceAreaForce");
+    }
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
    int numTiles = nb.getNumTiles();
    int numForceThreadBlocks = nb.getNumForceThreadBlocks();
@@ -2002,8 +2025,8 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::finishComputation(CudaArray&
        &params->getDevicePointer(), &bornRadii->getDevicePointer(), &bornForce->getDevicePointer()};
    cu.executeKernel(chainRuleKernel, chainRuleArgs, numForceThreadBlocks*chainRuleThreads, chainRuleThreads);    
    void* ediffArgs[] = {&cu.getForce().getDevicePointer(), &torque.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
-        &cu.getPosq().getDevicePointer(), &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
+        &cu.getPosq().getDevicePointer(), &covalentFlags.getDevicePointer(), &polarizationGroupFlags.getDevicePointer(),
-        &covalentFlags.getDevicePointer(), &polarizationGroupFlags.getDevicePointer(), &startTileIndex, &numTileIndices,
+        &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
        &labFrameDipoles.getDevicePointer(), &labFrameQuadrupoles.getDevicePointer(), &inducedDipole.getDevicePointer(),
        &inducedDipolePolar.getDevicePointer(), &inducedDipoleS->getDevicePointer(), &inducedDipolePolarS->getDevicePointer(),
        &dampingAndThole.getDevicePointer()};

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -398,9 +398,6 @@ private:
    CudaArray* pmeBsplineModuliX;
    CudaArray* pmeBsplineModuliY;
    CudaArray* pmeBsplineModuliZ;
-    CudaArray* pmeTheta1;
-    CudaArray* pmeTheta2;
-    CudaArray* pmeTheta3;
    CudaArray* pmeIgrid;
    CudaArray* pmePhi;
    CudaArray* pmePhid;
@@ -411,8 +408,8 @@ private:
    CudaSort* sort;
    cufftHandle fft;
    CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel;
-    CUfunction pmeUpdateBsplinesKernel, pmeAtomRangeKernel, pmeZIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeConvolutionKernel, pmeFixedPotentialKernel, pmeInducedPotentialKernel;
+    CUfunction pmeGridIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
-    CUfunction pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
+    CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
    CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel;
    static const int PmeOrder = 5;
 };
@@ -477,8 +474,9 @@ private:
    class ForceInfo;
    CudaContext& cu;
    System& system;
-    bool includeSurfaceArea;
+    bool includeSurfaceArea, hasInitializedKernels;
    int computeBornSumThreads, gkForceThreads, chainRuleThreads, ediffThreads;
+    std::map<std::string, std::string> defines;
    CudaArray* params;
    CudaArray* bornSum;
    CudaArray* bornRadii;