Merged 5.1Optimizations branch back to trunk

93c467b2 · Peter Eastman · f6d4557d · 93c467b2 · 93c467b2 · 93c467b2
Commit 93c467b2 authored Mar 22, 2013 by Peter Eastman
6 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
@@ -606,181 +606,255 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
 */
 extern "C" __global__ void computeEDiffForce( // Kernel: accumulates energy, forces and torques from the computeOneEDiffInteraction* helpers, first over the tiles listed in exclusionTiles, then over every remaining tile.
        unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer,
-        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
+        const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
-        const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices,
+        const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices, // exclusionTiles holds the (x, y) block coordinates of each tile processed by the first loop
        const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
        const real* __restrict__ inducedDipolePolar, const real* __restrict__ inducedDipoleS, const real* __restrict__ inducedDipolePolarS,
        const float2* __restrict__ dampingAndThole) {
-    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; // number of TILE_SIZE-wide thread groups in the whole grid
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // grid-wide index of this thread's group
-    const unsigned int numTiles = numTileIndices;
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // this thread's position inside its group; the mask assumes TILE_SIZE is a power of two -- TODO confirm
-    unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
+    const unsigned int tbx = threadIdx.x - tgx; // block-local index of the first thread of this group (base of the group's slice of the shared arrays)
-    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
    real energy = 0; // per-thread energy accumulator, added to energyBuffer at the end of the kernel
    __shared__ AtomData4 localData[EDIFF_THREAD_BLOCK_SIZE]; // one slot per thread, indexed by threadIdx.x; lanes read each other's slots via tbx+lane
-    __shared__ unsigned int exclusionRange[2*(EDIFF_THREAD_BLOCK_SIZE/TILE_SIZE)];
-    __shared__ int exclusionIndex[EDIFF_THREAD_BLOCK_SIZE/TILE_SIZE];
+    // First loop: process tiles that contain exclusions.
-    do {
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; // the range [FIRST_EXCLUSION_TILE, LAST_EXCLUSION_TILE) is divided evenly among the groups
-        // Extract the coordinates of this tile
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const unsigned int tbx = threadIdx.x - tgx;
+        const ushort2 tileIndices = exclusionTiles[pos];
-        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
+        const unsigned int x = tileIndices.x; // block whose atoms are held per-thread in 'data'
-        unsigned int x, y;
+        const unsigned int y = tileIndices.y; // block whose atoms are held in shared localData
        AtomData4 data;
-        if (pos < end) {
+        data.force = make_real3(0);
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+        unsigned int atom1 = x*TILE_SIZE + tgx; // the atom of block x owned by this thread
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+        loadAtomData4(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+        uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx]; // flag arrays are indexed by exclusion-tile position and lane, not by atom index
-                y += (x < y ? -1 : 1);
+        unsigned int polarizationGroup = polarizationGroupFlags[pos*TILE_SIZE+tgx];
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+        if (x == y) {
+            // This tile is on the diagonal.
+            localData[threadIdx.x].pos = data.pos; // copy this thread's atom into its shared slot so the other lanes can read it; NOTE(review): no barrier separates these writes from the reads in the loops below -- presumably relies on the group executing in lockstep, confirm for the target architecture
+            localData[threadIdx.x].q = data.q;
+            localData[threadIdx.x].dipole = data.dipole;
+            localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
+            localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
+            localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
+            localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
+            localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
+            localData[threadIdx.x].quadrupoleZZ = data.quadrupoleZZ;
+            localData[threadIdx.x].inducedDipole = data.inducedDipole;
+            localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
+            localData[threadIdx.x].inducedDipoleS = data.inducedDipoleS;
+            localData[threadIdx.x].inducedDipolePolarS = data.inducedDipolePolarS;
+            localData[threadIdx.x].thole = data.thole;
+            localData[threadIdx.x].damp = data.damp;
+            // Compute forces.
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                int atom2 = y*TILE_SIZE+j;
+                if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { // skip the self pair and any index at or beyond NUM_ATOMS
+                    real3 tempForce;
+                    real tempEnergy;
+                    float d = computeDScaleFactor(polarizationGroup, j);
+                    float p = computePScaleFactor(covalent, polarizationGroup, j);
+                    computeOneEDiffInteractionF1(data, localData[tbx+j], d, p, tempEnergy, tempForce);
+                    energy += 0.25f*tempEnergy; // every pair of a diagonal tile is visited from both of its atoms, so each visit gets half of the 0.5f weight used for off-diagonal tiles
+                    data.force += tempForce;
+                }
            }
-            unsigned int atom1 = x*TILE_SIZE + tgx;
+            data.force *= ENERGY_SCALE_FACTOR;
-            loadAtomData4(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
+            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000))); // value is scaled by 2^32 and accumulated as a 64-bit integer; x, y, z components live PADDED_NUM_ATOMS entries apart
-            data.force = make_real3(0);
+            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-            // Locate the exclusion data for this tile.
-            if (tgx < 2)
-                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-            if (tgx == 0)
-                exclusionIndex[localGroupIndex] = -1;
-            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-                if (exclusionIndices[i] == y)
-                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
-            if (pos >= end)
-                ; // This warp is done.
-            else if (x == y) {
-                // This tile is on the diagonal.
-                localData[threadIdx.x].pos = data.pos;
+            // Compute torques.
-                localData[threadIdx.x].q = data.q;
-                localData[threadIdx.x].dipole = data.dipole;
+            data.force = make_real3(0); // the force field of 'data' is reused as the torque accumulator from here on
-                localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
+                int atom2 = y*TILE_SIZE+j;
-                localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
+                if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
+                    real3 tempTorque;
-                localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
+                    float d = computeDScaleFactor(polarizationGroup, j);
-                localData[threadIdx.x].quadrupoleZZ = data.quadrupoleZZ;
+                    float p = computePScaleFactor(covalent, polarizationGroup, j);
-                localData[threadIdx.x].inducedDipole = data.inducedDipole;
+                    computeOneEDiffInteractionT1(data, localData[tbx+j], d, p, tempTorque);
-                localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
+                    data.force += tempTorque;
-                localData[threadIdx.x].inducedDipoleS = data.inducedDipoleS;
-                localData[threadIdx.x].inducedDipolePolarS = data.inducedDipolePolarS;
-                localData[threadIdx.x].thole = data.thole;
-                localData[threadIdx.x].damp = data.damp;
-                uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
-                unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
-                // Compute forces.
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    int atom2 = y*TILE_SIZE+j;
-                    if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        real3 tempForce;
-                        real tempEnergy;
-                        float d = computeDScaleFactor(polarizationGroup, j);
-                        float p = computePScaleFactor(covalent, polarizationGroup, j);
-                        computeOneEDiffInteractionF1(data, localData[tbx+j], d, p, tempEnergy, tempForce);
-                        energy += 0.25f*tempEnergy;
-                        data.force += tempForce;
-                    }
                }
-                data.force *= ENERGY_SCALE_FACTOR;
+            }
-                atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
+            data.force *= ENERGY_SCALE_FACTOR;
-                atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000))); // same 2^32 fixed-point encoding and component layout as the force buffer
-                atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+        }
+        else {
+            // This is an off-diagonal tile.
+            unsigned int j = y*TILE_SIZE + tgx; // here j is the atom of block y that this thread loads; it is reused as a plain loop counter below
+            loadAtomData4(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
+            localData[threadIdx.x].force = make_real3(0);
+            // Compute forces.
+            unsigned int tj = tgx; // each lane starts at its own slot and rotates, so within one iteration every lane addresses a different shared slot
+            for (j = 0; j < TILE_SIZE; j++) {
+                int atom2 = y*TILE_SIZE+tj;
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    real3 tempForce;
+                    real tempEnergy;
+                    float d = computeDScaleFactor(polarizationGroup, tj);
+                    float p = computePScaleFactor(covalent, polarizationGroup, tj);
+                    computeOneEDiffInteractionF1(data, localData[tbx+tj], d, p, tempEnergy, tempForce);
+                    energy += 0.5f*tempEnergy;
+                    data.force += tempForce;
+                    localData[tbx+tj].force -= tempForce; // opposite force accumulated on the block-y atom; NOTE(review): unsynchronized read-modify-write of another lane's slot -- presumably safe only under lockstep execution, confirm
+                }
+                tj = (tj + 1) & (TILE_SIZE - 1); // advance with wrap-around
+            }
+            data.force *= ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
+            unsigned int offset = x*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            offset = y*TILE_SIZE + tgx; // now write out the force collected in shared memory for this lane's block-y atom
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+            // Compute torques.
+            data.force = make_real3(0);
-                // Compute torques.
+            localData[threadIdx.x].force = make_real3(0);
+            for (j = 0; j < TILE_SIZE; j++) { // tj is not reset: after TILE_SIZE wrapped increments in the force loop it is back at tgx
-                data.force = make_real3(0);
+                int atom2 = y*TILE_SIZE+tj;
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                    int atom2 = y*TILE_SIZE+j;
+                    real3 tempTorque;
-                    if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    float d = computeDScaleFactor(polarizationGroup, tj);
-                        real3 tempTorque;
+                    float p = computePScaleFactor(covalent, polarizationGroup, tj);
-                        float d = computeDScaleFactor(polarizationGroup, j);
+                    computeOneEDiffInteractionT1(data, localData[tbx+tj], d, p, tempTorque); // T1 result is accumulated on this thread's own atom
-                        float p = computePScaleFactor(covalent, polarizationGroup, j);
+                    data.force += tempTorque;
-                        computeOneEDiffInteractionT1(data, localData[tbx+j], d, p, tempTorque);
+                    computeOneEDiffInteractionT3(data, localData[tbx+tj], d, p, tempTorque); // T3 result is accumulated on the block-y atom
-                        data.force += tempTorque;
+                    localData[tbx+tj].force += tempTorque;
-                    }
                }
-                data.force *= ENERGY_SCALE_FACTOR;
+                tj = (tj + 1) & (TILE_SIZE - 1);
-                atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-                atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-                atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
            }
-            else {
+            data.force *= ENERGY_SCALE_FACTOR;
-                // This is an off-diagonal tile.
+            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
+            offset = x*TILE_SIZE + tgx;
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            offset = y*TILE_SIZE + tgx;
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+        }
+    }
-                unsigned int j = y*TILE_SIZE + tgx;
+    // Second loop: tiles without exclusions (by enumerating all of them, since there's no cutoff).
-                loadAtomData4(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
-                localData[threadIdx.x].force = make_real3(0);
-                uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
-                unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
-                // Compute forces.
+    const unsigned int numTiles = numTileIndices;
+    int pos = startTileIndex+warp*numTiles/totalWarps; // this group's share of the tile range [startTileIndex, startTileIndex+numTiles)
+    int end = startTileIndex+(warp+1)*numTiles/totalWarps;
+    int skipBase = 0; // index in exclusionTiles of the next batch to load into skipTiles
+    int currentSkipIndex = tbx; // cursor into this group's slice of skipTiles
+    __shared__ int skipTiles[EDIFF_THREAD_BLOCK_SIZE]; // per-group window of linearized exclusion-tile indices
+    skipTiles[threadIdx.x] = -1; // -1 is below any tile index, so the refill loop below runs on the first pass (assuming pos >= 0)
-                unsigned int tj = tgx;
+    while (pos < end) {
-                for (j = 0; j < TILE_SIZE; j++) {
+        // Extract the coordinates of this tile.
-                    int atom2 = y*TILE_SIZE+tj;
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        real3 tempForce;
-                        real tempEnergy;
-                        float d = computeDScaleFactor(polarizationGroup, tj);
-                        float p = computePScaleFactor(covalent, polarizationGroup, tj);
-                        computeOneEDiffInteractionF1(data, localData[tbx+tj], d, p, tempEnergy, tempForce);
-                        energy += 0.5f*tempEnergy;
-                        data.force += tempForce;
-                        localData[tbx+tj].force -= tempForce;
-                    }
-                    tj = (tj + 1) & (TILE_SIZE - 1);
-                }
-                data.force *= ENERGY_SCALE_FACTOR;
-                localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
-                if (pos < end) {
-                    unsigned int offset = x*TILE_SIZE + tgx;
-                    atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-                    atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-                    atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-                    offset = y*TILE_SIZE + tgx;
-                    atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-                    atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-                    atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
-                }
-                // Compute torques.
+        unsigned int x, y;
+        y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); // closed-form estimate of y for linear index pos, i.e. the inverse of pos = x + y*NUM_BLOCKS - y*(y+1)/2
+        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+            y += (x < y ? -1 : 1);
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+        }
-                data.force = make_real3(0);
+        // Skip over tiles that have exclusions, since they were already processed.
-                localData[threadIdx.x].force = make_real3(0);
-                for (j = 0; j < TILE_SIZE; j++) {
+        while (skipTiles[tbx+TILE_SIZE-1] < pos) { // the last cached entry is below pos, so load the next TILE_SIZE entries; NOTE(review): reads another lane's shared slot with no barrier -- presumably relies on lockstep execution, confirm
-                    int atom2 = y*TILE_SIZE+tj;
+            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                ushort2 tile = exclusionTiles[skipBase+tgx];
-                        real3 tempTorque;
+                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; // linearize (x, y) with the same formula that pos is decoded with above
-                        float d = computeDScaleFactor(polarizationGroup, tj);
+            }
-                        float p = computePScaleFactor(covalent, polarizationGroup, tj);
+            else
-                        computeOneEDiffInteractionT1(data, localData[tbx+tj], d, p, tempTorque);
+                skipTiles[threadIdx.x] = end; // past the end of the list: store a value no pos in this loop can reach or exceed, so both scans stop here
-                        data.force += tempTorque;
+            skipBase += TILE_SIZE;            
-                        computeOneEDiffInteractionT3(data, localData[tbx+tj], d, p, tempTorque);
+            currentSkipIndex = tbx; // restart the cursor at the beginning of the freshly loaded window
-                        localData[tbx+tj].force += tempTorque;
+        }
-                    }
+        while (skipTiles[currentSkipIndex] < pos) // advance to the first cached entry that is not below pos; assumes entries are in increasing order -- TODO confirm against how exclusionTiles is built
-                    tj = (tj + 1) & (TILE_SIZE - 1);
+            currentSkipIndex++;
+        bool includeTile = (skipTiles[currentSkipIndex] != pos); // process the tile only if it is not in the exclusion-tile list
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            AtomData4 data;
+            data.force = make_real3(0);
+            loadAtomData4(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
+            loadAtomData4(localData[threadIdx.x], atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole); // NOTE(review): this slot is loaded again two lines below with atom j before anything reads it -- this call looks redundant, confirm against loadAtomData4
+            unsigned int j = y*TILE_SIZE + tgx;
+            loadAtomData4(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
+            localData[threadIdx.x].force = make_real3(0);
+            // Compute forces.
+            unsigned int tj = tgx;
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                int atom2 = y*TILE_SIZE+tj;
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    real3 tempForce;
+                    real tempEnergy;
+                    computeOneEDiffInteractionF1(data, localData[tbx+tj], 1, 1, tempEnergy, tempForce); // tile has no exclusions, so both scale factors are passed as 1 instead of being derived from flags
+                    energy += 0.5f*tempEnergy;
+                    data.force += tempForce;
+                    localData[tbx+tj].force -= tempForce;
                }
-                data.force *= ENERGY_SCALE_FACTOR;
+                tj = (tj + 1) & (TILE_SIZE - 1);
-                localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
+            }
-                if (pos < end) {
+            data.force *= ENERGY_SCALE_FACTOR;
-                    unsigned int offset = x*TILE_SIZE + tgx;
+            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
-                    atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
+            unsigned int offset = x*TILE_SIZE + tgx;
-                    atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-                    atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-                    offset = y*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-                    atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
+            offset = y*TILE_SIZE + tgx;
-                    atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-                    atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+            // Compute torques.
+            data.force = make_real3(0);
+            localData[threadIdx.x].force = make_real3(0);
+            for (unsigned int j = 0; j < TILE_SIZE; j++) { // tj has wrapped back to tgx after the force loop, so it is reused without being reset
+                int atom2 = y*TILE_SIZE+tj;
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    real3 tempTorque;
+                    computeOneEDiffInteractionT1(data, localData[tbx+tj], 1, 1, tempTorque);
+                    data.force += tempTorque;
+                    computeOneEDiffInteractionT3(data, localData[tbx+tj], 1, 1, tempTorque);
+                    localData[tbx+tj].force += tempTorque;
                }
+                tj = (tj + 1) & (TILE_SIZE - 1);
            }
+            data.force *= ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
+            offset = x*TILE_SIZE + tgx;
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            offset = y*TILE_SIZE + tgx;
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
        }
        pos++;
-    } while (pos < end);
+    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy*ENERGY_SCALE_FACTOR; // each thread adds its scaled partial energy to its own slot, indexed by global thread id
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
-#define TILE_SIZE 32
 #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
 typedef struct {
@@ -59,331 +58,286 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
 */
 extern "C" __global__ void computeElectrostatics(
        unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer,
-        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
+        const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
-        const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices,
+        const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices,
 #ifdef USE_CUTOFF
-        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
 #endif
        const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
        const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
-    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+    const unsigned int tbx = threadIdx.x - tgx;
+    real energy = 0;
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+    // First loop: process tiles that contain exclusions.
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
+        AtomData data;
+        unsigned int atom1 = x*TILE_SIZE + tgx;
+        loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+        data.force = make_real3(0);
+        uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx];
+        unsigned int polarizationGroup = polarizationGroupFlags[pos*TILE_SIZE+tgx];
+        if (x == y) {
+            // This tile is on the diagonal.
+            localData[threadIdx.x].posq = data.posq;
+            localData[threadIdx.x].dipole = data.dipole;
+            localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
+            localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
+            localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
+            localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
+            localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
+            localData[threadIdx.x].inducedDipole = data.inducedDipole;
+            localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
+            localData[threadIdx.x].thole = data.thole;
+            localData[threadIdx.x].damp = data.damp;
+            // Compute forces.
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                int atom2 = y*TILE_SIZE+j;
+                if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    real3 tempForce;
+                    real tempEnergy;
+                    float d = computeDScaleFactor(polarizationGroup, j);
+                    float p = computePScaleFactor(covalent, polarizationGroup, j);
+                    float m = computeMScaleFactor(covalent, j);
+                    computeOneInteractionF1(data, localData[tbx+j], d, p, m, tempEnergy, tempForce);
+                    data.force += tempForce;
+                    energy += 0.5f*tempEnergy;
+                }
+            }
+            data.force *= ENERGY_SCALE_FACTOR;
+            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            // Compute torques.
+            data.force = make_real3(0);
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                int atom2 = y*TILE_SIZE+j;
+                if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    real3 tempForce;
+                    float d = computeDScaleFactor(polarizationGroup, j);
+                    float p = computePScaleFactor(covalent, polarizationGroup, j);
+                    float m = computeMScaleFactor(covalent, j);
+                    computeOneInteractionT1(data, localData[tbx+j], d, p, m, tempForce);
+                    data.force += tempForce;
+                }
+            }
+            data.force *= ENERGY_SCALE_FACTOR;
+            atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
+            atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+        }
+        else {
+            // This is an off-diagonal tile.
+            unsigned int j = y*TILE_SIZE + tgx;
+            loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            localData[threadIdx.x].force = make_real3(0);
+            unsigned int tj = tgx;
+            for (j = 0; j < TILE_SIZE; j++) {
+                int atom2 = y*TILE_SIZE+tj;
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    real3 tempForce;
+                    real tempEnergy;
+                    float d = computeDScaleFactor(polarizationGroup, tj);
+                    float p = computePScaleFactor(covalent, polarizationGroup, tj);
+                    float m = computeMScaleFactor(covalent, tj);
+                    computeOneInteractionF1(data, localData[tbx+tj], d, p, m, tempEnergy, tempForce);
+                    data.force += tempForce;
+                    localData[tbx+tj].force -= tempForce;
+                    energy += tempEnergy;
+                }
+                tj = (tj + 1) & (TILE_SIZE - 1);
+            }
+            data.force *= ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
+            unsigned int offset = x*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            offset = y*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+            // Compute torques.
+            data.force = make_real3(0);
+            localData[threadIdx.x].force = make_real3(0);
+            for (j = 0; j < TILE_SIZE; j++) {
+                int atom2 = y*TILE_SIZE+tj;
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    real3 tempForce;
+                    float d = computeDScaleFactor(polarizationGroup, tj);
+                    float p = computePScaleFactor(covalent, polarizationGroup, tj);
+                    float m = computeMScaleFactor(covalent, tj);
+                    computeOneInteractionT1(data, localData[tbx+tj], d, p, m, tempForce);
+                    data.force += tempForce;
+                    computeOneInteractionT3(data, localData[tbx+tj], d, p, m, tempForce);
+                    localData[tbx+tj].force += tempForce;
+                }
+                tj = (tj + 1) & (TILE_SIZE - 1);
+            }
+            data.force *= ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
+            offset = x*TILE_SIZE + tgx;
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            offset = y*TILE_SIZE + tgx;
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+        }
+    }
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
 #ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
-    unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
+    int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
-    unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
+    int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
 #else
    const unsigned int numTiles = numTileIndices;
-    unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
+    int pos = startTileIndex+warp*numTiles/totalWarps;
-    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
+    int end = startTileIndex+(warp+1)*numTiles/totalWarps;
-#endif
-    real energy = 0;
-    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
-    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
-    __shared__ int exclusionIndex[WARPS_PER_GROUP];
-#ifndef ENABLE_SHUFFLE
-    __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
 #endif
+    int skipBase = 0;
+    int currentSkipIndex = tbx;
+    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
+    __shared__ int skipTiles[THREAD_BLOCK_SIZE];
+    skipTiles[threadIdx.x] = -1;
-    do {
+    while (pos < end) {
-        // Extract the coordinates of this tile
+        bool includeTile = true;
-        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
-        const unsigned int tbx = threadIdx.x - tgx;
+        // Extract the coordinates of this tile.
-        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
        unsigned int x, y;
-        AtomData data;
-        if (pos < end) {
 #ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
+        if (numTiles <= maxTiles) {
-                ushort2 tileIndices = tiles[pos];
+            ushort2 tileIndices = tiles[pos];
-                x = tileIndices.x;
+            x = tileIndices.x;
-                y = tileIndices.y;
+        }
-            }
+        else
-            else
 #endif
-            {
+        {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+            }
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            // Skip over tiles that have exclusions, since they were already processed.
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
                }
+                else
+                    skipTiles[threadIdx.x] = end;
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
            }
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos);
+        }
+        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            AtomData data;
            loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
            data.force = make_real3(0);
+#ifdef USE_CUTOFF
-            // Locate the exclusion data for this tile.
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
+#endif
+            atomIndices[threadIdx.x] = j;
+            loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            localData[threadIdx.x].force = make_real3(0);
-            if (tgx < 2)
+            // Compute forces.
-                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-            if (tgx == 0)
-                exclusionIndex[localGroupIndex] = -1;
-            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-                if (exclusionIndices[i] == y)
-                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
-            if (pos >= end)
-                ; // This warp is done.
-            else if (x == y) {
-                // This tile is on the diagonal.
-                localData[threadIdx.x].posq = data.posq;
+            unsigned int tj = tgx;
-                localData[threadIdx.x].dipole = data.dipole;
+            for (j = 0; j < TILE_SIZE; j++) {
-                localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
+                int atom2 = atomIndices[tbx+tj];
-                localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
+                    real3 tempForce;
-                localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
+                    real tempEnergy;
-                localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
+                    computeOneInteractionF1(data, localData[tbx+tj], 1, 1, 1, tempEnergy, tempForce);
-                localData[threadIdx.x].inducedDipole = data.inducedDipole;
+                    data.force += tempForce;
-                localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
+                    localData[tbx+tj].force -= tempForce;
-                localData[threadIdx.x].thole = data.thole;
+                    energy += tempEnergy;
-                localData[threadIdx.x].damp = data.damp;
-                uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
-                unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
-                // Compute forces.
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    int atom2 = y*TILE_SIZE+j;
-                    if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        real3 tempForce;
-                        real tempEnergy;
-                        float d = computeDScaleFactor(polarizationGroup, j);
-                        float p = computePScaleFactor(covalent, polarizationGroup, j);
-                        float m = computeMScaleFactor(covalent, j);
-                        computeOneInteractionF1(data, localData[tbx+j], d, p, m, tempEnergy, tempForce);
-                        data.force += tempForce;
-                        energy += 0.5f*tempEnergy;
-                    }
                }
-                data.force *= ENERGY_SCALE_FACTOR;
+                tj = (tj + 1) & (TILE_SIZE - 1);
-                atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-                atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-                atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-                // Compute torques.
-                data.force = make_real3(0);
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    int atom2 = y*TILE_SIZE+j;
-                    if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        real3 tempForce;
-                        float d = computeDScaleFactor(polarizationGroup, j);
-                        float p = computePScaleFactor(covalent, polarizationGroup, j);
-                        float m = computeMScaleFactor(covalent, j);
-                        computeOneInteractionT1(data, localData[tbx+j], d, p, m, tempForce);
-                        data.force += tempForce;
-                    }
-                }
-                data.force *= ENERGY_SCALE_FACTOR;
-                atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-                atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-                atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
            }
-            else {
+            data.force *= ENERGY_SCALE_FACTOR;
-                // This is an off-diagonal tile.
+            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
+            unsigned int offset = x*TILE_SIZE + tgx;
-                unsigned int j = y*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-                loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-                localData[threadIdx.x].force = make_real3(0);
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
 #ifdef USE_CUTOFF
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+            offset = atomIndices[threadIdx.x];
-                if (!hasExclusions && flags != 0xFFFFFFFF) {
-                    if (flags == 0) {
-                        // No interactions in this tile.
-                    }
-                    else {
-                        // Compute only a subset of the interactions in this tile.
-                        for (j = 0; j < TILE_SIZE; j++) {
-                            if ((flags&(1<<j)) != 0) {
-                                int atom2 = tbx+j;
-                                real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
-#ifdef USE_PERIODIC
-                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                                real3 tempForce;
-                                real tempEnergy;
-                                computeOneInteractionF1(data, localData[atom2], 1, 1, 1, tempEnergy, tempForce);
-                                data.force += tempForce;
-                                localData[atom2].force -= tempForce;
-                                energy += tempEnergy;
-                                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-#ifdef ENABLE_SHUFFLE
-                                    for (int i = 16; i >= 1; i /= 2) {
-                                        tempForce.x += __shfl_xor(tempForce.x, i, 32);
-                                        tempForce.y += __shfl_xor(tempForce.y, i, 32);
-                                        tempForce.z += __shfl_xor(tempForce.z, i, 32);
-                                    }
-                                    if (tgx == 0)
-                                        localData[atom2].force -= tempForce;
 #else
-                                    int bufferIndex = 3*threadIdx.x;
+            offset = y*TILE_SIZE + tgx;
-                                    tempBuffer[bufferIndex] = tempForce.x;
-                                    tempBuffer[bufferIndex+1] = tempForce.y;
-                                    tempBuffer[bufferIndex+2] = tempForce.z;
-                                    if (tgx % 4 == 0) {
-                                        tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
-                                        tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
-                                        tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
-                                    }
-                                    if (tgx == 0) {
-                                        localData[atom2].force.x -= tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
-                                        localData[atom2].force.y -= tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
-                                        localData[atom2].force.z -= tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
-                                    }
 #endif
-                                }
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-                            }
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-                        }
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
-                        data.force *= ENERGY_SCALE_FACTOR;
-                        localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
-                        if (pos < end) {
-                            unsigned int offset = x*TILE_SIZE + tgx;
-                            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-                            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-                            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-                            offset = y*TILE_SIZE + tgx;
-                            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-                            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-                            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
-                        }
-                        // Compute torques.
-                        data.force = make_real3(0);
+            // Compute torques.
-                        localData[threadIdx.x].force = make_real3(0);
-                        for (j = 0; j < TILE_SIZE; j++) {
-                            if ((flags&(1<<j)) != 0) {
-                                int atom2 = tbx+j;
-                                real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
-#ifdef USE_PERIODIC
-                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                                real3 tempForce;
-                                computeOneInteractionT1(data, localData[atom2], 1, 1, 1, tempForce);
-                                data.force += tempForce;
-                                computeOneInteractionT3(data, localData[atom2], 1, 1, 1, tempForce);
-                                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-#ifdef ENABLE_SHUFFLE
-                                    for (int i = 16; i >= 1; i /= 2) {
-                                        tempForce.x += __shfl_xor(tempForce.x, i, 32);
-                                        tempForce.y += __shfl_xor(tempForce.y, i, 32);
-                                        tempForce.z += __shfl_xor(tempForce.z, i, 32);
-                                    }
-                                    if (tgx == 0)
-                                        localData[atom2].force -= tempForce;
-#else
-                                    int bufferIndex = 3*threadIdx.x;
-                                    tempBuffer[bufferIndex] = tempForce.x;
-                                    tempBuffer[bufferIndex+1] = tempForce.y;
-                                    tempBuffer[bufferIndex+2] = tempForce.z;
-                                    if (tgx % 4 == 0) {
-                                        tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
-                                        tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
-                                        tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
-                                    }
-                                    if (tgx == 0) {
-                                        localData[atom2].force.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
-                                        localData[atom2].force.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
-                                        localData[atom2].force.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
-                                    }
-#endif
-                                }
-                            }
-                        }
-                        data.force *= ENERGY_SCALE_FACTOR;
-                        localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
-                        if (pos < end) {
-                            unsigned int offset = x*TILE_SIZE + tgx;
-                            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-                            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-                            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-                            offset = y*TILE_SIZE + tgx;
-                            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-                            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-                            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
-                        }
-                    }
-                }
-                else
-#endif
-                {
-                    // Compute the full set of interactions in this tile.
-                    uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
+            data.force = make_real3(0);
-                    unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
+            localData[threadIdx.x].force = make_real3(0);
+            for (j = 0; j < TILE_SIZE; j++) {
-                    // Compute forces.
+                int atom2 = y*TILE_SIZE+tj;
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                    unsigned int tj = tgx;
+                    real3 tempForce;
-                    for (j = 0; j < TILE_SIZE; j++) {
+                    computeOneInteractionT1(data, localData[tbx+tj], 1, 1, 1, tempForce);
-                        int atom2 = y*TILE_SIZE+tj;
+                    data.force += tempForce;
-                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    computeOneInteractionT3(data, localData[tbx+tj], 1, 1, 1, tempForce);
-                            real3 tempForce;
+                    localData[tbx+tj].force += tempForce;
-                            real tempEnergy;
-                            float d = computeDScaleFactor(polarizationGroup, tj);
-                            float p = computePScaleFactor(covalent, polarizationGroup, tj);
-                            float m = computeMScaleFactor(covalent, tj);
-                            computeOneInteractionF1(data, localData[tbx+tj], d, p, m, tempEnergy, tempForce);
-                            data.force += tempForce;
-                            localData[tbx+tj].force -= tempForce;
-                            energy += tempEnergy;
-                        }
-                        tj = (tj + 1) & (TILE_SIZE - 1);
-                    }
-                    data.force *= ENERGY_SCALE_FACTOR;
-                    localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
-                    if (pos < end) {
-                        unsigned int offset = x*TILE_SIZE + tgx;
-                        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-                        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-                        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-                        offset = y*TILE_SIZE + tgx;
-                        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-                        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-                        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
-                    }
-                    // Compute torques.
-                    data.force = make_real3(0);
-                    localData[threadIdx.x].force = make_real3(0);
-                    for (j = 0; j < TILE_SIZE; j++) {
-                        int atom2 = y*TILE_SIZE+tj;
-                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                            real3 tempForce;
-                            float d = computeDScaleFactor(polarizationGroup, tj);
-                            float p = computePScaleFactor(covalent, polarizationGroup, tj);
-                            float m = computeMScaleFactor(covalent, tj);
-                            computeOneInteractionT1(data, localData[tbx+tj], d, p, m, tempForce);
-                            data.force += tempForce;
-                            computeOneInteractionT3(data, localData[tbx+tj], d, p, m, tempForce);
-                            localData[tbx+tj].force += tempForce;
-                        }
-                        tj = (tj + 1) & (TILE_SIZE - 1);
-                    }
-                    data.force *= ENERGY_SCALE_FACTOR;
-                    localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
-                    if (pos < end) {
-                        unsigned int offset = x*TILE_SIZE + tgx;
-                        atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-                        atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-                        atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-                        offset = y*TILE_SIZE + tgx;
-                        atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-                        atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-                        atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
-                    }
                }
+                tj = (tj + 1) & (TILE_SIZE - 1);
            }
+            data.force *= ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
+            offset = x*TILE_SIZE + tgx;
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+#ifdef USE_CUTOFF
+            offset = atomIndices[threadIdx.x];
+#else
+            offset = y*TILE_SIZE + tgx;
+#endif
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
        }
        pos++;
-    } while (pos < end);
+    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy*ENERGY_SCALE_FACTOR;
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleFixedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleFixedField.cu
-#define TILE_SIZE 32
 #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
 typedef struct {
@@ -398,245 +397,268 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
 */
 extern "C" __global__ void computeFixedField(
        unsigned long long* __restrict__ fieldBuffers, unsigned long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
-        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
+        const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, const ushort2* __restrict__ exclusionTiles,
-        const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices,
+        unsigned int startTileIndex, unsigned int numTileIndices,
 #ifdef USE_CUTOFF
-        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
 #elif defined USE_GK
        const real* __restrict__ bornRadii, unsigned long long* __restrict__ gkFieldBuffers,
 #endif
        const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const float2* __restrict__ dampingAndThole) {
-    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
-#ifdef USE_CUTOFF
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
-    const unsigned int numTiles = interactionCount[0];
+    const unsigned int tbx = threadIdx.x - tgx;
-    unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
-    unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
-#else
-    const unsigned int numTiles = numTileIndices;
-    unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
-    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
-#endif
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
-    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
-    __shared__ int exclusionIndex[WARPS_PER_GROUP];
+    // First loop: process tiles that contain exclusions.
-#ifndef ENABLE_SHUFFLE
-    __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
-#endif
-    do {
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-        // Extract the coordinates of this tile
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const unsigned int tbx = threadIdx.x - tgx;
+        const ushort2 tileIndices = exclusionTiles[pos];
-        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
+        const unsigned int x = tileIndices.x;
-        unsigned int x, y;
+        const unsigned int y = tileIndices.y;
        AtomData data;
        data.field = make_real3(0);
        data.fieldPolar = make_real3(0);
 #ifdef USE_GK
        data.gkField = make_real3(0);
 #endif
-        if (pos < end) {
+        unsigned int atom1 = x*TILE_SIZE + tgx;
-#ifdef USE_CUTOFF
+        loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
-            if (numTiles <= maxTiles) {
+#ifdef USE_GK
-                ushort2 tileIndices = tiles[pos];
+        data.bornRadius = bornRadii[atom1];
-                x = tileIndices.x;
-                y = tileIndices.y;
-            }
-            else
 #endif
-            {
+        uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx];
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+        unsigned int polarizationGroup = polarizationGroupFlags[pos*TILE_SIZE+tgx];
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+        if (x == y) {
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+            // This tile is on the diagonal.
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            const unsigned int localAtomIndex = threadIdx.x;
+            localData[localAtomIndex].posq = data.posq;
+            localData[localAtomIndex].dipole = data.dipole;
+            localData[localAtomIndex].quadrupoleXX = data.quadrupoleXX;
+            localData[localAtomIndex].quadrupoleXY = data.quadrupoleXY;
+            localData[localAtomIndex].quadrupoleXZ = data.quadrupoleXZ;
+            localData[localAtomIndex].quadrupoleYY = data.quadrupoleYY;
+            localData[localAtomIndex].quadrupoleYZ = data.quadrupoleYZ;
+            localData[localAtomIndex].quadrupoleZZ = data.quadrupoleZZ;
+            localData[localAtomIndex].thole = data.thole;
+            localData[localAtomIndex].damp = data.damp;
+#ifdef USE_GK
+            localData[localAtomIndex].bornRadius = data.bornRadius;
+#endif
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                real3 delta = trimTo3(localData[tbx+j].posq-data.posq);
+#ifdef USE_PERIODIC
+                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                int atom2 = y*TILE_SIZE+j;
+                if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    real3 fields[4];
+                    float d = computeDScaleFactor(polarizationGroup, j);
+                    float p = computePScaleFactor(covalent, polarizationGroup, j);
+                    computeOneInteraction(data, localData[tbx+j], delta, d, p, fields);
+                    data.field += fields[0];
+                    data.fieldPolar += fields[1];
                }
-            }
-            unsigned int atom1 = x*TILE_SIZE + tgx;
-            loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
 #ifdef USE_GK
-            data.bornRadius = bornRadii[atom1];
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    real3 fields[2];
+                    computeOneGkInteraction(data, localData[tbx+j], delta, fields);
+                    data.gkField += fields[0];
+                }
 #endif
+            }
-            // Locate the exclusion data for this tile.
+        }
+        else {
-            if (tgx < 2)
+            // This is an off-diagonal tile.
-                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-            if (tgx == 0)
+            const unsigned int localAtomIndex = threadIdx.x;
-                exclusionIndex[localGroupIndex] = -1;
+            unsigned int j = y*TILE_SIZE + tgx;
-            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
+            loadAtomData(localData[localAtomIndex], j, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
-                if (exclusionIndices[i] == y)
+            localData[localAtomIndex].field = make_real3(0);
-                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
+            localData[localAtomIndex].fieldPolar = make_real3(0);
-            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
-            if (pos >= end)
-                ; // This warp is done.
-            else if (x == y) {
-                // This tile is on the diagonal.
-                const unsigned int localAtomIndex = threadIdx.x;
-                localData[localAtomIndex].posq = data.posq;
-                localData[localAtomIndex].dipole = data.dipole;
-                localData[localAtomIndex].quadrupoleXX = data.quadrupoleXX;
-                localData[localAtomIndex].quadrupoleXY = data.quadrupoleXY;
-                localData[localAtomIndex].quadrupoleXZ = data.quadrupoleXZ;
-                localData[localAtomIndex].quadrupoleYY = data.quadrupoleYY;
-                localData[localAtomIndex].quadrupoleYZ = data.quadrupoleYZ;
-                localData[localAtomIndex].quadrupoleZZ = data.quadrupoleZZ;
-                localData[localAtomIndex].thole = data.thole;
-                localData[localAtomIndex].damp = data.damp;
 #ifdef USE_GK
-                localData[localAtomIndex].bornRadius = data.bornRadius;
+            localData[localAtomIndex].bornRadius = bornRadii[j];
+            localData[localAtomIndex].gkField = make_real3(0);
 #endif
-                uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
+            unsigned int tj = tgx;
-                unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
+            for (j = 0; j < TILE_SIZE; j++) {
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                real3 delta = trimTo3(localData[tbx+tj].posq-data.posq);
-                    real3 delta = trimTo3(localData[tbx+j].posq-data.posq);
 #ifdef USE_PERIODIC
-                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                    int atom2 = y*TILE_SIZE+j;
+                int atom2 = y*TILE_SIZE+tj;
-                    if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        real3 fields[4];
+                    real3 fields[4];
-                        float d = computeDScaleFactor(polarizationGroup, j);
+                    float d = computeDScaleFactor(polarizationGroup, tj);
-                        float p = computePScaleFactor(covalent, polarizationGroup, j);
+                    float p = computePScaleFactor(covalent, polarizationGroup, tj);
-                        computeOneInteraction(data, localData[tbx+j], delta, d, p, fields);
+                    computeOneInteraction(data, localData[tbx+tj], delta, d, p, fields);
-                        data.field += fields[0];
+                    data.field += fields[0];
-                        data.fieldPolar += fields[1];
+                    data.fieldPolar += fields[1];
-                    }
+                    localData[tbx+tj].field += fields[2];
+                    localData[tbx+tj].fieldPolar += fields[3];
 #ifdef USE_GK
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    computeOneGkInteraction(data, localData[tbx+tj], delta, fields);
-                        real3 fields[2];
+                    data.gkField += fields[0];
-                        computeOneGkInteraction(data, localData[tbx+j], delta, fields);
+                    localData[tbx+tj].gkField += fields[1];
-                        data.gkField += fields[0];
-                    }
 #endif
                }
+                tj = (tj + 1) & (TILE_SIZE - 1);
            }
-            else {
+        }
-                // This is an off-diagonal tile.
+        // Write results.
-                const unsigned int localAtomIndex = threadIdx.x;
-                unsigned int j = y*TILE_SIZE + tgx;
+        unsigned int offset = x*TILE_SIZE + tgx;
-                loadAtomData(localData[localAtomIndex], j, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
+        atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
-                localData[localAtomIndex].field = make_real3(0);
+        atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
-                localData[localAtomIndex].fieldPolar = make_real3(0);
+        atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
+        atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0x100000000)));
+        atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0x100000000)));
+        atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0x100000000)));
 #ifdef USE_GK
-                localData[localAtomIndex].bornRadius = bornRadii[j];
+        atomicAdd(&gkFieldBuffers[offset], static_cast<unsigned long long>((long long) (data.gkField.x*0x100000000)));
-                localData[localAtomIndex].gkField = make_real3(0);
+        atomicAdd(&gkFieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.gkField.y*0x100000000)));
+        atomicAdd(&gkFieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.gkField.z*0x100000000)));
 #endif
-#ifdef USE_CUTOFF
+        if (x != y) {
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+            offset = y*TILE_SIZE + tgx;
-                if (!hasExclusions && flags == 0) { // TODO: Why doesn't the flags != 0 block work?
+            atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
-//                if (!hasExclusions && flags != 0xFFFFFFFF) {
+            atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
-                    if (flags == 0) {
+            atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
-                        // No interactions in this tile.
+            atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0x100000000)));
-                    }
+            atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0x100000000)));
-                    else {
+            atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0x100000000)));
-                        // Compute only a subset of the interactions in this tile.
+#ifdef USE_GK
+            atomicAdd(&gkFieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].gkField.x*0x100000000)));
-                        for (j = 0; j < TILE_SIZE; j++) {
+            atomicAdd(&gkFieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].gkField.y*0x100000000)));
-                            if ((flags&(1<<j)) != 0) {
+            atomicAdd(&gkFieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].gkField.z*0x100000000)));
-                                int atom2 = tbx+j;
-                                real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
-#ifdef USE_PERIODIC
-                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                                real3 fields[4];
+        }
-                                computeOneInteraction(data, localData[atom2], delta, 1, 1, fields);
+    }
-                                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-#ifdef ENABLE_SHUFFLE
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
-                                    for (int i = 16; i >= 1; i /= 2) {
+    // of them (no cutoff).
-                                        fields[2].x += __shfl_xor(fields[2].x, i, 32);
-                                        fields[2].y += __shfl_xor(fields[2].y, i, 32);
+#ifdef USE_CUTOFF
-                                        fields[2].z += __shfl_xor(fields[2].z, i, 32);
+    const unsigned int numTiles = interactionCount[0];
-                                        fields[3].x += __shfl_xor(fields[3].x, i, 32);
+    int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
-                                        fields[3].y += __shfl_xor(fields[3].y, i, 32);
+    int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
-                                        fields[3].z += __shfl_xor(fields[3].z, i, 32);
-                                    }
-                                    if (tgx == 0) {
-                                        localData[atom2].field += fields[2];
-                                        localData[atom2].fieldPolar += fields[3];
-                                    }
 #else
-                                    int bufferIndex = 3*threadIdx.x;
+    const unsigned int numTiles = numTileIndices;
-                                    tempBuffer[bufferIndex] = fields[2].x;
+    int pos = startTileIndex+warp*numTiles/totalWarps;
-                                    tempBuffer[bufferIndex+1] = fields[2].y;
+    int end = startTileIndex+(warp+1)*numTiles/totalWarps;
-                                    tempBuffer[bufferIndex+2] = fields[2].z;
-                                    if (tgx % 4 == 0) {
-                                        tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
-                                        tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
-                                        tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
-                                    }
-                                    if (tgx == 0) {
-                                        localData[atom2].field.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
-                                        localData[atom2].field.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
-                                        localData[atom2].field.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
-                                    }
-                                    tempBuffer[bufferIndex] = fields[3].x;
-                                    tempBuffer[bufferIndex+1] = fields[3].y;
-                                    tempBuffer[bufferIndex+2] = fields[3].z;
-                                    if (tgx % 4 == 0) {
-                                        tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
-                                        tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
-                                        tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
-                                    }
-                                    if (tgx == 0) {
-                                        localData[atom2].fieldPolar.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
-                                        localData[atom2].fieldPolar.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
-                                        localData[atom2].fieldPolar.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
-                                    }
 #endif
-                                }
+    int skipBase = 0;
-                            }
+    int currentSkipIndex = tbx;
-                        }
+    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
-                    }
+    __shared__ int skipTiles[THREAD_BLOCK_SIZE];
+    skipTiles[threadIdx.x] = -1;
+    while (pos < end) {
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        unsigned int x, y;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+            // Skip over tiles that have exclusions, since they were already processed.
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
                }
                else
+                    skipTiles[threadIdx.x] = end;
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
+            }
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos);
+        }
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            AtomData data;
+            data.field = make_real3(0);
+            data.fieldPolar = make_real3(0);
+#ifdef USE_GK
+            data.gkField = make_real3(0);
+#endif
+            loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
+#ifdef USE_GK
+            data.bornRadius = bornRadii[atom1];
+#endif
+#ifdef USE_CUTOFF
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
 #endif
-                {
+            atomIndices[threadIdx.x] = j;
-                    // Compute the full set of interactions in this tile.
+            const unsigned int localAtomIndex = threadIdx.x;
+            loadAtomData(localData[localAtomIndex], j, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
-                    uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
+            localData[localAtomIndex].field = make_real3(0);
-                    unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
+            localData[localAtomIndex].fieldPolar = make_real3(0);
-                    unsigned int tj = tgx;
+#ifdef USE_GK
-                    for (j = 0; j < TILE_SIZE; j++) {
+            localData[localAtomIndex].bornRadius = bornRadii[j];
-                        real3 delta = trimTo3(localData[tbx+tj].posq-data.posq);
+            localData[localAtomIndex].gkField = make_real3(0);
+#endif
+            // Compute the full set of interactions in this tile.
+            unsigned int tj = tgx;
+            for (j = 0; j < TILE_SIZE; j++) {
+                real3 delta = trimTo3(localData[tbx+tj].posq-data.posq);
 #ifdef USE_PERIODIC
-                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                        int atom2 = y*TILE_SIZE+tj;
+                int atom2 = atomIndices[tbx+tj];
-                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                            real3 fields[4];
+                    real3 fields[4];
-                            float d = computeDScaleFactor(polarizationGroup, tj);
+                    computeOneInteraction(data, localData[tbx+tj], delta, 1, 1, fields);
-                            float p = computePScaleFactor(covalent, polarizationGroup, tj);
+                    data.field += fields[0];
-                            computeOneInteraction(data, localData[tbx+tj], delta, d, p, fields);
+                    data.fieldPolar += fields[1];
-                            data.field += fields[0];
+                    localData[tbx+tj].field += fields[2];
-                            data.fieldPolar += fields[1];
+                    localData[tbx+tj].fieldPolar += fields[3];
-                            localData[tbx+tj].field += fields[2];
-                            localData[tbx+tj].fieldPolar += fields[3];
 #ifdef USE_GK
-                            computeOneGkInteraction(data, localData[tbx+tj], delta, fields);
+                    computeOneGkInteraction(data, localData[tbx+tj], delta, fields);
-                            data.gkField += fields[0];
+                    data.gkField += fields[0];
-                            localData[tbx+tj].gkField += fields[1];
+                    localData[tbx+tj].gkField += fields[1];
 #endif
-                        }
-                        tj = (tj + 1) & (TILE_SIZE - 1);
-                    }
                }
+                tj = (tj + 1) & (TILE_SIZE - 1);
            }
-        }
+            // Write results.
-        // Write results.
+            unsigned int offset = x*TILE_SIZE + tgx;
-        if (pos < end) {
-            const unsigned int offset = x*TILE_SIZE + tgx;
            atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
            atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
            atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
@@ -648,9 +670,11 @@ extern "C" __global__ void computeFixedField(
            atomicAdd(&gkFieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.gkField.y*0x100000000)));
            atomicAdd(&gkFieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.gkField.z*0x100000000)));
 #endif
-        }
+#ifdef USE_CUTOFF
-        if (pos < end && x != y) {
+            offset = atomIndices[threadIdx.x];
-            const unsigned int offset = y*TILE_SIZE + tgx;
+#else
+            offset = y*TILE_SIZE + tgx;
+#endif
            atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
            atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
            atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
@@ -664,5 +688,5 @@ extern "C" __global__ void computeFixedField(
 #endif
        }
        pos++;
-    } while (pos < end);
+    }
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
-#define TILE_SIZE 32
 #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
 typedef struct {
@@ -199,194 +198,221 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
 * Compute the mutual induced field.
 */
 extern "C" __global__ void computeInducedField(
-        unsigned long long* __restrict__ field, unsigned long long* __restrict__ fieldPolar, const real4* __restrict__ posq,
+        unsigned long long* __restrict__ field, unsigned long long* __restrict__ fieldPolar, const real4* __restrict__ posq, const ushort2* __restrict__ exclusionTiles, 
        const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, unsigned int startTileIndex, unsigned int numTileIndices,
 #ifdef USE_CUTOFF
-        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
 #elif defined USE_GK
        unsigned long long* __restrict__ fieldS, unsigned long long* __restrict__ fieldPolarS, const real* __restrict__ inducedDipoleS,
        const real* __restrict__ inducedDipolePolarS, const real* __restrict__ bornRadii,
 #endif
        const float2* __restrict__ dampingAndThole) {
-    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
-#ifdef USE_CUTOFF
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
-    const unsigned int numTiles = interactionCount[0];
+    const unsigned int tbx = threadIdx.x - tgx;
-    unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
-    unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
-#else
-    const unsigned int numTiles = numTileIndices;
-    unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
-    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
-#endif
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
-#ifndef ENABLE_SHUFFLE
-//    __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
+    // First loop: process tiles that contain exclusions.
-#endif
-    do {
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-        // Extract the coordinates of this tile
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        const unsigned int tbx = threadIdx.x - tgx;
+        const ushort2 tileIndices = exclusionTiles[pos];
-        unsigned int x, y;
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
        AtomData data;
        zeroAtomData(data);
-        if (pos < end) {
+        unsigned int atom1 = x*TILE_SIZE + tgx;
-#ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
-                ushort2 tileIndices = tiles[pos];
-                x = tileIndices.x;
-                y = tileIndices.y;
-            }
-            else
-#endif
-            {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                }
-            }
-            unsigned int atom1 = x*TILE_SIZE + tgx;
 #ifdef USE_GK
-            loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
+        loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
 #else
-            loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
+        loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
 #endif
-            if (pos >= end)
+        if (x == y) {
-                ; // This warp is done.
+            // This tile is on the diagonal.
-            else if (x == y) {
-                // This tile is on the diagonal.
+            localData[threadIdx.x].pos = data.pos;
+            localData[threadIdx.x].inducedDipole = data.inducedDipole;
-                localData[threadIdx.x].pos = data.pos;
+            localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
-                localData[threadIdx.x].inducedDipole = data.inducedDipole;
+            localData[threadIdx.x].thole = data.thole;
-                localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
+            localData[threadIdx.x].damp = data.damp;
-                localData[threadIdx.x].thole = data.thole;
-                localData[threadIdx.x].damp = data.damp;
 #ifdef USE_GK
-                localData[threadIdx.x].inducedDipoleS = data.inducedDipoleS;
+            localData[threadIdx.x].inducedDipoleS = data.inducedDipoleS;
-                localData[threadIdx.x].inducedDipolePolarS = data.inducedDipolePolarS;
+            localData[threadIdx.x].inducedDipolePolarS = data.inducedDipolePolarS;
-                localData[threadIdx.x].bornRadius = data.bornRadius;
+            localData[threadIdx.x].bornRadius = data.bornRadius;
 #endif
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    real3 delta = localData[tbx+j].pos-data.pos;
+                real3 delta = localData[tbx+j].pos-data.pos;
 #ifdef USE_PERIODIC
-                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                    int atom2 = y*TILE_SIZE+j;
+                int atom2 = y*TILE_SIZE+j;
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
-                        computeOneInteraction(data, localData[tbx+j], delta, atom1 == atom2);
+                    computeOneInteraction(data, localData[tbx+j], delta, atom1 == atom2);
-                }
            }
-            else {
+        }
-                // This is an off-diagonal tile.
+        else {
+            // This is an off-diagonal tile.
 #ifdef USE_GK
-                loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
+            loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
 #else
-                loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
+            loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
 #endif
-                zeroAtomData(localData[threadIdx.x]);
+            zeroAtomData(localData[threadIdx.x]);
-#ifdef USE_CUTOFF
+            unsigned int tj = tgx;
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                if (flags == 0) { // TODO: Figure out what the flags != 0 case doesn't work!!!
+                real3 delta = localData[tbx+tj].pos-data.pos;
-//                if (flags != 0xFFFFFFFF) {
-                    if (flags == 0) {
-                        // No interactions in this tile.
-                    }
-/*                    else {
-                        // Compute only a subset of the interactions in this tile.
-                        for (int j = 0; j < TILE_SIZE; j++) {
-                            if ((flags&(1<<j)) != 0) {
-                                int atom2 = tbx+j;
-                                real3 delta = localData[atom2].pos-data.pos;
 #ifdef USE_PERIODIC
-                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                int atom2 = y*TILE_SIZE+tj; // localData[tbx+tj] holds atom tj of tile y, so bounds-check tj, not the loop counter j
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
+                    computeOneInteraction(data, localData[tbx+tj], delta, false);
+                tj = (tj + 1) & (TILE_SIZE - 1);
+            }
+        }
+        // Write results.
+        unsigned int offset = x*TILE_SIZE + tgx;
+        atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
+        atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
+        atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
+        atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0x100000000)));
+        atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0x100000000)));
+        atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0x100000000)));
+#ifdef USE_GK
+        atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (data.fieldS.x*0x100000000)));
+        atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.y*0x100000000)));
+        atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.z*0x100000000)));
+        atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (data.fieldPolarS.x*0x100000000)));
+        atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.y*0x100000000)));
+        atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.z*0x100000000)));
+#endif
+        if (x != y) {
+            offset = y*TILE_SIZE + tgx;
+            atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
+            atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
+            atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
+            atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0x100000000)));
+            atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0x100000000)));
+            atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0x100000000)));
+#ifdef USE_GK
+            atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.x*0x100000000)));
+            atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.y*0x100000000)));
+            atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.z*0x100000000)));
+            atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.x*0x100000000)));
+            atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.y*0x100000000)));
+            atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.z*0x100000000)));
 #endif
-                                real3 fields[4];
+        }
-                                computeOneInteraction(data, localData[atom2], delta, fields);
+    }
-                                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                                    data.field += fields[0];
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
-                                    data.fieldPolar += fields[1];
+    // of them (no cutoff).
-#ifdef ENABLE_SHUFFLE
-                                    for (int i = 16; i >= 1; i /= 2) {
+#ifdef USE_CUTOFF
-                                        fields[2].x += __shfl_xor(fields[2].x, i, 32);
+    const unsigned int numTiles = interactionCount[0];
-                                        fields[2].y += __shfl_xor(fields[2].y, i, 32);
+    int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
-                                        fields[2].z += __shfl_xor(fields[2].z, i, 32);
+    int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
-                                        fields[3].x += __shfl_xor(fields[3].x, i, 32);
-                                        fields[3].y += __shfl_xor(fields[3].y, i, 32);
-                                        fields[3].z += __shfl_xor(fields[3].z, i, 32);
-                                    }
-                                    if (tgx == 0) {
-                                        localData[atom2].field += fields[2];
-                                        localData[atom2].fieldPolar += fields[3];
-                                    }
 #else
-                                    int bufferIndex = 3*threadIdx.x;
+    const unsigned int numTiles = numTileIndices;
-                                    tempBuffer[bufferIndex] = fields[2].x;
+    int pos = startTileIndex+warp*numTiles/totalWarps;
-                                    tempBuffer[bufferIndex+1] = fields[2].y;
+    int end = startTileIndex+(warp+1)*numTiles/totalWarps;
-                                    tempBuffer[bufferIndex+2] = fields[2].z;
+#endif
-                                    if (tgx % 4 == 0) {
+    int skipBase = 0;
-                                        tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
+    int currentSkipIndex = tbx;
-                                        tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
+    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
-                                        tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
+    __shared__ int skipTiles[THREAD_BLOCK_SIZE];
-                                    }
+    skipTiles[threadIdx.x] = -1;
-                                    if (tgx == 0) {
-                                        localData[atom2].field.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
+    while (pos < end) {
-                                        localData[atom2].field.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
+        bool includeTile = true;
-                                        localData[atom2].field.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
-                                    }
+        // Extract the coordinates of this tile.
-                                    tempBuffer[bufferIndex] = fields[3].x;
-                                    tempBuffer[bufferIndex+1] = fields[3].y;
+        unsigned int x, y;
-                                    tempBuffer[bufferIndex+2] = fields[3].z;
+#ifdef USE_CUTOFF
-                                    if (tgx % 4 == 0) {
+        if (numTiles <= maxTiles) {
-                                        tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
+            ushort2 tileIndices = tiles[pos];
-                                        tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
+            x = tileIndices.x;
-                                        tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
+        }
-                                    }
+        else
-                                    if (tgx == 0) {
-                                        localData[atom2].fieldPolar.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
-                                        localData[atom2].fieldPolar.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
-                                        localData[atom2].fieldPolar.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
-                                    }
 #endif
-                                }
+        {
-                            }
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-                        }
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                    }*/
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+            // Skip over tiles that have exclusions, since they were already processed.
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
                }
                else
+                    skipTiles[threadIdx.x] = end;
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
+            }
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos);
+        }
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            AtomData data;
+            zeroAtomData(data);
+#ifdef USE_GK
+            loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
+#else
+            loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
+#endif
+#ifdef USE_CUTOFF
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
 #endif
-                {
+            atomIndices[threadIdx.x] = j;
-                    // Compute the full set of interactions in this tile.
+#ifdef USE_GK
+            loadAtomData(localData[threadIdx.x], j, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
+#else
+            loadAtomData(localData[threadIdx.x], j, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
+#endif
+            zeroAtomData(localData[threadIdx.x]);
-                    unsigned int tj = tgx;
+            // Compute the full set of interactions in this tile.
-                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        real3 delta = localData[tbx+tj].pos-data.pos;
+            unsigned int tj = tgx;
+            for (j = 0; j < TILE_SIZE; j++) {
+                real3 delta = localData[tbx+tj].pos-data.pos;
 #ifdef USE_PERIODIC
-                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                        int atom2 = y*TILE_SIZE+j;
+                int atom2 = atomIndices[tbx+tj];
-                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
-                            computeOneInteraction(data, localData[tbx+tj], delta, false);
+                    computeOneInteraction(data, localData[tbx+tj], delta, false);
-                        tj = (tj + 1) & (TILE_SIZE - 1);
+                tj = (tj + 1) & (TILE_SIZE - 1);
-                    }
-                }
            }
-        }
+            // Write results.
-        // Write results.
+            unsigned int offset = x*TILE_SIZE + tgx;
-        if (pos < end) {
-            const unsigned int offset = x*TILE_SIZE + tgx;
            atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
            atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
            atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
@@ -401,9 +427,11 @@ extern "C" __global__ void computeInducedField(
            atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.y*0x100000000)));
            atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.z*0x100000000)));
 #endif
-        }
+#ifdef USE_CUTOFF
-        if (pos < end && x != y) {
+            offset = atomIndices[threadIdx.x];
-            const unsigned int offset = y*TILE_SIZE + tgx;
+#else
+            offset = y*TILE_SIZE + tgx;
+#endif
            atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
            atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
            atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
@@ -420,7 +448,7 @@ extern "C" __global__ void computeInducedField(
 #endif
        }
        pos++;
-    } while (pos < end);
+    }
 }
 extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__ fixedField, const long long* __restrict__ fixedFieldPolar,

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
 #define ARRAY(x,y) array[(x)-1+((y)-1)*PME_ORDER]
 /**
- * This is called from updateBsplines().  It calculates the spline coefficients for a single atom along a single axis.
+ * Calculate the spline coefficients for a single atom along a single axis.
 */
 __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
    // initialization to get to 2nd order recursion
@@ -70,15 +70,10 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
 }
 /**
- * Compute bspline coefficients.
+ * Compute the index of the grid point each atom is associated with.
 */
-extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, int4* __restrict__ igrid, int2* __restrict__ pmeAtomGridIndex,
+extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
-        real4* __restrict__ theta1, real4* __restrict__ theta2, real4* __restrict__ theta3, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+        real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-    extern __shared__ real bsplines_cache[]; // size = block_size*pme_order*pme_order
-    real* array = &bsplines_cache[threadIdx.x*PME_ORDER*PME_ORDER];
-    //  get the B-spline coefficients for each multipole site
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        real4 pos = posq[i];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
@@ -90,256 +85,226 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, int4*
        real w = pos.x*invPeriodicBoxSize.x;
        real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
        int ifr = (int) fr;
-        w = fr - ifr;
        int igrid1 = ifr-PME_ORDER+1;
-        computeBSplinePoint(&theta1[i*PME_ORDER], w, array);
        // Second axis.
        w = pos.y*invPeriodicBoxSize.y;
        fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
        ifr = (int) fr;
-        w = fr - ifr;
        int igrid2 = ifr-PME_ORDER+1;
-        computeBSplinePoint(&theta2[i*PME_ORDER], w, array);
        // Third axis.
        w = pos.z*invPeriodicBoxSize.z;
        fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
        ifr = (int) fr;
-        w = fr - ifr;
        int igrid3 = ifr-PME_ORDER+1;
-        computeBSplinePoint(&theta3[i*PME_ORDER], w, array);
        // Record the grid point.
        igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
        igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
        igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
-        igrid[i] = make_int4(igrid1, igrid2, igrid3, 0);
        pmeAtomGridIndex[i] = make_int2(i, igrid1*GRID_SIZE_Y*GRID_SIZE_Z+igrid2*GRID_SIZE_Z+igrid3);
    }
 }
-/**
- * For each grid point, find the range of sorted atoms associated with that point.
- */
-extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange,
-        const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-    int thread = blockIdx.x*blockDim.x+threadIdx.x;
-    int start = (NUM_ATOMS*thread)/(blockDim.x*gridDim.x);
-    int end = (NUM_ATOMS*(thread+1))/(blockDim.x*gridDim.x);
-    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
-    for (int i = start; i < end; ++i) {
-        int2 atomData = pmeAtomGridIndex[i];
-        int gridIndex = atomData.y;
-        if (gridIndex != last) {
-            for (int j = last+1; j <= gridIndex; ++j)
-                pmeAtomRange[j] = i;
-            last = gridIndex;
-        }
-    }
-    // Fill in values beyond the last atom.
-    if (thread == blockDim.x*gridDim.x-1) {
-        int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-        for (int j = last+1; j <= gridSize; ++j)
-            pmeAtomRange[j] = NUM_ATOMS;
-    }
-}
-/**
- * The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
- * some work in the charge spreading kernel.
- */
-extern "C" __global__ void recordZIndex(int2* __restrict__ pmeAtomGridIndex, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-    int thread = blockIdx.x*blockDim.x+threadIdx.x;
-    int start = (NUM_ATOMS*thread)/(blockDim.x*gridDim.x);
-    int end = (NUM_ATOMS*(thread+1))/(blockDim.x*gridDim.x);
-    for (int i = start; i < end; ++i) {
-        real posz = posq[pmeAtomGridIndex[i].x].z;
-        posz -= floor(posz*invPeriodicBoxSize.z)*periodicBoxSize.z;
-        real w = posz*invPeriodicBoxSize.z;
-        real fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
-        int z = ((int) fr)-PME_ORDER+1;
-        pmeAtomGridIndex[i].y = z;
-    }
-}
 extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ labFrameDipole,
-        const real* __restrict__ labFrameQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange,
+        const real* __restrict__ labFrameQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
-        const real4* __restrict__ theta1, const real4* __restrict__ theta2, const real4* __restrict__ theta3, real4 invPeriodicBoxSize) {
+        real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
    const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
    const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
-    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+    real array[PME_ORDER*PME_ORDER];
-    unsigned int numThreads = gridDim.x*blockDim.x;
+    real4 theta1[PME_ORDER];
-    for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads) {
+    real4 theta2[PME_ORDER];
-        int3 gridPoint;
+    real4 theta3[PME_ORDER];
-        gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
-        int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
+    // Process the atoms in spatially sorted order.  This improves cache performance when loading
-        gridPoint.y = remainder/GRID_SIZE_Z;
+    // the grid values.
-        gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
-        real result = 0;
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
-        for (int ix = 0; ix < PME_ORDER; ++ix) {
+        int m = pmeAtomGridIndex[i].x;
-            int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
+        real4 pos = posq[m];
-            for (int iy = 0; iy < PME_ORDER; ++iy) {
+        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
-                int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
+        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
-                int z1 = gridPoint.z-PME_ORDER+1;
+        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-                z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
-                int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
+        // Since we need the full set of thetas, it's faster to compute them here than load them
-                int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
+        // from global memory.
-                int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
-                int firstAtom = pmeAtomRange[gridIndex1];
+        real w = pos.x*invPeriodicBoxSize.x;
-                int lastAtom = pmeAtomRange[gridIndex2+1];
+        real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
-                for (int i = firstAtom; i < lastAtom; ++i) {
+        int ifr = (int) fr;
-                    int2 atomData = pmeAtomGridIndex[i];
+        w = fr - ifr;
-                    int atomIndex = atomData.x;
+        int igrid1 = ifr-PME_ORDER+1;
-                    int z = atomData.y;
+        computeBSplinePoint(theta1, w, array);
-                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
+        w = pos.y*invPeriodicBoxSize.y;
-                    if (iz >= GRID_SIZE_Z)
+        fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
-                        iz -= GRID_SIZE_Z;
+        ifr = (int) fr;
-                    real atomCharge = posq[atomIndex].w;
+        w = fr - ifr;
-                    real atomDipoleX = xscale*labFrameDipole[atomIndex*3];
+        int igrid2 = ifr-PME_ORDER+1;
-                    real atomDipoleY = yscale*labFrameDipole[atomIndex*3+1];
+        computeBSplinePoint(theta2, w, array);
-                    real atomDipoleZ = zscale*labFrameDipole[atomIndex*3+2];
+        w = pos.z*invPeriodicBoxSize.z;
-                    real atomQuadrupoleXX = xscale*xscale*labFrameQuadrupole[atomIndex*5];
+        fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
-                    real atomQuadrupoleXY = 2*xscale*yscale*labFrameQuadrupole[atomIndex*5+1];
+        ifr = (int) fr;
-                    real atomQuadrupoleXZ = 2*xscale*zscale*labFrameQuadrupole[atomIndex*5+2];
+        w = fr - ifr;
-                    real atomQuadrupoleYY = yscale*yscale*labFrameQuadrupole[atomIndex*5+3];
+        int igrid3 = ifr-PME_ORDER+1;
-                    real atomQuadrupoleYZ = 2*yscale*zscale*labFrameQuadrupole[atomIndex*5+4];
+        computeBSplinePoint(theta3, w, array);
-                    real atomQuadrupoleZZ = -zscale*zscale*(labFrameQuadrupole[atomIndex*5]+labFrameQuadrupole[atomIndex*5+3]);
+        igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
-                    real4 t = theta1[atomIndex*PME_ORDER+ix];
+        igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
-                    real4 u = theta2[atomIndex*PME_ORDER+iy];
+        igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
-                    real4 v = theta3[atomIndex*PME_ORDER+iz];
+        // Spread this atom's charge, dipole and quadrupole onto the surrounding grid points.
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int xbase = igrid1+ix;
+            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+            xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
+            real4 t = theta1[ix];
+            for (int iy = 0; iy < PME_ORDER; iy++) {
+                int ybase = igrid2+iy;
+                ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                ybase = xbase + ybase*GRID_SIZE_Z;
+                real4 u = theta2[iy];
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int zindex = igrid3+iz;
+                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+                    int index = ybase + zindex;
+                    real4 v = theta3[iz];
+                    real atomCharge = pos.w;
+                    real atomDipoleX = xscale*labFrameDipole[m*3];
+                    real atomDipoleY = yscale*labFrameDipole[m*3+1];
+                    real atomDipoleZ = zscale*labFrameDipole[m*3+2];
+                    real atomQuadrupoleXX = xscale*xscale*labFrameQuadrupole[m*5];
+                    real atomQuadrupoleXY = 2*xscale*yscale*labFrameQuadrupole[m*5+1];
+                    real atomQuadrupoleXZ = 2*xscale*zscale*labFrameQuadrupole[m*5+2];
+                    real atomQuadrupoleYY = yscale*yscale*labFrameQuadrupole[m*5+3];
+                    real atomQuadrupoleYZ = 2*yscale*zscale*labFrameQuadrupole[m*5+4];
+                    real atomQuadrupoleZZ = -zscale*zscale*(labFrameQuadrupole[m*5]+labFrameQuadrupole[m*5+3]);
                    real term0 = atomCharge*u.x*v.x + atomDipoleY*u.y*v.x + atomDipoleZ*u.x*v.y + atomQuadrupoleYY*u.z*v.x + atomQuadrupoleZZ*u.x*v.z + atomQuadrupoleYZ*u.y*v.y;
                    real term1 = atomDipoleX*u.x*v.x + atomQuadrupoleXY*u.y*v.x + atomQuadrupoleXZ*u.x*v.y;
                    real term2 = atomQuadrupoleXX * u.x * v.x;
-                    result += term0*t.x + term1*t.y + term2*t.z;
+                    real add = term0*t.x + term1*t.y + term2*t.z;
-                }
+#ifdef USE_DOUBLE_PRECISION
-                if (z1 > gridPoint.z) {
+                    unsigned long long * ulonglong_p = (unsigned long long *) pmeGrid;
-                    gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
+                    atomicAdd(&ulonglong_p[2*index],  static_cast<unsigned long long>((long long) (add*0x100000000)));
-                    gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
+#else
-                    firstAtom = pmeAtomRange[gridIndex1];
+                    atomicAdd(&pmeGrid[index].x, add);
-                    lastAtom = pmeAtomRange[gridIndex2+1];
+#endif
-                    for (int i = firstAtom; i < lastAtom; ++i) {
-                        int2 atomData = pmeAtomGridIndex[i];
-                        int atomIndex = atomData.x;
-                        int z = atomData.y;
-                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
-                        if (iz >= GRID_SIZE_Z)
-                            iz -= GRID_SIZE_Z;
-                        real atomCharge = posq[atomIndex].w;
-                        real atomDipoleX = xscale*labFrameDipole[atomIndex*3];
-                        real atomDipoleY = yscale*labFrameDipole[atomIndex*3+1];
-                        real atomDipoleZ = zscale*labFrameDipole[atomIndex*3+2];
-                        real atomQuadrupoleXX = xscale*xscale*labFrameQuadrupole[atomIndex*5];
-                        real atomQuadrupoleXY = 2*xscale*yscale*labFrameQuadrupole[atomIndex*5+1];
-                        real atomQuadrupoleXZ = 2*xscale*zscale*labFrameQuadrupole[atomIndex*5+2];
-                        real atomQuadrupoleYY = yscale*yscale*labFrameQuadrupole[atomIndex*5+3];
-                        real atomQuadrupoleYZ = 2*yscale*zscale*labFrameQuadrupole[atomIndex*5+4];
-                        real atomQuadrupoleZZ = -zscale*zscale*(labFrameQuadrupole[atomIndex*5]+labFrameQuadrupole[atomIndex*5+3]);
-                        real4 t = theta1[atomIndex*PME_ORDER+ix];
-                        real4 u = theta2[atomIndex*PME_ORDER+iy];
-                        real4 v = theta3[atomIndex*PME_ORDER+iz];
-                        real term0 = atomCharge*u.x*v.x + atomDipoleY*u.y*v.x + atomDipoleZ*u.x*v.y + atomQuadrupoleYY*u.z*v.x + atomQuadrupoleZZ*u.x*v.z + atomQuadrupoleYZ*u.y*v.y;
-                        real term1 = atomDipoleX*u.x*v.x + atomQuadrupoleXY*u.y*v.x + atomQuadrupoleXZ*u.x*v.y;
-                        real term2 = atomQuadrupoleXX * u.x * v.x;
-                        result += term0*t.x + term1*t.y + term2*t.z;
-                    }
                }
            }
        }
-        pmeGrid[gridIndex] = make_real2(result, 0);
    }
 }
 extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole,
-        const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange,
+        const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
-        const real4* __restrict__ theta1, const real4* __restrict__ theta2, const real4* __restrict__ theta3, real4 invPeriodicBoxSize) {
+        real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
    const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
    const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
-    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+    real array[PME_ORDER*PME_ORDER];
-    unsigned int numThreads = gridDim.x*blockDim.x;
+    real4 theta1[PME_ORDER];
-    for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads) {
+    real4 theta2[PME_ORDER];
-        int3 gridPoint;
+    real4 theta3[PME_ORDER];
-        gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
-        int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
+    // Process the atoms in spatially sorted order.  This improves cache performance when loading
-        gridPoint.y = remainder/GRID_SIZE_Z;
+    // the grid values.
-        gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
-        real2 result = make_real2(0, 0);
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
-        for (int ix = 0; ix < PME_ORDER; ++ix) {
+        int m = pmeAtomGridIndex[i].x;
-            int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
+        real4 pos = posq[m];
-            for (int iy = 0; iy < PME_ORDER; ++iy) {
+        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
-                int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
+        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
-                int z1 = gridPoint.z-PME_ORDER+1;
+        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-                z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
-                int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
+        // Since we need the full set of thetas, it's faster to compute them here than load them
-                int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
+        // from global memory.
-                int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
-                int firstAtom = pmeAtomRange[gridIndex1];
+        real w = pos.x*invPeriodicBoxSize.x;
-                int lastAtom = pmeAtomRange[gridIndex2+1];
+        real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
-                for (int i = firstAtom; i < lastAtom; ++i) {
+        int ifr = (int) fr;
-                    int2 atomData = pmeAtomGridIndex[i];
+        w = fr - ifr;
-                    int atomIndex = atomData.x;
+        int igrid1 = ifr-PME_ORDER+1;
-                    int z = atomData.y;
+        computeBSplinePoint(theta1, w, array);
-                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
+        w = pos.y*invPeriodicBoxSize.y;
-                    if (iz >= GRID_SIZE_Z)
+        fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
-                        iz -= GRID_SIZE_Z;
+        ifr = (int) fr;
-                    real inducedDipoleX = xscale*inducedDipole[atomIndex*3];
+        w = fr - ifr;
-                    real inducedDipoleY = yscale*inducedDipole[atomIndex*3+1];
+        int igrid2 = ifr-PME_ORDER+1;
-                    real inducedDipoleZ = zscale*inducedDipole[atomIndex*3+2];
+        computeBSplinePoint(theta2, w, array);
-                    real inducedDipolePolarX = xscale*inducedDipolePolar[atomIndex*3];
+        w = pos.z*invPeriodicBoxSize.z;
-                    real inducedDipolePolarY = yscale*inducedDipolePolar[atomIndex*3+1];
+        fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
-                    real inducedDipolePolarZ = zscale*inducedDipolePolar[atomIndex*3+2];
+        ifr = (int) fr;
-                    real4 t = theta1[atomIndex*PME_ORDER+ix];
+        w = fr - ifr;
-                    real4 u = theta2[atomIndex*PME_ORDER+iy];
+        int igrid3 = ifr-PME_ORDER+1;
-                    real4 v = theta3[atomIndex*PME_ORDER+iz];
+        computeBSplinePoint(theta3, w, array);
+        igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
+        igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
+        igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
+        // Spread this atom's induced dipoles onto the surrounding grid points.
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int xbase = igrid1+ix;
+            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+            xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
+            real4 t = theta1[ix];
+            for (int iy = 0; iy < PME_ORDER; iy++) {
+                int ybase = igrid2+iy;
+                ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                ybase = xbase + ybase*GRID_SIZE_Z;
+                real4 u = theta2[iy];
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int zindex = igrid3+iz;
+                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+                    int index = ybase + zindex;
+                    real4 v = theta3[iz];
+                    real inducedDipoleX = xscale*inducedDipole[m*3];
+                    real inducedDipoleY = yscale*inducedDipole[m*3+1];
+                    real inducedDipoleZ = zscale*inducedDipole[m*3+2];
+                    real inducedDipolePolarX = xscale*inducedDipolePolar[m*3];
+                    real inducedDipolePolarY = yscale*inducedDipolePolar[m*3+1];
+                    real inducedDipolePolarZ = zscale*inducedDipolePolar[m*3+2];
                    real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y;
                    real term11 = inducedDipoleX*u.x*v.x;
                    real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y;
                    real term12 = inducedDipolePolarX*u.x*v.x;
-                    result.x += term01*t.x + term11*t.y;
+                    real add1 = term01*t.x + term11*t.y;
-                    result.y += term02*t.x + term12*t.y;
+                    real add2 = term02*t.x + term12*t.y;
-                }
+#ifdef USE_DOUBLE_PRECISION
-                if (z1 > gridPoint.z) {
+                    unsigned long long * ulonglong_p = (unsigned long long *) pmeGrid;
-                    gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
+                    atomicAdd(&ulonglong_p[2*index],  static_cast<unsigned long long>((long long) (add1*0x100000000)));
-                    gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
+                    atomicAdd(&ulonglong_p[2*index+1],  static_cast<unsigned long long>((long long) (add2*0x100000000)));
-                    firstAtom = pmeAtomRange[gridIndex1];
+#else
-                    lastAtom = pmeAtomRange[gridIndex2+1];
+                    atomicAdd(&pmeGrid[index].x, add1);
-                    for (int i = firstAtom; i < lastAtom; ++i) {
+                    atomicAdd(&pmeGrid[index].y, add2);
-                        int2 atomData = pmeAtomGridIndex[i];
+#endif
-                        int atomIndex = atomData.x;
-                        int z = atomData.y;
-                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
-                        if (iz >= GRID_SIZE_Z)
-                            iz -= GRID_SIZE_Z;
-                        real inducedDipoleX = xscale*inducedDipole[atomIndex*3];
-                        real inducedDipoleY = yscale*inducedDipole[atomIndex*3+1];
-                        real inducedDipoleZ = zscale*inducedDipole[atomIndex*3+2];
-                        real inducedDipolePolarX = xscale*inducedDipolePolar[atomIndex*3];
-                        real inducedDipolePolarY = yscale*inducedDipolePolar[atomIndex*3+1];
-                        real inducedDipolePolarZ = zscale*inducedDipolePolar[atomIndex*3+2];
-                        real4 t = theta1[atomIndex*PME_ORDER+ix];
-                        real4 u = theta2[atomIndex*PME_ORDER+iy];
-                        real4 v = theta3[atomIndex*PME_ORDER+iz];
-                        real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y;
-                        real term11 = inducedDipoleX*u.x*v.x;
-                        real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y;
-                        real term12 = inducedDipolePolarX*u.x*v.x;
-                        result.x += term01*t.x + term11*t.y;
-                        result.y += term02*t.x + term12*t.y;
-                    }
                }
            }
        }
-        pmeGrid[gridIndex] = result;
    }
 }
+/**
+ * In double precision, the grid values are accumulated in fixed point (scaled by 2^32); convert them in place to floating point.
+ */
+extern "C" __global__ void finishSpreadCharge(long long* __restrict__ pmeGrid) {
+    real* floatGrid = (real*) pmeGrid; // same buffer viewed as reals; NOTE(review): in-place conversion assumes sizeof(real) == sizeof(long long) -- confirm
+    const unsigned int gridSize = 2*GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; // two values are stored per grid point
+    real scale = 1/(real) 0x100000000; // undoes the 0x100000000 factor applied when the values were accumulated
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
+        floatGrid[index] = scale*pmeGrid[index];
+}
 extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, const real* __restrict__ pmeBsplineModuliX,
        const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
@@ -372,12 +337,50 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
 }
 extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phi,
-        long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers, const int4* __restrict__ igrid, const real4* __restrict__ theta1,
+        long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers,  const real4* __restrict__ posq,
-        const real4* __restrict__ theta2, const real4* __restrict__ theta3, const real* __restrict__ labFrameDipole, real4 invPeriodicBoxSize) {
+        const real* __restrict__ labFrameDipole, real4 periodicBoxSize, real4 invPeriodicBoxSize, int2* __restrict__ pmeAtomGridIndex) {
-    // extract the permanent multipole field at each site
+    real array[PME_ORDER*PME_ORDER];
+    real4 theta1[PME_ORDER];
+    real4 theta2[PME_ORDER];
+    real4 theta3[PME_ORDER];
+    // Process the atoms in spatially sorted order.  This improves cache performance when loading
+    // the grid values.
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        int m = pmeAtomGridIndex[i].x;
+        real4 pos = posq[m];
+        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+        // Since we need the full set of thetas, it's faster to compute them here than load them
+        // from global memory.
+        real w = pos.x*invPeriodicBoxSize.x;
+        real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
+        int ifr = (int) fr;
+        w = fr - ifr;
+        int igrid1 = ifr-PME_ORDER+1;
+        computeBSplinePoint(theta1, w, array);
+        w = pos.y*invPeriodicBoxSize.y;
+        fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
+        ifr = (int) fr;
+        w = fr - ifr;
+        int igrid2 = ifr-PME_ORDER+1;
+        computeBSplinePoint(theta2, w, array);
+        w = pos.z*invPeriodicBoxSize.z;
+        fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
+        ifr = (int) fr;
+        w = fr - ifr;
+        int igrid3 = ifr-PME_ORDER+1;
+        computeBSplinePoint(theta3, w, array);
+        igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
+        igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
+        igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
+        // Compute the potential at this atom from the surrounding grid points.
-    for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
-        int4 gridPoint = igrid[m];
        real tuv000 = 0;
        real tuv001 = 0;
        real tuv010 = 0;
@@ -399,8 +402,8 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
        real tuv012 = 0;
        real tuv111 = 0;
        for (int iz = 0; iz < PME_ORDER; iz++) {
-            int k = gridPoint.z+iz-(gridPoint.z+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+            int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
-            real4 v = theta3[m*PME_ORDER+iz];
+            real4 v = theta3[iz];
            real tu00 = 0;
            real tu10 = 0;
            real tu01 = 0;
@@ -412,14 +415,14 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
            real tu12 = 0;
            real tu03 = 0;
            for (int iy = 0; iy < PME_ORDER; iy++) {
-                int j = gridPoint.y+iy-(gridPoint.y+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                int j = igrid2+iy-(igrid2+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
-                real4 u = theta2[m*PME_ORDER+iy];
+                real4 u = theta2[iy];
                real4 t = make_real4(0, 0, 0, 0);
                for (int ix = 0; ix < PME_ORDER; ix++) {
-                    int i = gridPoint.x+ix-(gridPoint.x+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+                    int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
                    int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
                    real tq = pmeGrid[gridIndex].x;
-                    real4 tadd = theta1[m*PME_ORDER+ix];
+                    real4 tadd = theta1[ix];
                    t.x += tq*tadd.x;
                    t.y += tq*tadd.y;
                    t.z += tq*tadd.z;
@@ -491,12 +494,50 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
 }
 extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phid,
-        real* __restrict__ phip, real* __restrict__ phidp, const int4* __restrict__ igrid, const real4* __restrict__ theta1,
+        real* __restrict__ phip, real* __restrict__ phidp, const real4* __restrict__ posq,
-        const real4* __restrict__ theta2, const real4* __restrict__ theta3, real4 invPeriodicBoxSize) {
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, int2* __restrict__ pmeAtomGridIndex) {
-    // extract the induced dipole field at each site
+    real array[PME_ORDER*PME_ORDER];
+    real4 theta1[PME_ORDER];
+    real4 theta2[PME_ORDER];
+    real4 theta3[PME_ORDER];
+    // Process the atoms in spatially sorted order.  This improves cache performance when loading
+    // the grid values.
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        int m = pmeAtomGridIndex[i].x;
+        real4 pos = posq[m];
+        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+        // Since we need the full set of thetas, it's faster to compute them here than load them
+        // from global memory.
+        real w = pos.x*invPeriodicBoxSize.x;
+        real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
+        int ifr = (int) fr;
+        w = fr - ifr;
+        int igrid1 = ifr-PME_ORDER+1;
+        computeBSplinePoint(theta1, w, array);
+        w = pos.y*invPeriodicBoxSize.y;
+        fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
+        ifr = (int) fr;
+        w = fr - ifr;
+        int igrid2 = ifr-PME_ORDER+1;
+        computeBSplinePoint(theta2, w, array);
+        w = pos.z*invPeriodicBoxSize.z;
+        fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
+        ifr = (int) fr;
+        w = fr - ifr;
+        int igrid3 = ifr-PME_ORDER+1;
+        computeBSplinePoint(theta3, w, array);
+        igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
+        igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
+        igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
+        // Compute the potential at this atom from the surrounding grid points.
-    for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
-        int4 gridPoint = igrid[m];
        real tuv100_1 = 0;
        real tuv010_1 = 0;
        real tuv001_1 = 0;
@@ -536,8 +577,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
        real tuv012 = 0;
        real tuv111 = 0;
        for (int iz = 0; iz < PME_ORDER; iz++) {
-            int k = gridPoint.z+iz-(gridPoint.z+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+            int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
-            real4 v = theta3[m*PME_ORDER+iz];
+            real4 v = theta3[iz];
            real tu00_1 = 0;
            real tu01_1 = 0;
            real tu10_1 = 0;
@@ -561,8 +602,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
            real tu12 = 0;
            real tu03 = 0;
            for (int iy = 0; iy < PME_ORDER; iy++) {
-                int j = gridPoint.y+iy-(gridPoint.y+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                int j = igrid2+iy-(igrid2+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
-                real4 u = theta2[m*PME_ORDER+iy];
+                real4 u = theta2[iy];
                real t0_1 = 0;
                real t1_1 = 0;
                real t2_1 = 0;
@@ -571,10 +612,10 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
                real t2_2 = 0;
                real t3 = 0;
                for (int ix = 0; ix < PME_ORDER; ix++) {
-                    int i = gridPoint.x+ix-(gridPoint.x+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+                    int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
                    int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
                    real2 tq = pmeGrid[gridIndex];
-                    real4 tadd = theta1[m*PME_ORDER+ix];
+                    real4 tadd = theta1[ix];
                    t0_1 += tq.x*tadd.x;
                    t1_1 += tq.x*tadd.y;
                    t2_1 += tq.x*tadd.z;

--- a/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
-#define TILE_SIZE 32
 #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
 typedef struct {
@@ -182,253 +181,223 @@ __device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
 */
 extern "C" __global__ void computeElectrostatics(
        unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer,
-        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
+        const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
-        const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices,
+        const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices,
 #ifdef USE_CUTOFF
-        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
 #endif
        const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
        const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
-    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; // number of TILE_SIZE-thread groups in the whole grid
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global index of this thread's group
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // this thread's position within its group (the mask assumes TILE_SIZE is a power of two)
+    const unsigned int tbx = threadIdx.x - tgx; // block-local index of the first thread in this group
+    real energy = 0; // per-thread energy accumulator, added to energyBuffer at the end
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE]; // one AtomData slot per thread of the block
+    // First loop: process tiles that contain exclusions.
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; // start of this group's share of [FIRST_EXCLUSION_TILE, LAST_EXCLUSION_TILE)
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; // one past the end of this group's share
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
+        AtomData data;
+        unsigned int atom1 = x*TILE_SIZE + tgx;
+        loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+        data.force = make_real3(0);
+        data.torque = make_real3(0);
+        uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx]; // flags are indexed by exclusion-tile position and lane, not by atom index
+        unsigned int polarizationGroup = polarizationGroupFlags[pos*TILE_SIZE+tgx];
+        if (x == y) {
+            // This tile is on the diagonal.
+            localData[threadIdx.x].pos = data.pos;
+            localData[threadIdx.x].q = data.q;
+            localData[threadIdx.x].dipole = data.dipole;
+            localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
+            localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
+            localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
+            localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
+            localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
+            localData[threadIdx.x].inducedDipole = data.inducedDipole;
+            localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
+            localData[threadIdx.x].thole = data.thole;
+            localData[threadIdx.x].damp = data.damp;
+            // Compute forces.
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                int atom2 = y*TILE_SIZE+j;
+                if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    float d = computeDScaleFactor(polarizationGroup, j);
+                    float p = computePScaleFactor(covalent, polarizationGroup, j);
+                    float m = computeMScaleFactor(covalent, j);
+                    computeOneInteraction(data, localData[tbx+j], true, d, p, m, 0.5f, energy, periodicBoxSize, invPeriodicBoxSize); // 0.5f: presumably because each pair in a diagonal tile is visited by both of its threads
+                }
+            }
+            if (atom1 < NUM_ATOMS)
+                computeSelfEnergyAndTorque(data, energy);
+            data.force *= -ENERGY_SCALE_FACTOR; // note: force and torque are scaled with opposite signs
+            data.torque *= ENERGY_SCALE_FACTOR;
+            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000))); // accumulate as 64-bit fixed point (value scaled by 2^32); x, y, z are stored PADDED_NUM_ATOMS apart
+            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
+            atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
+        }
+        else {
+            // This is an off-diagonal tile.
+            unsigned int j = y*TILE_SIZE + tgx;
+            loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            localData[threadIdx.x].force = make_real3(0);
+            localData[threadIdx.x].torque = make_real3(0);
+            unsigned int tj = tgx; // each lane starts at a different partner slot and rotates through the tile
+            for (j = 0; j < TILE_SIZE; j++) {
+                int atom2 = y*TILE_SIZE+tj;
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    float d = computeDScaleFactor(polarizationGroup, tj);
+                    float p = computePScaleFactor(covalent, polarizationGroup, tj);
+                    float m = computeMScaleFactor(covalent, tj);
+                    computeOneInteraction(data, localData[tbx+tj], true, d, p, m, 1, energy, periodicBoxSize, invPeriodicBoxSize);
+                }
+                tj = (tj + 1) & (TILE_SIZE - 1); // advance to the next partner with wrap-around
+            }
+            data.force *= -ENERGY_SCALE_FACTOR;
+            data.torque *= ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].torque *= ENERGY_SCALE_FACTOR;
+            unsigned int offset = x*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
+            offset = y*TILE_SIZE + tgx; // now write the values accumulated in localData for the y block of atoms
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
+        }
+    }
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
 #ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
-    unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
+    int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps); // when numTiles > maxTiles, split the startTileIndex/numTileIndices range instead of the tile list
-    unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
+    int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
 #else
    const unsigned int numTiles = numTileIndices;
-    unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
+    int pos = startTileIndex+warp*numTiles/totalWarps;
-    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
+    int end = startTileIndex+(warp+1)*numTiles/totalWarps;
-#endif
-    real energy = 0;
-    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
-    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
-    __shared__ int exclusionIndex[WARPS_PER_GROUP];
-#ifndef ENABLE_SHUFFLE
-    __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
 #endif
+    int skipBase = 0; // offset into exclusionTiles of the next batch to load into skipTiles
+    int currentSkipIndex = tbx; // cursor into this group's section of skipTiles
+    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
+    __shared__ int skipTiles[THREAD_BLOCK_SIZE];
+    skipTiles[threadIdx.x] = -1; // -1 is below any non-negative pos, so the first refill check below succeeds
-    do {
+    while (pos < end) {
-        // Extract the coordinates of this tile
+        bool includeTile = true;
-        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
-        const unsigned int tbx = threadIdx.x - tgx;
+        // Extract the coordinates of this tile.
-        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
        unsigned int x, y;
-        AtomData data;
-        if (pos < end) {
 #ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
+        if (numTiles <= maxTiles) { // tile list is usable: only x is read here, y is left unset on this path
-                ushort2 tileIndices = tiles[pos];
+            ushort2 tileIndices = tiles[pos];
-                x = tileIndices.x;
+            x = tileIndices.x;
-                y = tileIndices.y;
+        }
-            }
+        else
-            else
 #endif
-            {
+        {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); // recover (x, y) from the linear tile index pos
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+            }
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            // Skip over tiles that have exclusions, since they were already processed.
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) { // refill once pos has passed the last cached entry
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; // linear index of this exclusion tile (inverse of the x computation above)
                }
+                else
+                    skipTiles[threadIdx.x] = end; // sentinel: pos stays below end inside this loop, so it never matches
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
            }
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos); // false when pos is one of the cached exclusion tiles
+        }
+        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            AtomData data;
            loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
            data.force = make_real3(0);
            data.torque = make_real3(0);
-            // Locate the exclusion data for this tile.
-            if (tgx < 2)
-                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-            if (tgx == 0)
-                exclusionIndex[localGroupIndex] = -1;
-            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-                if (exclusionIndices[i] == y)
-                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
-            if (pos >= end)
-                ; // This warp is done.
-            else if (x == y) {
-                // This tile is on the diagonal.
-                localData[threadIdx.x].pos = data.pos;
-                localData[threadIdx.x].q = data.q;
-                localData[threadIdx.x].dipole = data.dipole;
-                localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
-                localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
-                localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
-                localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
-                localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
-                localData[threadIdx.x].inducedDipole = data.inducedDipole;
-                localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
-                localData[threadIdx.x].thole = data.thole;
-                localData[threadIdx.x].damp = data.damp;
-                uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
-                unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
-                // Compute forces.
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    int atom2 = y*TILE_SIZE+j;
-                    if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        float d = computeDScaleFactor(polarizationGroup, j);
-                        float p = computePScaleFactor(covalent, polarizationGroup, j);
-                        float m = computeMScaleFactor(covalent, j);
-                        computeOneInteraction(data, localData[tbx+j], hasExclusions, d, p, m, 0.5f, energy, periodicBoxSize, invPeriodicBoxSize);
-                    }
-                }
-                if (atom1 < NUM_ATOMS)
-                    computeSelfEnergyAndTorque(data, energy);
-                data.force *= -ENERGY_SCALE_FACTOR;
-                data.torque *= ENERGY_SCALE_FACTOR;
-                atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-                atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-                atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-                atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
-                atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
-                atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
-            }
-            else {
-                // This is an off-diagonal tile.
-                unsigned int j = y*TILE_SIZE + tgx;
-                loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
-                localData[threadIdx.x].force = make_real3(0);
-                localData[threadIdx.x].torque = make_real3(0);
 #ifdef USE_CUTOFF
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx); // with a usable tile list, the second atom index comes from interactingAtoms rather than from y
-                if (!hasExclusions && flags == 0) { // TODO: Why doesn't the flags != 0 block work?
-//                if (!hasExclusions && flags != 0xFFFFFFFF) {
-                    if (flags == 0) {
-                        // No interactions in this tile.
-                    }
-                    else {
-                        // Compute only a subset of the interactions in this tile.
-                        for (j = 0; j < TILE_SIZE; j++) {
-                            if ((flags&(1<<j)) != 0) {
-                                int atom2 = tbx+j;
-                                real3 oldForce = localData[atom2].force;
-                                real3 oldTorque = localData[atom2].torque;
-                                localData[atom2].force = make_real3(0);
-                                localData[atom2].torque = make_real3(0);
-                                computeOneInteraction(data, localData[tbx+j], false, 1, 1, 1, 1, energy, periodicBoxSize, invPeriodicBoxSize);
-                                real3 newForce = localData[atom2].force;
-                                real3 newTorque = localData[atom2].torque;
-                                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-#ifdef ENABLE_SHUFFLE
-                                    for (int i = 16; i >= 1; i /= 2) {
-                                        newForce.x += __shfl_xor(newForce.x, i, 32);
-                                        newForce.y += __shfl_xor(newForce.y, i, 32);
-                                        newForce.z += __shfl_xor(newForce.z, i, 32);
-                                        newTorque.x += __shfl_xor(newTorque.x, i, 32);
-                                        newTorque.y += __shfl_xor(newTorque.y, i, 32);
-                                        newTorque.z += __shfl_xor(newTorque.z, i, 32);
-                                    }
-                                    if (tgx == 0) {
-                                        localData[atom2].force -= newForce;
-                                        localData[atom2].torque -= newTorque;
-                                    }
 #else
-                                    int bufferIndex = 3*threadIdx.x;
+            unsigned int j = y*TILE_SIZE + tgx;
-                                    tempBuffer[bufferIndex] = newForce.x;
-                                    tempBuffer[bufferIndex+1] = newForce.y;
-                                    tempBuffer[bufferIndex+2] = newForce.z;
-                                    if (tgx % 4 == 0) {
-                                        tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
-                                        tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
-                                        tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
-                                    }
-                                    if (tgx == 0) {
-                                        localData[atom2].force.x -= tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
-                                        localData[atom2].force.y -= tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
-                                        localData[atom2].force.z -= tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
-                                    }
-                                    tempBuffer[bufferIndex] = newTorque.x;
-                                    tempBuffer[bufferIndex+1] = newTorque.y;
-                                    tempBuffer[bufferIndex+2] = newTorque.z;
-                                    if (tgx % 4 == 0) {
-                                        tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
-                                        tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
-                                        tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
-                                    }
-                                    if (tgx == 0) {
-                                        localData[atom2].torque.x -= tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
-                                        localData[atom2].torque.y -= tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
-                                        localData[atom2].torque.z -= tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
-                                    }
-#endif
-                                }
-                            }
-                        }
-                        data.force *= -ENERGY_SCALE_FACTOR;
-                        data.torque *= -ENERGY_SCALE_FACTOR;
-                        localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
-                        localData[threadIdx.x].torque *= -ENERGY_SCALE_FACTOR;
-                        if (pos < end) {
-                            unsigned int offset = x*TILE_SIZE + tgx;
-                            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-                            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-                            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-                            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
-                            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
-                            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
-                            offset = y*TILE_SIZE + tgx;
-                            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-                            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-                            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
-                            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
-                            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
-                            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
-                        }
-                    }
-                }
-                else
 #endif
-                {
+            atomIndices[threadIdx.x] = j; // keep each lane's second-atom index for the bounds check and the write-back below
-                    // Compute the full set of interactions in this tile.
+            loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            localData[threadIdx.x].force = make_real3(0);
-                    uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
+            localData[threadIdx.x].torque = make_real3(0);
-                    unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
+            // Compute forces.
-                    // Compute forces.
+            unsigned int tj = tgx;
-                    unsigned int tj = tgx;
+            for (j = 0; j < TILE_SIZE; j++) {
-                    for (j = 0; j < TILE_SIZE; j++) {
+                int atom2 = atomIndices[tbx+tj];
-                        int atom2 = y*TILE_SIZE+tj;
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    computeOneInteraction(data, localData[tbx+tj], false, 1, 1, 1, 1, energy, periodicBoxSize, invPeriodicBoxSize); // tiles here carry no exclusions, so all scale factors are passed as 1
-                            float d = computeDScaleFactor(polarizationGroup, tj);
-                            float p = computePScaleFactor(covalent, polarizationGroup, tj);
-                            float m = computeMScaleFactor(covalent, tj);
-                            computeOneInteraction(data, localData[tbx+tj], hasExclusions, d, p, m, 1, energy, periodicBoxSize, invPeriodicBoxSize);
-                        }
-                        tj = (tj + 1) & (TILE_SIZE - 1);
-                    }
-                    data.force *= -ENERGY_SCALE_FACTOR;
-                    data.torque *= ENERGY_SCALE_FACTOR;
-                    localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
-                    localData[threadIdx.x].torque *= ENERGY_SCALE_FACTOR;
-                    if (pos < end) {
-                        unsigned int offset = x*TILE_SIZE + tgx;
-                        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-                        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-                        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-                        atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
-                        atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
-                        atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
-                        offset = y*TILE_SIZE + tgx;
-                        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-                        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-                        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
-                        atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
-                        atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
-                        atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
-                    }
                }
+                tj = (tj + 1) & (TILE_SIZE - 1);
            }
+            data.force *= -ENERGY_SCALE_FACTOR;
+            data.torque *= ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].torque *= ENERGY_SCALE_FACTOR;
+            // Write results.
+            unsigned int offset = x*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
+#ifdef USE_CUTOFF
+            offset = atomIndices[threadIdx.x]; // write back to whichever atom this lane loaded into localData
+#else
+            offset = y*TILE_SIZE + tgx;
+#endif
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
        }
        pos++;
-    } while (pos < end);
+    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy*ENERGY_SCALE_FACTOR;
 }