Restructured the use of force buffers in a new way that hopefully really works everywhere.

7943a339 · Peter Eastman · 13ef0ee8 · 7943a339 · 7943a339 · 7943a339
Commit 7943a339 authored May 09, 2011 by Peter Eastman
15 changed files
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1738,6 +1738,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
        defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
        defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
+        if (cl.getSIMDWidth() == 32)
+            defines["WARPS_PER_GROUP"] = OpenCLExpressionUtilities::intToString(cl.getNonbondedUtilities().getForceThreadBlockSize()/OpenCLContext::TileSize);
        string file;
        if (deviceIsCpu)
            file = OpenCLKernelSources::gbsaObc_cpu;
@@ -1753,7 +1755,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer());
        computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*13*sizeof(cl_float), NULL);
        computeBornSumKernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
-        computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
        if (nb.getUseCutoff()) {
            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
@@ -1773,7 +1774,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        force1Kernel.setArg<cl::Buffer>(index++, bornForce->getDeviceBuffer());
        force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*13*sizeof(cl_float), NULL);
        force1Kernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(mm_float4), NULL);
-        force1Kernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
        if (nb.getUseCutoff()) {
            force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
@@ -1805,14 +1805,14 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        reduceBornForceKernel.setArg<cl::Buffer>(6, obcChain->getDeviceBuffer());
    }
    if (nb.getUseCutoff()) {
-        computeBornSumKernel.setArg<mm_float4>(8, cl.getPeriodicBoxSize());
-        computeBornSumKernel.setArg<mm_float4>(9, cl.getInvPeriodicBoxSize());
-        force1Kernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
-        force1Kernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());
+        computeBornSumKernel.setArg<mm_float4>(7, cl.getPeriodicBoxSize());
+        computeBornSumKernel.setArg<mm_float4>(8, cl.getInvPeriodicBoxSize());
+        force1Kernel.setArg<mm_float4>(9, cl.getPeriodicBoxSize());
+        force1Kernel.setArg<mm_float4>(10, cl.getInvPeriodicBoxSize());
        if (maxTiles < nb.getInteractingTiles().getSize()) {
            maxTiles = nb.getInteractingTiles().getSize();
            computeBornSumKernel.setArg<cl_uint>(10, maxTiles);
-            force1Kernel.setArg<cl_uint>(12, maxTiles);
+            force1Kernel.setArg<cl_uint>(11, maxTiles);
        }
    }
    cl.executeKernel(computeBornSumKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
@@ -2148,7 +2148,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
        }
        map<string, string> replacements;
        replacements["COMPUTE_INTERACTION"] = n2EnergySource.str();
-        stringstream extraArgs, loadLocal1, loadLocal2, clearLocal, load1, load2, recordDeriv, storeDerivs1, storeDerivs2, declareTemps, setTemps;
+        stringstream extraArgs, loadLocal1, loadLocal2, clearLocal, load1, load2, declare1, recordDeriv, storeDerivs1, storeDerivs2, declareTemps, setTemps;
        if (force.getNumGlobalParameters() > 0)
            extraArgs << ", __constant float* globals";
        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
@@ -2174,7 +2174,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
            string index = intToString(i+1);
            extraArgs << ", __global " << buffer.getType() << "* derivBuffers" << index << ", __local " << buffer.getType() << "* local_deriv" << index;
            clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
-            load1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
+            declare1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
            load2 << buffer.getType() << " deriv" << index << "_2 = 0.0f;\n";
            recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
            storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")";
@@ -2188,6 +2188,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
        replacements["CLEAR_LOCAL_DERIVATIVES"] = clearLocal.str();
        replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
        replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
+        replacements["DECLARE_ATOM1_DERIVATIVES"] = declare1.str();
        replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str();
        replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str();
        replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
@@ -2482,7 +2483,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
        pairValueKernel.setArg<cl::Buffer>(index++, valueBuffers->getDeviceBuffer());
        pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
        pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
-        pairValueKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
        if (nb.getUseCutoff()) {
            pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
@@ -2531,7 +2531,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
        pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
-        pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
        if (nb.getUseCutoff()) {
            pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
@@ -2609,14 +2608,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
            globals->upload(globalParamValues);
    }
    if (nb.getUseCutoff()) {
-        pairValueKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize());
-        pairValueKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize());
-        pairEnergyKernel.setArg<mm_float4>(12, cl.getPeriodicBoxSize());
-        pairEnergyKernel.setArg<mm_float4>(13, cl.getInvPeriodicBoxSize());
+        pairValueKernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
+        pairValueKernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());
+        pairEnergyKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize());
+        pairEnergyKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize());
        if (maxTiles < nb.getInteractingTiles().getSize()) {
            maxTiles = nb.getInteractingTiles().getSize();
-            pairValueKernel.setArg<cl_uint>(13, maxTiles);
-            pairEnergyKernel.setArg<cl_uint>(14, maxTiles);
+            pairValueKernel.setArg<cl_uint>(12, maxTiles);
+            pairEnergyKernel.setArg<cl_uint>(13, maxTiles);
        }
    }
    cl.executeKernel(pairValueKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -37,7 +37,7 @@ using namespace std;

 OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false),
        numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
-        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), forceBufferFlags(NULL) {
+        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL) {
    // Decide how many thread blocks and force buffers to use.

    deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
@@ -48,8 +48,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
        numForceBuffers = numForceThreadBlocks;
    }
    else if (context.getSIMDWidth() == 32) {
-        numForceThreadBlocks = 2*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
-        forceThreadBlockSize = 256;
+        numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+        forceThreadBlockSize = 128;
        numForceBuffers = numForceThreadBlocks;
    }
    else {
@@ -82,8 +82,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
        delete blockCenter;
    if (blockBoundingBox != NULL)
        delete blockBoundingBox;
-    if (forceBufferFlags != NULL)
-        delete forceBufferFlags;
 }

 void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel) {
@@ -239,12 +237,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
        interactionCount->upload();
    }

-    // Create the flags for reserving force buffers.
-    
-    forceBufferFlags = new OpenCLArray<cl_uint>(context, numAtomBlocks*numForceThreadBlocks, "forceBufferFlags", false);
-    vector<cl_uint> forceBufferFlagsVec(forceBufferFlags->getSize(), 0);
-    forceBufferFlags->upload(forceBufferFlagsVec);
-
    // Create kernels.

    forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
@@ -320,8 +312,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
 void OpenCLNonbondedUtilities::computeInteractions() {
    if (cutoff != -1.0) {
        if (useCutoff) {
-            forceKernel.setArg<mm_float4>(13, context.getPeriodicBoxSize());
-            forceKernel.setArg<mm_float4>(14, context.getInvPeriodicBoxSize());
+            forceKernel.setArg<mm_float4>(12, context.getPeriodicBoxSize());
+            forceKernel.setArg<mm_float4>(13, context.getInvPeriodicBoxSize());
        }
        context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
@@ -343,14 +335,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
        newSize = numTiles;
    delete interactingTiles;
    interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles");
-    forceKernel.setArg<cl::Buffer>(11, interactingTiles->getDeviceBuffer());
-    forceKernel.setArg<cl_uint>(15, newSize);
+    forceKernel.setArg<cl::Buffer>(10, interactingTiles->getDeviceBuffer());
+    forceKernel.setArg<cl_uint>(14, newSize);
    findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
    findInteractingBlocksKernel.setArg<cl_uint>(9, newSize);
    if (context.getSIMDWidth() == 32 || deviceIsCpu) {
        delete interactionFlags;
        interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags");
-        forceKernel.setArg<cl::Buffer>(16, interactionFlags->getDeviceBuffer());
+        forceKernel.setArg<cl::Buffer>(15, interactionFlags->getDeviceBuffer());
        findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
        findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
        findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
@@ -503,7 +495,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    kernel.setArg(index++, 4*forceThreadBlockSize*sizeof(cl_float), NULL);
    kernel.setArg<cl_uint>(index++, startTileIndex);
    kernel.setArg<cl_uint>(index++, startTileIndex+numTiles);
-    kernel.setArg<cl::Buffer>(index++, forceBufferFlags->getDeviceBuffer());
    if (useCutoff) {
        kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer());
        kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer());

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.h
@@ -196,12 +196,6 @@ public:
    OpenCLArray<cl_uint>& getExclusionRowIndices() {
        return *exclusionRowIndices;
    }
-    /**
-     * Get the array which contains flags for reserving force buffers.
-     */
-    OpenCLArray<cl_uint>& getForceBufferFlags() {
-        return *forceBufferFlags;
-    }
    /**
     * Get the index of the first tile this context is responsible for processing.
     */
@@ -245,7 +239,6 @@ private:
    OpenCLArray<cl_uint>* interactionCount;
    OpenCLArray<mm_float4>* blockCenter;
    OpenCLArray<mm_float4>* blockBoundingBox;
-    OpenCLArray<cl_uint>* forceBufferFlags;
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
    std::vector<ParameterInfo> arguments;

--- a/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
@@ -8,7 +8,7 @@

 __kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
 	__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
-        __global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
+        __global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else

--- a/platforms/opencl/src/kernels/customGBEnergyN2_default.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_default.cl
@@ -9,7 +9,7 @@
 __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
 void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
 	__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
-        __global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer, __global unsigned int* forceBufferFlags,
+        __global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
 #else

--- a/platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #define TILE_SIZE 32
-#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1;
-#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[get_local_id(0)];
-
-/**
- * Mark that a block in the force buffer is in use.
- */
-void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
-            ;
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-}
-
-/**
- * Mark that a block in the force buffer is no longer in use.
- */
-void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
-}
+#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
+#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)];

 /**
 * Compute a force based on pair interactions.
 */
 __kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
 	__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
-        __global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
+        __global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else
@@ -48,132 +29,67 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
    unsigned int lasty = 0xFFFFFFFF;
    __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __local int exclusionIndex[WARPS_PER_GROUP];
-
-    while (pos < end) {
+    __local int2* reservedBlocks = (__local int2*) exclusionRange;
+    
+    do {
        // Extract the coordinates of this tile
-        unsigned int x, y;
-#ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
-            ushort2 tileIndices = tiles[pos];
-            x = tileIndices.x;
-            y = tileIndices.y;
-        }
-        else
-#endif
-        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
-        }
        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
        const unsigned int tbx = get_local_id(0) - tgx;
        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
-        unsigned int atom1 = x*TILE_SIZE + tgx;
+        unsigned int x, y;
        float4 force = 0.0f;
-        float4 posq1 = posq[atom1];
-        LOAD_ATOM1_PARAMETERS
-
-        // Locate the exclusion data for this tile.
-
-#ifdef USE_EXCLUSIONS
-        if (tgx < 2)
-            exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-        if (tgx == 0)
-            exclusionIndex[localGroupIndex] = -1;
-        for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-            if (exclusionIndices[i] == y)
-                exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-        bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
-#else
-        bool hasExclusions = false;
-#endif
-        if (x == y) {
-            // This tile is on the diagonal.
-
-            const unsigned int localAtomIndex = get_local_id(0);
-            local_posq[localAtomIndex] = posq1;
-            LOAD_LOCAL_PARAMETERS_FROM_1
-#ifdef USE_EXCLUSIONS
-            unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
-#endif
-            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                bool isExcluded = !(excl & 0x1);
-#endif
-                int atom2 = tbx+j;
-                float4 posq2 = local_posq[atom2];
-                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
-#ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+        DECLARE_ATOM1_DERIVATIVES
+        if (pos < end) {
 #ifdef USE_CUTOFF
-                if (r2 < CUTOFF_SQUARED) {
-#endif
-                float r = SQRT(r2);
-                LOAD_ATOM2_PARAMETERS
-                atom2 = y*TILE_SIZE+j;
-                float dEdR = 0.0f;
-                float tempEnergy = 0.0f;
-                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
-                    COMPUTE_INTERACTION
-                    dEdR /= -r;
-                }
-                energy += 0.5f*tempEnergy;
-                delta.xyz *= dEdR;
-                force.xyz -= delta.xyz;
-#ifdef USE_CUTOFF
-                }
-#endif
-#ifdef USE_EXCLUSIONS
-                excl >>= 1;
+            if (numTiles <= maxTiles) {
+                ushort2 tileIndices = tiles[pos];
+                x = tileIndices.x;
+                y = tileIndices.y;
+            }
+            else
 #endif
+            {
+                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                    y += (x < y ? -1 : 1);
+                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                }
            }
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            float4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS

-            // Write results
-
-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset1].xyz += force.xyz;
-            STORE_DERIVATIVES_1
-            releaseBuffer(x, forceBufferFlags);
-        }
-        else {
-            // This is an off-diagonal tile.
+            // Locate the exclusion data for this tile.

-            const unsigned int localAtomIndex = get_local_id(0);
-            if (lasty != y) {
-                unsigned int j = y*TILE_SIZE + tgx;
-                local_posq[localAtomIndex] = posq[j];
-                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-            }
-            local_force[localAtomIndex] = 0.0f;
-            CLEAR_LOCAL_DERIVATIVES
-#ifdef USE_CUTOFF
-            unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-            if (!hasExclusions && flags == 0) {
-                // No interactions in this tile.
-            }
-            else
+#ifdef USE_EXCLUSIONS
+            if (tgx < 2)
+                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
+            if (tgx == 0)
+                exclusionIndex[localGroupIndex] = -1;
+            for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
+                if (exclusionIndices[i] == y)
+                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
+            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
+#else
+            bool hasExclusions = false;
 #endif
-            {
-                // Compute the full set of interactions in this tile.
+            if (pos >= end)
+                ; // This warp is done.
+            else if (x == y) {
+                // This tile is on the diagonal.

+                const unsigned int localAtomIndex = get_local_id(0);
+                local_posq[localAtomIndex] = posq1;
+                LOAD_LOCAL_PARAMETERS_FROM_1
 #ifdef USE_EXCLUSIONS
-                unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
-                excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
 #endif
-                unsigned int tj = tgx;
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
 #ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
 #endif
-                    int atom2 = tbx+tj;
+                    int atom2 = tbx+j;
                    float4 posq2 = local_posq[atom2];
                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
@@ -187,44 +103,149 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
 #endif
                    float r = SQRT(r2);
                    LOAD_ATOM2_PARAMETERS
-                    atom2 = y*TILE_SIZE+tj;
+                    atom2 = y*TILE_SIZE+j;
                    float dEdR = 0.0f;
                    float tempEnergy = 0.0f;
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
                        COMPUTE_INTERACTION
                        dEdR /= -r;
                    }
-		    energy += tempEnergy;
+                    energy += 0.5f*tempEnergy;
                    delta.xyz *= dEdR;
                    force.xyz -= delta.xyz;
-                    atom2 = tbx+tj;
-                    local_force[atom2].xyz += delta.xyz;
-                    RECORD_DERIVATIVE_2
 #ifdef USE_CUTOFF
                    }
 #endif
 #ifdef USE_EXCLUSIONS
                    excl >>= 1;
 #endif
-                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }
+            else {
+                // This is an off-diagonal tile.

-            // Write results
+                const unsigned int localAtomIndex = get_local_id(0);
+                if (lasty != y) {
+                    unsigned int j = y*TILE_SIZE + tgx;
+                    local_posq[localAtomIndex] = posq[j];
+                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                }
+                local_force[localAtomIndex] = 0.0f;
+                CLEAR_LOCAL_DERIVATIVES
+#ifdef USE_CUTOFF
+                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+                if (!hasExclusions && flags == 0) {
+                    // No interactions in this tile.
+                }
+                else
+#endif
+                {
+                    // Compute the full set of interactions in this tile.

-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset1].xyz += force.xyz;
-            STORE_DERIVATIVES_1
-            releaseBuffer(x, forceBufferFlags);
-            reserveBuffer(y, forceBufferFlags);
-            unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz;
-            STORE_DERIVATIVES_2
-            releaseBuffer(y, forceBufferFlags);
+#ifdef USE_EXCLUSIONS
+                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
+                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+#endif
+                    unsigned int tj = tgx;
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = !(excl & 0x1);
+#endif
+                        int atom2 = tbx+tj;
+                        float4 posq2 = local_posq[atom2];
+                        float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                        float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                        if (r2 < CUTOFF_SQUARED) {
+#endif
+                        float r = SQRT(r2);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = y*TILE_SIZE+tj;
+                        float dEdR = 0.0f;
+                        float tempEnergy = 0.0f;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_INTERACTION
+                            dEdR /= -r;
+                        }
+                        energy += tempEnergy;
+                        delta.xyz *= dEdR;
+                        force.xyz -= delta.xyz;
+                        atom2 = tbx+tj;
+                        local_force[atom2].xyz += delta.xyz;
+                        RECORD_DERIVATIVE_2
+#ifdef USE_CUTOFF
+                        }
+#endif
+#ifdef USE_EXCLUSIONS
+                        excl >>= 1;
+#endif
+                        tj = (tj + 1) & (TILE_SIZE - 1);
+                    }
+                }
+            }
        }
        lasty = y;
+        
+        // Write results.  We need to coordinate between warps to make sure no two of them
+        // ever try to write to the same piece of memory at the same time.
+        
+        int writeX = (pos < end ? x : -1);
+        int writeY = (pos < end && x != y ? y : -1);
+        if (tgx == 0)
+            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
+        bool done = false;
+        int doneIndex = 0;
+        int checkIndex = 0;
+        while (true) {
+            // See if any warp still needs to write its data.
+
+            bool allDone = true;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            while (doneIndex < WARPS_PER_GROUP && allDone) {
+                if (reservedBlocks[doneIndex].x != -1)
+                    allDone = false;
+                else
+                    doneIndex++;
+            }
+            if (allDone)
+                break;
+            if (!done) {
+                // See whether this warp can write its data.  This requires that no previous warp
+                // is trying to write to the same block of the buffer.
+
+                bool canWrite = (writeX != -1);
+                while (checkIndex < localGroupIndex && canWrite) {
+                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
+                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
+                        canWrite = false;
+                    else
+                        checkIndex++;
+                }
+                if (canWrite) {
+                    // Write the data to global memory, then mark this warp as done.
+
+                    if (writeX > -1) {
+                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        forceBuffers[offset].xyz += force.xyz;
+                        STORE_DERIVATIVES_1
+                    }
+                    if (writeY > -1) {
+                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        forceBuffers[offset].xyz += local_force[get_local_id(0)].xyz;
+                        STORE_DERIVATIVES_2
+                    }
+                    done = true;
+                    if (tgx == 0)
+                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
+                }
+            }
+        }
        pos++;
-    }
+    } while (pos < end);
    energyBuffer[get_global_id(0)] += energy;
 }
--- a/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
@@ -6,7 +6,7 @@

 __kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
-        __local float* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else

--- a/platforms/opencl/src/kernels/customGBValueN2_default.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_default.cl
@@ -7,7 +7,7 @@
 __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
 void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
-        __local float* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
 #else

--- a/platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #define TILE_SIZE 32

-/**
- * Mark that a block in the value buffer is in use.
- */
-void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
-            ;
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-}
-
-/**
- * Mark that a block in the value buffer is no longer in use.
- */
-void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
-}
-
 /**
 * Compute a value based on pair interactions.
 */
 __kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
-        __local float* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else
@@ -46,222 +27,262 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
    unsigned int lasty = 0xFFFFFFFF;
    __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __local int exclusionIndex[WARPS_PER_GROUP];
-
-    while (pos < end) {
+    __local int2* reservedBlocks = (__local int2*) exclusionRange;
+    
+    do {
        // Extract the coordinates of this tile
+        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
+        const unsigned int tbx = get_local_id(0) - tgx;
+        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
        unsigned int x, y;
+        float value = 0.0f;
+        if (pos < end) {
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
-            ushort2 tileIndices = tiles[pos];
-            x = tileIndices.x;
-            y = tileIndices.y;
-        }
-        else
+            if (numTiles <= maxTiles) {
+                ushort2 tileIndices = tiles[pos];
+                x = tileIndices.x;
+                y = tileIndices.y;
+            }
+            else
 #endif
-        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
+            {
+                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                    y += (x < y ? -1 : 1);
+                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                }
            }
-        }
-        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        const unsigned int tbx = get_local_id(0) - tgx;
-        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
-        unsigned int atom1 = x*TILE_SIZE + tgx;
-        float value = 0.0f;
-        float4 posq1 = posq[atom1];
-        LOAD_ATOM1_PARAMETERS
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            float4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS

-        // Locate the exclusion data for this tile.
+            // Locate the exclusion data for this tile.

 #ifdef USE_EXCLUSIONS
-        if (tgx < 2)
-            exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-        if (tgx == 0)
-            exclusionIndex[localGroupIndex] = -1;
-        for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-            if (exclusionIndices[i] == y)
-                exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-        bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
+            if (tgx < 2)
+                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
+            if (tgx == 0)
+                exclusionIndex[localGroupIndex] = -1;
+            for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
+                if (exclusionIndices[i] == y)
+                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
+            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
 #else
-        bool hasExclusions = false;
+            bool hasExclusions = false;
 #endif
-        if (x == y) {
-            // This tile is on the diagonal.
+            if (pos >= end)
+                ; // This warp is done.
+            else if (x == y) {
+                // This tile is on the diagonal.

-            const unsigned int localAtomIndex = get_local_id(0);
-            local_posq[localAtomIndex] = posq1;
-            LOAD_LOCAL_PARAMETERS_FROM_1
+                const unsigned int localAtomIndex = get_local_id(0);
+                local_posq[localAtomIndex] = posq1;
+                LOAD_LOCAL_PARAMETERS_FROM_1
 #ifdef USE_EXCLUSIONS
-            unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
+                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
 #endif
-            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
 #ifdef USE_EXCLUSIONS
-                bool isExcluded = !(excl & 0x1);
+                    bool isExcluded = !(excl & 0x1);
 #endif
-                int atom2 = tbx+j;
-                float4 posq2 = local_posq[atom2];
-                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                    int atom2 = tbx+j;
+                    float4 posq2 = local_posq[atom2];
+                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef USE_CUTOFF
-                if (r2 < CUTOFF_SQUARED) {
+                    if (r2 < CUTOFF_SQUARED) {
 #endif
-                float r = SQRT(r2);
-                LOAD_ATOM2_PARAMETERS
-                atom2 = y*TILE_SIZE+j;
-                float tempValue1 = 0.0f;
-                float tempValue2 = 0.0f;
+                    float r = SQRT(r2);
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+j;
+                    float tempValue1 = 0.0f;
+                    float tempValue2 = 0.0f;
 #ifdef USE_EXCLUSIONS
-                if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+                    if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
 #else
-                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
 #endif
-                    COMPUTE_VALUE
-                }
-                value += tempValue1;
+                        COMPUTE_VALUE
+                    }
+                    value += tempValue1;
 #ifdef USE_CUTOFF
-                }
+                    }
 #endif
 #ifdef USE_EXCLUSIONS
-                excl >>= 1;
+                    excl >>= 1;
 #endif
+                }
            }
+            else {
+                // This is an off-diagonal tile.

-            // Write results
-
-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            global_value[offset] += value;
-            releaseBuffer(x, forceBufferFlags);
-        }
-        else {
-            // This is an off-diagonal tile.
-
-            if (lasty != y) {
-                unsigned int j = y*TILE_SIZE + tgx;
-                local_posq[get_local_id(0)] = posq[j];
-                const unsigned int localAtomIndex = get_local_id(0);
-                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-            }
-            local_value[get_local_id(0)] = 0.0f;
-#ifdef USE_CUTOFF
-            unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-            if (!hasExclusions && flags != 0xFFFFFFFF) {
-                if (flags == 0) {
-                    // No interactions in this tile.
+                if (lasty != y) {
+                    unsigned int j = y*TILE_SIZE + tgx;
+                    local_posq[get_local_id(0)] = posq[j];
+                    const unsigned int localAtomIndex = get_local_id(0);
+                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                }
-                else {
-                    // Compute only a subset of the interactions in this tile.
+                local_value[get_local_id(0)] = 0.0f;
+#ifdef USE_CUTOFF
+                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+                if (!hasExclusions && flags != 0xFFFFFFFF) {
+                    if (flags == 0) {
+                        // No interactions in this tile.
+                    }
+                    else {
+                        // Compute only a subset of the interactions in this tile.

-                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        if ((flags&(1<<j)) != 0) {
-                            int atom2 = tbx+j;
-                            float4 posq2 = local_posq[atom2];
-                            float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                            if ((flags&(1<<j)) != 0) {
+                                int atom2 = tbx+j;
+                                float4 posq2 = local_posq[atom2];
+                                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
-                            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                            float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                            float tempValue1 = 0.0f;
-                            float tempValue2 = 0.0f;
-                            if (r2 < CUTOFF_SQUARED) {
-                                float r = SQRT(r2);
-                                LOAD_ATOM2_PARAMETERS
-                                atom2 = y*TILE_SIZE+j;
-                                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                                    COMPUTE_VALUE
+                                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                                float tempValue1 = 0.0f;
+                                float tempValue2 = 0.0f;
+                                if (r2 < CUTOFF_SQUARED) {
+                                    float r = SQRT(r2);
+                                    LOAD_ATOM2_PARAMETERS
+                                    atom2 = y*TILE_SIZE+j;
+                                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                                        COMPUTE_VALUE
+                                    }
+                                    value += tempValue1;
                                }
-                                value += tempValue1;
-                            }
-                            tempBuffer[get_local_id(0)] = tempValue2;
+                                tempBuffer[get_local_id(0)] = tempValue2;

-                            // Sum the forces on atom2.
+                                // Sum the forces on atom2.

-                            if (tgx % 2 == 0)
-                                tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
-                            if (tgx % 4 == 0)
-                                tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
-                            if (tgx % 8 == 0)
-                                tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
-                            if (tgx % 16 == 0)
-                                tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
-                            if (tgx == 0)
-                                local_value[tbx+j] += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
+                                if (tgx % 2 == 0)
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
+                                if (tgx % 4 == 0)
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
+                                if (tgx % 8 == 0)
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
+                                if (tgx % 16 == 0)
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
+                                if (tgx == 0)
+                                    local_value[tbx+j] += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
+                            }
                        }
                    }
                }
-            }
-            else
+                else
 #endif
-            {
-                // Compute the full set of interactions in this tile.
+                {
+                    // Compute the full set of interactions in this tile.

 #ifdef USE_EXCLUSIONS
-                unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
-                excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
+                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
 #endif
-                unsigned int tj = tgx;
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                    unsigned int tj = tgx;
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
 #ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1);
+                        bool isExcluded = !(excl & 0x1);
 #endif
-                    int atom2 = tbx+tj;
-                    float4 posq2 = local_posq[atom2];
-                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                        int atom2 = tbx+tj;
+                        float4 posq2 = local_posq[atom2];
+                        float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
-                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                        float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef USE_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
+                        if (r2 < CUTOFF_SQUARED) {
 #endif
-                    float r = SQRT(r2);
-                    LOAD_ATOM2_PARAMETERS
-                    atom2 = y*TILE_SIZE+tj;
-                    float tempValue1 = 0.0f;
-                    float tempValue2 = 0.0f;
+                        float r = SQRT(r2);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = y*TILE_SIZE+tj;
+                        float tempValue1 = 0.0f;
+                        float tempValue2 = 0.0f;
 #ifdef USE_EXCLUSIONS
-                    if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                        if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
 #else
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
 #endif
-                        COMPUTE_VALUE
-                    }
-                    value += tempValue1;
-                    local_value[tbx+tj] += tempValue2;
+                            COMPUTE_VALUE
+                        }
+                        value += tempValue1;
+                        local_value[tbx+tj] += tempValue2;
 #ifdef USE_CUTOFF
-                    }
+                        }
 #endif
 #ifdef USE_EXCLUSIONS
-                    excl >>= 1;
+                        excl >>= 1;
 #endif
-                    tj = (tj + 1) & (TILE_SIZE - 1);
+                        tj = (tj + 1) & (TILE_SIZE - 1);
+                    }
                }
            }
+        }
+        
+        // Write results.  We need to coordinate between warps to make sure no two of them
+        // ever try to write to the same piece of memory at the same time.
+        
+        int writeX = (pos < end ? x : -1);
+        int writeY = (pos < end && x != y ? y : -1);
+        if (tgx == 0)
+            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
+        bool done = false;
+        int doneIndex = 0;
+        int checkIndex = 0;
+        while (true) {
+            // See if any warp still needs to write its data.

-            // Write results
+            bool allDone = true;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            while (doneIndex < WARPS_PER_GROUP && allDone) {
+                if (reservedBlocks[doneIndex].x != -1)
+                    allDone = false;
+                else
+                    doneIndex++;
+            }
+            if (allDone)
+                break;
+            if (!done) {
+                // See whether this warp can write its data.  This requires that no previous warp
+                // is trying to write to the same block of the buffer.

-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            global_value[offset1] += value;
-            releaseBuffer(x, forceBufferFlags);
-            reserveBuffer(y, forceBufferFlags);
-            unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            global_value[offset2] += local_value[get_local_id(0)];
-            releaseBuffer(y, forceBufferFlags);
+                bool canWrite = (writeX != -1);
+                while (checkIndex < localGroupIndex && canWrite) {
+                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
+                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
+                        canWrite = false;
+                    else
+                        checkIndex++;
+                }
+                if (canWrite) {
+                    // Write the data to global memory, then mark this warp as done.
+
+                    if (writeX > -1) {
+                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        global_value[offset] += value;
+                    }
+                    if (writeY > -1) {
+                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        global_value[offset] += local_value[get_local_id(0)];
+                    }
+                    done = true;
+                    if (tgx == 0)
+                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
+                }
+            }
        }
        lasty = y;
        pos++;
-    }
+    } while (pos < end);
 }
--- a/platforms/opencl/src/kernels/gbsaObc_cpu.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_cpu.cl
@@ -15,7 +15,7 @@ typedef struct {
 */

 __kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
-        __local AtomData* localData, __local float* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local AtomData* localData, __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
 #else
@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po

 __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
        __global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
-        __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local AtomData* localData, __local float4* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
 #else

--- a/platforms/opencl/src/kernels/gbsaObc_default.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_default.cl
@@ -16,7 +16,7 @@ typedef struct {

 __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
 void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
-        __local AtomData* localData, __local float* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local AtomData* localData, __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
 #else
@@ -203,7 +203,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
 __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
 void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
        __global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
-        __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local AtomData* localData, __local float4* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
 #else

--- a/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
@@ -11,30 +11,11 @@ typedef struct {
    float bornForce;
 } AtomData;

-/**
- * Mark that a block in the force buffer is in use.
- */
-void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
-            ;
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-}
-
-/**
- * Mark that a block in the force buffer is no longer in use.
- */
-void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
-}
-
 /**
 * Compute the Born sum.
 */
 __kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
-        __local AtomData* localData, __local float* tempBuffer,  __global unsigned int* forceBufferFlags,
+        __local AtomData* localData, __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
 #else
@@ -51,231 +32,271 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
    unsigned int end = (warp+1)*numTiles/totalWarps;
 #endif
    unsigned int lasty = 0xFFFFFFFF;
-
-    while (pos < end) {
+    __local int2 reservedBlocks[WARPS_PER_GROUP];
+    
+    do {
        // Extract the coordinates of this tile
+        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
+        const unsigned int tbx = get_local_id(0) - tgx;
+        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
        unsigned int x, y;
+        float bornSum = 0.0f;
+        if (pos < end) {
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
-            ushort2 tileIndices = tiles[pos];
-            x = tileIndices.x;
-            y = tileIndices.y;
-        }
-        else
+            if (numTiles <= maxTiles) {
+                ushort2 tileIndices = tiles[pos];
+                x = tileIndices.x;
+                y = tileIndices.y;
+            }
+            else
 #endif
-        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
+            {
+                y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                    y += (x < y ? -1 : 1);
+                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                }
            }
-        }
-        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        const unsigned int tbx = get_local_id(0) - tgx;
-        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
-        unsigned int atom1 = x*TILE_SIZE + tgx;
-        float bornSum = 0.0f;
-        float4 posq1 = posq[atom1];
-        float2 params1 = global_params[atom1];
-        if (x == y) {
-            // This tile is on the diagonal.
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            float4 posq1 = posq[atom1];
+            float2 params1 = global_params[atom1];
+            if (pos >= end)
+                ; // This warp is done.
+            else if (x == y) {
+                // This tile is on the diagonal.

-            localData[get_local_id(0)].x = posq1.x;
-            localData[get_local_id(0)].y = posq1.y;
-            localData[get_local_id(0)].z = posq1.z;
-            localData[get_local_id(0)].q = posq1.w;
-            localData[get_local_id(0)].radius = params1.x;
-            localData[get_local_id(0)].scaledRadius = params1.y;
-            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
+                localData[get_local_id(0)].x = posq1.x;
+                localData[get_local_id(0)].y = posq1.y;
+                localData[get_local_id(0)].z = posq1.z;
+                localData[get_local_id(0)].q = posq1.w;
+                localData[get_local_id(0)].radius = params1.x;
+                localData[get_local_id(0)].scaledRadius = params1.y;
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                    float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
 #ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef USE_CUTOFF
-                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
 #else
-                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
+                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
 #endif
-                    float invR = RSQRT(r2);
-                    float r = RECIP(invR);
-                    float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
-                    float rScaledRadiusJ = r+params2.y;
-                    if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
-                        float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
-                        float u_ij = RECIP(rScaledRadiusJ);
-                        float l_ij2 = l_ij*l_ij;
-                        float u_ij2 = u_ij*u_ij;
-                        float ratio = LOG(u_ij * RECIP(l_ij));
-                        bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
-                                         (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
-                        if (params1.x < params2.x-r)
-                            bornSum += 2.0f*(RECIP(params1.x)-l_ij);
+                        float invR = RSQRT(r2);
+                        float r = RECIP(invR);
+                        float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
+                        float rScaledRadiusJ = r+params2.y;
+                        if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
+                            float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
+                            float u_ij = RECIP(rScaledRadiusJ);
+                            float l_ij2 = l_ij*l_ij;
+                            float u_ij2 = u_ij*u_ij;
+                            float ratio = LOG(u_ij * RECIP(l_ij));
+                            bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
+                                             (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
+                            if (params1.x < params2.x-r)
+                                bornSum += 2.0f*(RECIP(params1.x)-l_ij);
+                        }
                    }
                }
            }
+            else {
+                // This is an off-diagonal tile.

-            // Write results
-
-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            global_bornSum[offset] += bornSum;
-            releaseBuffer(x, forceBufferFlags);
-        }
-        else {
-            // This is an off-diagonal tile.
-
-            if (lasty != y) {
-                unsigned int j = y*TILE_SIZE + tgx;
-                float4 tempPosq = posq[j];
-                localData[get_local_id(0)].x = tempPosq.x;
-                localData[get_local_id(0)].y = tempPosq.y;
-                localData[get_local_id(0)].z = tempPosq.z;
-                localData[get_local_id(0)].q = tempPosq.w;
-                float2 tempParams = global_params[j];
-                localData[get_local_id(0)].radius = tempParams.x;
-                localData[get_local_id(0)].scaledRadius = tempParams.y;
-            }
-            localData[get_local_id(0)].bornSum = 0.0f;
-#ifdef USE_CUTOFF
-            unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-            if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
-                if (flags == 0) {
-                    // No interactions in this tile.
+                if (lasty != y) {
+                    unsigned int j = y*TILE_SIZE + tgx;
+                    float4 tempPosq = posq[j];
+                    localData[get_local_id(0)].x = tempPosq.x;
+                    localData[get_local_id(0)].y = tempPosq.y;
+                    localData[get_local_id(0)].z = tempPosq.z;
+                    localData[get_local_id(0)].q = tempPosq.w;
+                    float2 tempParams = global_params[j];
+                    localData[get_local_id(0)].radius = tempParams.x;
+                    localData[get_local_id(0)].scaledRadius = tempParams.y;
                }
-                else {
-                    // Compute only a subset of the interactions in this tile.
+                localData[get_local_id(0)].bornSum = 0.0f;
+#ifdef USE_CUTOFF
+                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+                if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
+                    if (flags == 0) {
+                        // No interactions in this tile.
+                    }
+                    else {
+                        // Compute only a subset of the interactions in this tile.

-                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        if ((flags&(1<<j)) != 0) {
-                            float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
+                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                            if ((flags&(1<<j)) != 0) {
+                                float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
 #ifdef USE_PERIODIC
-                            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                            float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                            tempBuffer[get_local_id(0)] = 0.0f;
+                                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                                tempBuffer[get_local_id(0)] = 0.0f;
 #ifdef USE_CUTOFF
-                            if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+                                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
 #else
-                            if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
+                                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
 #endif
-                                float invR = RSQRT(r2);
-                                float r = RECIP(invR);
-                                float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
-                                float rScaledRadiusJ = r+params2.y;
-                                if (params1.x < rScaledRadiusJ) {
-                                    float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
-                                    float u_ij = RECIP(rScaledRadiusJ);
-                                    float l_ij2 = l_ij*l_ij;
-                                    float u_ij2 = u_ij*u_ij;
-                                    float ratio = LOG(u_ij * RECIP(l_ij));
-                                    bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
-                                                     (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
-                                    if (params1.x < params2.x-r)
-                                        bornSum += 2.0f*(RECIP(params1.x)-l_ij);
+                                    float invR = RSQRT(r2);
+                                    float r = RECIP(invR);
+                                    float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
+                                    float rScaledRadiusJ = r+params2.y;
+                                    if (params1.x < rScaledRadiusJ) {
+                                        float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
+                                        float u_ij = RECIP(rScaledRadiusJ);
+                                        float l_ij2 = l_ij*l_ij;
+                                        float u_ij2 = u_ij*u_ij;
+                                        float ratio = LOG(u_ij * RECIP(l_ij));
+                                        bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
+                                                         (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
+                                        if (params1.x < params2.x-r)
+                                            bornSum += 2.0f*(RECIP(params1.x)-l_ij);
+                                    }
+                                    float rScaledRadiusI = r+params1.y;
+                                    if (params2.x < rScaledRadiusI) {
+                                        float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
+                                        float u_ij = RECIP(rScaledRadiusI);
+                                        float l_ij2 = l_ij*l_ij;
+                                        float u_ij2 = u_ij*u_ij;
+                                        float ratio = LOG(u_ij * RECIP(l_ij));
+                                        float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
+                                                         (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
+                                        if (params2.x < params1.x-r)
+                                            term += 2.0f*(RECIP(params2.x)-l_ij);
+                                        tempBuffer[get_local_id(0)] = term;
+                                    }
                                }
-                                float rScaledRadiusI = r+params1.y;
-                                if (params2.x < rScaledRadiusI) {
-                                    float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
-                                    float u_ij = RECIP(rScaledRadiusI);
-                                    float l_ij2 = l_ij*l_ij;
-                                    float u_ij2 = u_ij*u_ij;
-                                    float ratio = LOG(u_ij * RECIP(l_ij));
-                                    float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
-                                                     (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
-                                    if (params2.x < params1.x-r)
-                                        term += 2.0f*(RECIP(params2.x)-l_ij);
-                                    tempBuffer[get_local_id(0)] = term;
-                                }
-                            }

-                            // Sum the forces on atom j.
+                                // Sum the forces on atom j.

-                            if (tgx % 2 == 0)
-                                tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
-                            if (tgx % 4 == 0)
-                                tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
-                            if (tgx % 8 == 0)
-                                tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
-                            if (tgx % 16 == 0)
-                                tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
-                            if (tgx == 0)
-                                localData[tbx+j].bornSum += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
+                                if (tgx % 2 == 0)
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
+                                if (tgx % 4 == 0)
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
+                                if (tgx % 8 == 0)
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
+                                if (tgx % 16 == 0)
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
+                                if (tgx == 0)
+                                    localData[tbx+j].bornSum += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
+                            }
                        }
                    }
                }
-            }
-            else
+                else
 #endif
-            {
-                // Compute the full set of interactions in this tile.
+                {
+                    // Compute the full set of interactions in this tile.

-                unsigned int tj = tgx;
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    float4 delta = (float4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0.0f);
+                    unsigned int tj = tgx;
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                        float4 delta = (float4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0.0f);
 #ifdef USE_PERIODIC
-                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                        float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef USE_CUTOFF
-                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+                        if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
 #else
-                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
+                        if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
 #endif
-                        float invR = RSQRT(r2);
-                        float r = RECIP(invR);
-                        float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
-                        float rScaledRadiusJ = r+params2.y;
-                        if (params1.x < rScaledRadiusJ) {
-                            float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
-                            float u_ij = RECIP(rScaledRadiusJ);
-                            float l_ij2 = l_ij*l_ij;
-                            float u_ij2 = u_ij*u_ij;
-                            float ratio = LOG(u_ij * RECIP(l_ij));
-                            bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
-                                             (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
-                            if (params1.x < params2.x-r)
-                                bornSum += 2.0f*(RECIP(params1.x)-l_ij);
-                        }
-                        float rScaledRadiusI = r+params1.y;
-                        if (params2.x < rScaledRadiusI) {
-                            float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
-                            float u_ij = RECIP(rScaledRadiusI);
-                            float l_ij2 = l_ij*l_ij;
-                            float u_ij2 = u_ij*u_ij;
-                            float ratio = LOG(u_ij * RECIP(l_ij));
-                            float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
-                                             (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
-                            if (params2.x < params1.x-r)
-                                term += 2.0f*(RECIP(params2.x)-l_ij);
-                            localData[tbx+tj].bornSum += term;
+                            float invR = RSQRT(r2);
+                            float r = RECIP(invR);
+                            float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
+                            float rScaledRadiusJ = r+params2.y;
+                            if (params1.x < rScaledRadiusJ) {
+                                float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
+                                float u_ij = RECIP(rScaledRadiusJ);
+                                float l_ij2 = l_ij*l_ij;
+                                float u_ij2 = u_ij*u_ij;
+                                float ratio = LOG(u_ij * RECIP(l_ij));
+                                bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
+                                                 (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
+                                if (params1.x < params2.x-r)
+                                    bornSum += 2.0f*(RECIP(params1.x)-l_ij);
+                            }
+                            float rScaledRadiusI = r+params1.y;
+                            if (params2.x < rScaledRadiusI) {
+                                float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
+                                float u_ij = RECIP(rScaledRadiusI);
+                                float l_ij2 = l_ij*l_ij;
+                                float u_ij2 = u_ij*u_ij;
+                                float ratio = LOG(u_ij * RECIP(l_ij));
+                                float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
+                                                 (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
+                                if (params2.x < params1.x-r)
+                                    term += 2.0f*(RECIP(params2.x)-l_ij);
+                                localData[tbx+tj].bornSum += term;
+                            }
                        }
+                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
-                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }
+        }
+        
+        // Write results.  We need to coordinate between warps to make sure no two of them
+        // ever try to write to the same piece of memory at the same time.
+        
+        int writeX = (pos < end ? x : -1);
+        int writeY = (pos < end && x != y ? y : -1);
+        if (tgx == 0)
+            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
+        bool done = false;
+        int doneIndex = 0;
+        int checkIndex = 0;
+        while (true) {
+            // See if any warp still needs to write its data.
+
+            bool allDone = true;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            while (doneIndex < WARPS_PER_GROUP && allDone) {
+                if (reservedBlocks[doneIndex].x != -1)
+                    allDone = false;
+                else
+                    doneIndex++;
+            }
+            if (allDone)
+                break;
+            if (!done) {
+                // See whether this warp can write its data.  This requires that no previous warp
+                // is trying to write to the same block of the buffer.

-            // Write results
+                bool canWrite = (writeX != -1);
+                while (checkIndex < localGroupIndex && canWrite) {
+                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
+                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
+                        canWrite = false;
+                    else
+                        checkIndex++;
+                }
+                if (canWrite) {
+                    // Write the data to global memory, then mark this warp as done.

-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            global_bornSum[offset1] += bornSum;
-            releaseBuffer(x, forceBufferFlags);
-            reserveBuffer(y, forceBufferFlags);
-            unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
-            releaseBuffer(y, forceBufferFlags);
+                    if (writeX > -1) {
+                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        global_bornSum[offset] += bornSum;
+                    }
+                    if (writeY > -1) {
+                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        global_bornSum[offset] += localData[get_local_id(0)].bornSum;
+                    }
+                    done = true;
+                    if (tgx == 0)
+                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
+                }
+            }
        }
        lasty = y;
        pos++;
-    }
+    } while (pos < end);
 }

 /**
@@ -284,7 +305,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po

 __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
        __global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
-        __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local AtomData* localData, __local float4* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
 #else
@@ -302,113 +323,171 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
 #endif
    float energy = 0.0f;
    unsigned int lasty = 0xFFFFFFFF;
-
-    while (pos < end) {
+    __local int2 reservedBlocks[WARPS_PER_GROUP];
+    
+    do {
        // Extract the coordinates of this tile
+        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
+        const unsigned int tbx = get_local_id(0) - tgx;
+        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
        unsigned int x, y;
+        float4 force = 0.0f;
+        if (pos < end) {
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
-            ushort2 tileIndices = tiles[pos];
-            x = tileIndices.x;
-            y = tileIndices.y;
-        }
-        else
+            if (numTiles <= maxTiles) {
+                ushort2 tileIndices = tiles[pos];
+                x = tileIndices.x;
+                y = tileIndices.y;
+            }
+            else
 #endif
-        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
+            {
+                y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                    y += (x < y ? -1 : 1);
+                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                }
            }
-        }
-        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        const unsigned int tbx = get_local_id(0) - tgx;
-        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
-        unsigned int atom1 = x*TILE_SIZE + tgx;
-        float4 force = 0.0f;
-        float4 posq1 = posq[atom1];
-        float bornRadius1 = global_bornRadii[atom1];
-        if (x == y) {
-            // This tile is on the diagonal.
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            float4 posq1 = posq[atom1];
+            float bornRadius1 = global_bornRadii[atom1];
+            if (x == y) {
+                // This tile is on the diagonal.

-            localData[get_local_id(0)].x = posq1.x;
-            localData[get_local_id(0)].y = posq1.y;
-            localData[get_local_id(0)].z = posq1.z;
-            localData[get_local_id(0)].q = posq1.w;
-            localData[get_local_id(0)].bornRadius = bornRadius1;
-            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
-                    float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
-                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                localData[get_local_id(0)].x = posq1.x;
+                localData[get_local_id(0)].y = posq1.y;
+                localData[get_local_id(0)].z = posq1.z;
+                localData[get_local_id(0)].q = posq1.w;
+                localData[get_local_id(0)].bornRadius = bornRadius1;
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
+                        float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
+                        float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
-                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                    float invR = RSQRT(r2);
-                    float r = RECIP(invR);
-                    float bornRadius2 = localData[tbx+j].bornRadius;
-                    float alpha2_ij = bornRadius1*bornRadius2;
-                    float D_ij = r2*RECIP(4.0f*alpha2_ij);
-                    float expTerm = EXP(-D_ij);
-                    float denominator2 = r2 + alpha2_ij*expTerm;
-                    float denominator = SQRT(denominator2);
-                    float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
-                    float Gpol = tempEnergy*RECIP(denominator2);
-                    float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
-                    float dEdR = Gpol*(1.0f - 0.25f*expTerm);
+                        float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                        float invR = RSQRT(r2);
+                        float r = RECIP(invR);
+                        float bornRadius2 = localData[tbx+j].bornRadius;
+                        float alpha2_ij = bornRadius1*bornRadius2;
+                        float D_ij = r2*RECIP(4.0f*alpha2_ij);
+                        float expTerm = EXP(-D_ij);
+                        float denominator2 = r2 + alpha2_ij*expTerm;
+                        float denominator = SQRT(denominator2);
+                        float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
+                        float Gpol = tempEnergy*RECIP(denominator2);
+                        float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
+                        float dEdR = Gpol*(1.0f - 0.25f*expTerm);
 #ifdef USE_CUTOFF
-                    if (r2 > CUTOFF_SQUARED) {
-                        dEdR = 0.0f;
-                        tempEnergy  = 0.0f;
-                        dGpol_dalpha2_ij = 0.0f;
-                    }
+                        if (r2 > CUTOFF_SQUARED) {
+                            dEdR = 0.0f;
+                            tempEnergy  = 0.0f;
+                            dGpol_dalpha2_ij = 0.0f;
+                        }
 #endif
-                    force.w += dGpol_dalpha2_ij*bornRadius2;
-                    energy += 0.5f*tempEnergy;
-                    delta.xyz *= dEdR;
-                    force.xyz -= delta.xyz;
+                        force.w += dGpol_dalpha2_ij*bornRadius2;
+                        energy += 0.5f*tempEnergy;
+                        delta.xyz *= dEdR;
+                        force.xyz -= delta.xyz;
+                    }
                }
            }
+            else {
+                // This is an off-diagonal tile.

-            // Write results
-
-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset].xyz += force.xyz;
-            global_bornForce[offset] += force.w;
-            releaseBuffer(x, forceBufferFlags);
-        }
-        else {
-            // This is an off-diagonal tile.
+                if (lasty != y) {
+                    unsigned int j = y*TILE_SIZE + tgx;
+                    float4 tempPosq = posq[j];
+                    localData[get_local_id(0)].x = tempPosq.x;
+                    localData[get_local_id(0)].y = tempPosq.y;
+                    localData[get_local_id(0)].z = tempPosq.z;
+                    localData[get_local_id(0)].q = tempPosq.w;
+                    localData[get_local_id(0)].bornRadius = global_bornRadii[j];
+                }
+                localData[get_local_id(0)].fx = 0.0f;
+                localData[get_local_id(0)].fy = 0.0f;
+                localData[get_local_id(0)].fz = 0.0f;
+                localData[get_local_id(0)].fw = 0.0f;
+#ifdef USE_CUTOFF
+                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+                if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
+                    if (flags == 0) {
+                        // No interactions in this tile.
+                    }
+                    else {
+                        // Compute only a subset of the interactions in this tile.

-            if (lasty != y) {
-                unsigned int j = y*TILE_SIZE + tgx;
-                float4 tempPosq = posq[j];
-                localData[get_local_id(0)].x = tempPosq.x;
-                localData[get_local_id(0)].y = tempPosq.y;
-                localData[get_local_id(0)].z = tempPosq.z;
-                localData[get_local_id(0)].q = tempPosq.w;
-                localData[get_local_id(0)].bornRadius = global_bornRadii[j];
-            }
-            localData[get_local_id(0)].fx = 0.0f;
-            localData[get_local_id(0)].fy = 0.0f;
-            localData[get_local_id(0)].fz = 0.0f;
-            localData[get_local_id(0)].fw = 0.0f;
+                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                            if ((flags&(1<<j)) != 0) {
+                                float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
+                                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                                float invR = RSQRT(r2);
+                                float r = RECIP(invR);
+                                float bornRadius2 = localData[tbx+j].bornRadius;
+                                float alpha2_ij = bornRadius1*bornRadius2;
+                                float D_ij = r2*RECIP(4.0f*alpha2_ij);
+                                float expTerm = EXP(-D_ij);
+                                float denominator2 = r2 + alpha2_ij*expTerm;
+                                float denominator = SQRT(denominator2);
+                                float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
+                                float Gpol = tempEnergy*RECIP(denominator2);
+                                float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
+                                float dEdR = Gpol*(1.0f - 0.25f*expTerm);
 #ifdef USE_CUTOFF
-            unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-            if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
-                if (flags == 0) {
-                    // No interactions in this tile.
+                                if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
+#else
+                                if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS) {
+#endif
+                                    dEdR = 0.0f;
+                                    dGpol_dalpha2_ij = 0.0f;
+                                    tempEnergy = 0.0f;
+                                }
+                                energy += tempEnergy;
+                                force.w += dGpol_dalpha2_ij*bornRadius2;
+                                delta.xyz *= dEdR;
+                                force.xyz -= delta.xyz;
+                                tempBuffer[get_local_id(0)] = (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1);
+
+                                // Sum the forces on atom j.
+
+                                if (tgx % 2 == 0)
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
+                                if (tgx % 4 == 0)
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
+                                if (tgx % 8 == 0)
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
+                                if (tgx % 16 == 0)
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
+                                if (tgx == 0) {
+                                    float4 sum = tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
+                                    localData[tbx+j].fx += sum.x;
+                                    localData[tbx+j].fy += sum.y;
+                                    localData[tbx+j].fz += sum.z;
+                                    localData[tbx+j].fw += sum.w;
+                                }
+                            }
+                        }
+                    }
                }
-                else {
-                    // Compute only a subset of the interactions in this tile.
+                else
+#endif
+                {
+                    // Compute the full set of interactions in this tile.

+                    unsigned int tj = tgx;
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        if ((flags&(1<<j)) != 0) {
-                            float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
+                        if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
+                            float4 posq2 = (float4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
                            float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
                            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
@@ -418,7 +497,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
                            float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                            float invR = RSQRT(r2);
                            float r = RECIP(invR);
-                            float bornRadius2 = localData[tbx+j].bornRadius;
+                            float bornRadius2 = localData[tbx+tj].bornRadius;
                            float alpha2_ij = bornRadius1*bornRadius2;
                            float D_ij = r2*RECIP(4.0f*alpha2_ij);
                            float expTerm = EXP(-D_ij);
@@ -429,104 +508,83 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
                            float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                            float dEdR = Gpol*(1.0f - 0.25f*expTerm);
 #ifdef USE_CUTOFF
-                            if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
-#else
-                            if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS) {
-#endif
+                            if (r2 > CUTOFF_SQUARED) {
                                dEdR = 0.0f;
+                                tempEnergy  = 0.0f;
                                dGpol_dalpha2_ij = 0.0f;
-				tempEnergy = 0.0f;
                            }
-			    energy += tempEnergy;
+#endif
                            force.w += dGpol_dalpha2_ij*bornRadius2;
+                            energy += tempEnergy;
                            delta.xyz *= dEdR;
                            force.xyz -= delta.xyz;
-                            tempBuffer[get_local_id(0)] = (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1);
-
-                            // Sum the forces on atom j.
-
-                            if (tgx % 2 == 0)
-                                tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
-                            if (tgx % 4 == 0)
-                                tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
-                            if (tgx % 8 == 0)
-                                tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
-                            if (tgx % 16 == 0)
-                                tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
-                            if (tgx == 0) {
-                                float4 sum = tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
-                                localData[tbx+j].fx += sum.x;
-                                localData[tbx+j].fy += sum.y;
-                                localData[tbx+j].fz += sum.z;
-                                localData[tbx+j].fw += sum.w;
-                            }
+                            localData[tbx+tj].fx += delta.x;
+                            localData[tbx+tj].fy += delta.y;
+                            localData[tbx+tj].fz += delta.z;
+                            localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
                        }
+                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
                }
            }
-            else
-#endif
-            {
-                // Compute the full set of interactions in this tile.
+        }
+        
+        // Write results.  We need to coordinate between warps to make sure no two of them
+        // ever try to write to the same piece of memory at the same time.
+        
+        int writeX = (pos < end ? x : -1);
+        int writeY = (pos < end && x != y ? y : -1);
+        if (tgx == 0)
+            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
+        bool done = false;
+        int doneIndex = 0;
+        int checkIndex = 0;
+        while (true) {
+            // See if any warp still needs to write its data.

-                unsigned int tj = tgx;
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
-                        float4 posq2 = (float4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
-                        float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
-#ifdef USE_PERIODIC
-                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                        float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                        float invR = RSQRT(r2);
-                        float r = RECIP(invR);
-                        float bornRadius2 = localData[tbx+tj].bornRadius;
-                        float alpha2_ij = bornRadius1*bornRadius2;
-                        float D_ij = r2*RECIP(4.0f*alpha2_ij);
-                        float expTerm = EXP(-D_ij);
-                        float denominator2 = r2 + alpha2_ij*expTerm;
-                        float denominator = SQRT(denominator2);
-                        float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
-                        float Gpol = tempEnergy*RECIP(denominator2);
-                        float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
-                        float dEdR = Gpol*(1.0f - 0.25f*expTerm);
-#ifdef USE_CUTOFF
-                        if (r2 > CUTOFF_SQUARED) {
-                            dEdR = 0.0f;
-                            tempEnergy  = 0.0f;
-                            dGpol_dalpha2_ij = 0.0f;
-                        }
-#endif
-                        force.w += dGpol_dalpha2_ij*bornRadius2;
-                        energy += tempEnergy;
-                        delta.xyz *= dEdR;
-                        force.xyz -= delta.xyz;
-                        localData[tbx+tj].fx += delta.x;
-                        localData[tbx+tj].fy += delta.y;
-                        localData[tbx+tj].fz += delta.z;
-                        localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
-                    }
-                    tj = (tj + 1) & (TILE_SIZE - 1);
-                }
+            bool allDone = true;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            while (doneIndex < WARPS_PER_GROUP && allDone) {
+                if (reservedBlocks[doneIndex].x != -1)
+                    allDone = false;
+                else
+                    doneIndex++;
            }
+            if (allDone)
+                break;
+            if (!done) {
+                // See whether this warp can write its data.  This requires that no previous warp
+                // is trying to write to the same block of the buffer.

-            // Write results
+                bool canWrite = (writeX != -1);
+                while (checkIndex < localGroupIndex && canWrite) {
+                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
+                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
+                        canWrite = false;
+                    else
+                        checkIndex++;
+                }
+                if (canWrite) {
+                    // Write the data to global memory, then mark this warp as done.

-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset1].xyz += force.xyz;
-            global_bornForce[offset1] += force.w;
-            releaseBuffer(x, forceBufferFlags);
-            reserveBuffer(y, forceBufferFlags);
-            unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
-            global_bornForce[offset2] += localData[get_local_id(0)].fw;
-            releaseBuffer(y, forceBufferFlags);
+                    if (writeX > -1) {
+                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        forceBuffers[offset].xyz += force.xyz;
+                        global_bornForce[offset] += force.w;
+                    }
+                    if (writeY > -1) {
+                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        forceBuffers[offset] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
+                        global_bornForce[offset] += localData[get_local_id(0)].fw;
+                    }
+                    done = true;
+                    if (tgx == 0)
+                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
+                }
+            }
        }
        lasty = y;
        pos++;
-    }
+    } while (pos < end);
    energyBuffer[get_global_id(0)] += energy;
 }
--- a/platforms/opencl/src/kernels/nonbonded_cpu.cl
+++ b/platforms/opencl/src/kernels/nonbonded_cpu.cl
@@ -13,7 +13,7 @@ typedef struct {

 __kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
-        unsigned int startTileIndex, unsigned int endTileIndex, __global unsigned int* forceBufferFlags,
+        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else

--- a/platforms/opencl/src/kernels/nonbonded_default.cl
+++ b/platforms/opencl/src/kernels/nonbonded_default.cl
@@ -14,7 +14,7 @@ typedef struct {
 __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
 void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
-        unsigned int startTileIndex, unsigned int endTileIndex, __global unsigned int* forceBufferFlags,
+        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else

--- a/platforms/opencl/src/kernels/nonbonded_nvidia.cl
+++ b/platforms/opencl/src/kernels/nonbonded_nvidia.cl
@@ -8,31 +8,12 @@ typedef struct {
    ATOM_PARAMETER_DATA
 } AtomData;

-/**
- * Mark that a block in the force buffer is in use.
- */
-void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
-            ;
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-}
-
-/**
- * Mark that a block in the force buffer is no longer in use.
- */
-void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
-}
-
 /**
 * Compute nonbonded interactions.
 */
 __kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float* tempBuffer,
-        unsigned int startTileIndex, unsigned int endTileIndex, __global unsigned int* forceBufferFlags,
+        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else
@@ -53,264 +34,306 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
    unsigned int lasty = 0xFFFFFFFF;
    __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __local int exclusionIndex[WARPS_PER_GROUP];
-
-    while (pos < end) {
+    __local int2* reservedBlocks = (__local int2*) exclusionRange;
+    
+    do {
        // Extract the coordinates of this tile
+        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
+        const unsigned int tbx = get_local_id(0) - tgx;
+        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
        unsigned int x, y;
+        float4 force = 0.0f;
+        if (pos < end) {
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
-            ushort2 tileIndices = tiles[pos];
-            x = tileIndices.x;
-            y = tileIndices.y;
-        }
-        else
+            if (numTiles <= maxTiles) {
+                ushort2 tileIndices = tiles[pos];
+                x = tileIndices.x;
+                y = tileIndices.y;
+            }
+            else
 #endif
-        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
+            {
+                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                    y += (x < y ? -1 : 1);
+                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                }
            }
-        }
-        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        const unsigned int tbx = get_local_id(0) - tgx;
-        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
-        unsigned int atom1 = x*TILE_SIZE + tgx;
-        float4 force = 0.0f;
-        float4 posq1 = posq[atom1];
-        LOAD_ATOM1_PARAMETERS
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            float4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS

-        // Locate the exclusion data for this tile.
+            // Locate the exclusion data for this tile.

 #ifdef USE_EXCLUSIONS
-        if (tgx < 2)
-            exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-        if (tgx == 0)
-            exclusionIndex[localGroupIndex] = -1;
-        for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-            if (exclusionIndices[i] == y)
-                exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-        bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
+            if (tgx < 2)
+                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
+            if (tgx == 0)
+                exclusionIndex[localGroupIndex] = -1;
+            for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
+                if (exclusionIndices[i] == y)
+                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
+            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
 #else
-        bool hasExclusions = false;
+            bool hasExclusions = false;
 #endif
-        if (x == y) {
-            // This tile is on the diagonal.
+            if (pos >= end)
+                ; // This warp is done.
+            else if (x == y) {
+                // This tile is on the diagonal.

-            const unsigned int localAtomIndex = get_local_id(0);
-            localData[localAtomIndex].x = posq1.x;
-            localData[localAtomIndex].y = posq1.y;
-            localData[localAtomIndex].z = posq1.z;
-            localData[localAtomIndex].q = posq1.w;
-            LOAD_LOCAL_PARAMETERS_FROM_1
+                const unsigned int localAtomIndex = get_local_id(0);
+                localData[localAtomIndex].x = posq1.x;
+                localData[localAtomIndex].y = posq1.y;
+                localData[localAtomIndex].z = posq1.z;
+                localData[localAtomIndex].q = posq1.w;
+                LOAD_LOCAL_PARAMETERS_FROM_1
 #ifdef USE_EXCLUSIONS
-            unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
+                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
 #endif
-            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
 #ifdef USE_EXCLUSIONS
-                bool isExcluded = !(excl & 0x1);
+                    bool isExcluded = !(excl & 0x1);
 #endif
-                int atom2 = tbx+j;
-                float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                    int atom2 = tbx+j;
+                    float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                float invR = RSQRT(r2);
-                float r = RECIP(invR);
-                LOAD_ATOM2_PARAMETERS
-                atom2 = y*TILE_SIZE+j;
+                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    float invR = RSQRT(r2);
+                    float r = RECIP(invR);
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+j;
 #ifdef USE_SYMMETRIC
-                float dEdR = 0.0f;
+                    float dEdR = 0.0f;
 #else
-                float4 dEdR1 = (float4) 0.0f;
-                float4 dEdR2 = (float4) 0.0f;
+                    float4 dEdR1 = (float4) 0.0f;
+                    float4 dEdR2 = (float4) 0.0f;
 #endif
-                float tempEnergy = 0.0f;
-                COMPUTE_INTERACTION
-                energy += 0.5f*tempEnergy;
+                    float tempEnergy = 0.0f;
+                    COMPUTE_INTERACTION
+                    energy += 0.5f*tempEnergy;
 #ifdef USE_SYMMETRIC
-                force.xyz -= delta.xyz*dEdR;
+                    force.xyz -= delta.xyz*dEdR;
 #else
-                force.xyz -= dEdR1.xyz;
+                    force.xyz -= dEdR1.xyz;
 #endif
-                excl >>= 1;
+#ifdef USE_EXCLUSIONS
+                    excl >>= 1;
+#endif
+                }
            }
+            else {
+                // This is an off-diagonal tile.

-            // Write results
-
-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset].xyz += force.xyz;
-            releaseBuffer(x, forceBufferFlags);
-        }
-        else {
-            // This is an off-diagonal tile.
-
-            const unsigned int localAtomIndex = get_local_id(0);
-            if (lasty != y) {
-                unsigned int j = y*TILE_SIZE + tgx;
-                float4 tempPosq = posq[j];
-                localData[localAtomIndex].x = tempPosq.x;
-                localData[localAtomIndex].y = tempPosq.y;
-                localData[localAtomIndex].z = tempPosq.z;
-                localData[localAtomIndex].q = tempPosq.w;
-                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-            }
-            localData[localAtomIndex].fx = 0.0f;
-            localData[localAtomIndex].fy = 0.0f;
-            localData[localAtomIndex].fz = 0.0f;
-#ifdef USE_CUTOFF
-            unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-            if (!hasExclusions && flags != 0xFFFFFFFF) {
-                if (flags == 0) {
-                    // No interactions in this tile.
+                const unsigned int localAtomIndex = get_local_id(0);
+                if (lasty != y) {
+                    unsigned int j = y*TILE_SIZE + tgx;
+                    float4 tempPosq = posq[j];
+                    localData[localAtomIndex].x = tempPosq.x;
+                    localData[localAtomIndex].y = tempPosq.y;
+                    localData[localAtomIndex].z = tempPosq.z;
+                    localData[localAtomIndex].q = tempPosq.w;
+                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                }
-                else {
-                    // Compute only a subset of the interactions in this tile.
+                localData[localAtomIndex].fx = 0.0f;
+                localData[localAtomIndex].fy = 0.0f;
+                localData[localAtomIndex].fz = 0.0f;
+#ifdef USE_CUTOFF
+                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+                if (!hasExclusions && flags != 0xFFFFFFFF) {
+                    if (flags == 0) {
+                        // No interactions in this tile.
+                    }
+                    else {
+                        // Compute only a subset of the interactions in this tile.

-                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        if ((flags&(1<<j)) != 0) {
-                            bool isExcluded = false;
-                            int atom2 = tbx+j;
-                            float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-                            float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                            if ((flags&(1<<j)) != 0) {
+                                bool isExcluded = false;
+                                int atom2 = tbx+j;
+                                float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
-                            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                            float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                            float invR = RSQRT(r2);
-                            float r = RECIP(invR);
-                            LOAD_ATOM2_PARAMETERS
-                            atom2 = y*TILE_SIZE+j;
+                                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                                float invR = RSQRT(r2);
+                                float r = RECIP(invR);
+                                LOAD_ATOM2_PARAMETERS
+                                atom2 = y*TILE_SIZE+j;
 #ifdef USE_SYMMETRIC
-                            float dEdR = 0.0f;
+                                float dEdR = 0.0f;
 #else
-                            float4 dEdR1 = (float4) 0.0f;
-                            float4 dEdR2 = (float4) 0.0f;
+                                float4 dEdR1 = (float4) 0.0f;
+                                float4 dEdR2 = (float4) 0.0f;
 #endif
-                            float tempEnergy = 0.0f;
-                            COMPUTE_INTERACTION
-			    energy += tempEnergy;
-                            int bufferIndex = 3*get_local_id(0);
+                                float tempEnergy = 0.0f;
+                                COMPUTE_INTERACTION
+                                energy += tempEnergy;
+                                int bufferIndex = 3*get_local_id(0);
 #ifdef USE_SYMMETRIC
-                            delta.xyz *= dEdR;
-                            force.xyz -= delta.xyz;
-                            tempBuffer[bufferIndex] = delta.x;
-                            tempBuffer[bufferIndex+1] = delta.y;
-                            tempBuffer[bufferIndex+2] = delta.z;
+                                delta.xyz *= dEdR;
+                                force.xyz -= delta.xyz;
+                                tempBuffer[bufferIndex] = delta.x;
+                                tempBuffer[bufferIndex+1] = delta.y;
+                                tempBuffer[bufferIndex+2] = delta.z;
 #else
-                            force.xyz -= dEdR1.xyz;
-                            tempBuffer[bufferIndex] = dEdR2.x;
-                            tempBuffer[bufferIndex+1] = dEdR2.y;
-                            tempBuffer[bufferIndex+2] = dEdR2.z;
+                                force.xyz -= dEdR1.xyz;
+                                tempBuffer[bufferIndex] = dEdR2.x;
+                                tempBuffer[bufferIndex+1] = dEdR2.y;
+                                tempBuffer[bufferIndex+2] = dEdR2.z;
 #endif

-                            // Sum the forces on atom2.
+                                // Sum the forces on atom2.

-                            if (tgx % 2 == 0) {
-                                tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3];
-                                tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4];
-                                tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5];
-                            }
-                            if (tgx % 4 == 0) {
-                                tempBuffer[bufferIndex] += tempBuffer[bufferIndex+6];
-                                tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+7];
-                                tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+8];
-                            }
-                            if (tgx % 8 == 0) {
-                                tempBuffer[bufferIndex] += tempBuffer[bufferIndex+12];
-                                tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+13];
-                                tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+14];
-                            }
-                            if (tgx % 16 == 0) {
-                                tempBuffer[bufferIndex] += tempBuffer[bufferIndex+24];
-                                tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+25];
-                                tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+26];
-                            }
-                            if (tgx == 0) {
-                                localData[tbx+j].fx += tempBuffer[bufferIndex] + tempBuffer[bufferIndex+48];
-                                localData[tbx+j].fy += tempBuffer[bufferIndex+1] + tempBuffer[bufferIndex+49];
-                                localData[tbx+j].fz += tempBuffer[bufferIndex+2] + tempBuffer[bufferIndex+50];
+                                if (tgx % 2 == 0) {
+                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3];
+                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4];
+                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5];
+                                }
+                                if (tgx % 4 == 0) {
+                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+6];
+                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+7];
+                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+8];
+                                }
+                                if (tgx % 8 == 0) {
+                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+12];
+                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+13];
+                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+14];
+                                }
+                                if (tgx % 16 == 0) {
+                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+24];
+                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+25];
+                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+26];
+                                }
+                                if (tgx == 0) {
+                                    localData[tbx+j].fx += tempBuffer[bufferIndex] + tempBuffer[bufferIndex+48];
+                                    localData[tbx+j].fy += tempBuffer[bufferIndex+1] + tempBuffer[bufferIndex+49];
+                                    localData[tbx+j].fz += tempBuffer[bufferIndex+2] + tempBuffer[bufferIndex+50];
+                                }
                            }
                        }
                    }
                }
-            }
-            else
+                else
 #endif
-            {
-                // Compute the full set of interactions in this tile.
+                {
+                    // Compute the full set of interactions in this tile.

 #ifdef USE_EXCLUSIONS
-                unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
-                excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
+                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
 #endif
-                unsigned int tj = tgx;
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                    unsigned int tj = tgx;
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
 #ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1);
+                        bool isExcluded = !(excl & 0x1);
 #endif
-                    int atom2 = tbx+tj;
-                    float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                        int atom2 = tbx+tj;
+                        float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                        float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
-                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                    float invR = RSQRT(r2);
-                    float r = RECIP(invR);
-                    LOAD_ATOM2_PARAMETERS
-                    atom2 = y*TILE_SIZE+tj;
+                        float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                        float invR = RSQRT(r2);
+                        float r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = y*TILE_SIZE+tj;
 #ifdef USE_SYMMETRIC
-                    float dEdR = 0.0f;
+                        float dEdR = 0.0f;
 #else
-                    float4 dEdR1 = (float4) 0.0f;
-                    float4 dEdR2 = (float4) 0.0f;
+                        float4 dEdR1 = (float4) 0.0f;
+                        float4 dEdR2 = (float4) 0.0f;
 #endif
-                    float tempEnergy = 0.0f;
-                    COMPUTE_INTERACTION
-		    energy += tempEnergy;
+                        float tempEnergy = 0.0f;
+                        COMPUTE_INTERACTION
+                        energy += tempEnergy;
 #ifdef USE_SYMMETRIC
-                    delta.xyz *= dEdR;
-                    force.xyz -= delta.xyz;
-                    localData[tbx+tj].fx += delta.x;
-                    localData[tbx+tj].fy += delta.y;
-                    localData[tbx+tj].fz += delta.z;
+                        delta.xyz *= dEdR;
+                        force.xyz -= delta.xyz;
+                        localData[tbx+tj].fx += delta.x;
+                        localData[tbx+tj].fy += delta.y;
+                        localData[tbx+tj].fz += delta.z;
 #else
-                    force.xyz -= dEdR1.xyz;
-                    localData[tbx+tj].fx += dEdR2.x;
-                    localData[tbx+tj].fy += dEdR2.y;
-                    localData[tbx+tj].fz += dEdR2.z;
+                        force.xyz -= dEdR1.xyz;
+                        localData[tbx+tj].fx += dEdR2.x;
+                        localData[tbx+tj].fy += dEdR2.y;
+                        localData[tbx+tj].fz += dEdR2.z;
 #endif
 #ifdef USE_EXCLUSIONS
-                    excl >>= 1;
+                        excl >>= 1;
 #endif
-                    tj = (tj + 1) & (TILE_SIZE - 1);
+                        tj = (tj + 1) & (TILE_SIZE - 1);
+                    }
                }
            }
+        }
+        
+        // Write results.  We need to coordinate between warps to make sure no two of them
+        // ever try to write to the same piece of memory at the same time.
+        
+        int writeX = (pos < end ? x : -1);
+        int writeY = (pos < end && x != y ? y : -1);
+        if (tgx == 0)
+            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
+        bool done = false;
+        int doneIndex = 0;
+        int checkIndex = 0;
+        while (true) {
+            // See if any warp still needs to write its data.

-            // Write results
+            bool allDone = true;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            while (doneIndex < WARPS_PER_GROUP && allDone) {
+                if (reservedBlocks[doneIndex].x != -1)
+                    allDone = false;
+                else
+                    doneIndex++;
+            }
+            if (allDone)
+                break;
+            if (!done) {
+                // See whether this warp can write its data.  This requires that no previous warp
+                // is trying to write to the same block of the buffer.

-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset1].xyz += force.xyz;
-            releaseBuffer(x, forceBufferFlags);
-            reserveBuffer(y, forceBufferFlags);
-            unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
-            releaseBuffer(y, forceBufferFlags);
+                bool canWrite = (writeX != -1);
+                while (checkIndex < localGroupIndex && canWrite) {
+                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
+                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
+                        canWrite = false;
+                    else
+                        checkIndex++;
+                }
+                if (canWrite) {
+                    // Write the data to global memory, then mark this warp as done.
+
+                    if (writeX > -1) {
+                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        forceBuffers[offset].xyz += force.xyz;
+                    }
+                    if (writeY > -1) {
+                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        forceBuffers[offset] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
+                    }
+                    done = true;
+                    if (tgx == 0)
+                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
+                }
+            }
        }
        lasty = y;
        pos++;
-    }
+    } while (pos < end);
    energyBuffer[get_global_id(0)] += energy;
 }