Restructured the use of force buffers in a new way that hopefully really works everywhere.

7943a339 · Peter Eastman · 13ef0ee8
Commit 7943a339 authored May 09, 2011 by Peter Eastman
15 changed files
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1738,6 +1738,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
        defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
        defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
+        if (cl.getSIMDWidth() == 32)
+            defines["WARPS_PER_GROUP"] = OpenCLExpressionUtilities::intToString(cl.getNonbondedUtilities().getForceThreadBlockSize()/OpenCLContext::TileSize);
        string file;
        if (deviceIsCpu)
            file = OpenCLKernelSources::gbsaObc_cpu;
@@ -1753,7 +1755,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer());
        computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*13*sizeof(cl_float), NULL);
        computeBornSumKernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
-        computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
        if (nb.getUseCutoff()) {
            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
@@ -1773,7 +1774,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        force1Kernel.setArg<cl::Buffer>(index++, bornForce->getDeviceBuffer());
        force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*13*sizeof(cl_float), NULL);
        force1Kernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(mm_float4), NULL);
-        force1Kernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
        if (nb.getUseCutoff()) {
            force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
@@ -1805,14 +1805,14 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        reduceBornForceKernel.setArg<cl::Buffer>(6, obcChain->getDeviceBuffer());
    }
    if (nb.getUseCutoff()) {
-        computeBornSumKernel.setArg<mm_float4>(8, cl.getPeriodicBoxSize());
-        computeBornSumKernel.setArg<mm_float4>(9, cl.getInvPeriodicBoxSize());
-        force1Kernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
-        force1Kernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());
+        computeBornSumKernel.setArg<mm_float4>(7, cl.getPeriodicBoxSize());
+        computeBornSumKernel.setArg<mm_float4>(8, cl.getInvPeriodicBoxSize());
+        force1Kernel.setArg<mm_float4>(9, cl.getPeriodicBoxSize());
+        force1Kernel.setArg<mm_float4>(10, cl.getInvPeriodicBoxSize());
        if (maxTiles < nb.getInteractingTiles().getSize()) {
            maxTiles = nb.getInteractingTiles().getSize();
-            computeBornSumKernel.setArg<cl_uint>(10, maxTiles);
-            force1Kernel.setArg<cl_uint>(12, maxTiles);
+            computeBornSumKernel.setArg<cl_uint>(9, maxTiles);
+            force1Kernel.setArg<cl_uint>(11, maxTiles);
        }
    }
    cl.executeKernel(computeBornSumKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
@@ -2148,7 +2148,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
        }
        map<string, string> replacements;
        replacements["COMPUTE_INTERACTION"] = n2EnergySource.str();
-        stringstream extraArgs, loadLocal1, loadLocal2, clearLocal, load1, load2, recordDeriv, storeDerivs1, storeDerivs2, declareTemps, setTemps;
+        stringstream extraArgs, loadLocal1, loadLocal2, clearLocal, load1, load2, declare1, recordDeriv, storeDerivs1, storeDerivs2, declareTemps, setTemps;
        if (force.getNumGlobalParameters() > 0)
            extraArgs << ", __constant float* globals";
        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
@@ -2174,7 +2174,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
            string index = intToString(i+1);
            extraArgs << ", __global " << buffer.getType() << "* derivBuffers" << index << ", __local " << buffer.getType() << "* local_deriv" << index;
            clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
-            load1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
+            declare1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
            load2 << buffer.getType() << " deriv" << index << "_2 = 0.0f;\n";
            recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
            storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")";
@@ -2188,6 +2188,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
        replacements["CLEAR_LOCAL_DERIVATIVES"] = clearLocal.str();
        replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
        replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
+        replacements["DECLARE_ATOM1_DERIVATIVES"] = declare1.str();
        replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str();
        replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str();
        replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
@@ -2482,7 +2483,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
        pairValueKernel.setArg<cl::Buffer>(index++, valueBuffers->getDeviceBuffer());
        pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
        pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
-        pairValueKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
        if (nb.getUseCutoff()) {
            pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
@@ -2531,7 +2531,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
        pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
-        pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
        if (nb.getUseCutoff()) {
            pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
@@ -2609,14 +2608,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
            globals->upload(globalParamValues);
    }
    if (nb.getUseCutoff()) {
-        pairValueKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize());
-        pairValueKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize());
-        pairEnergyKernel.setArg<mm_float4>(12, cl.getPeriodicBoxSize());
-        pairEnergyKernel.setArg<mm_float4>(13, cl.getInvPeriodicBoxSize());
+        pairValueKernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
+        pairValueKernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());
+        pairEnergyKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize());
+        pairEnergyKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize());
        if (maxTiles < nb.getInteractingTiles().getSize()) {
            maxTiles = nb.getInteractingTiles().getSize();
-            pairValueKernel.setArg<cl_uint>(13, maxTiles);
-            pairEnergyKernel.setArg<cl_uint>(14, maxTiles);
+            pairValueKernel.setArg<cl_uint>(12, maxTiles);
+            pairEnergyKernel.setArg<cl_uint>(13, maxTiles);
        }
    }
    cl.executeKernel(pairValueKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -37,7 +37,7 @@ using namespace std;

 OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false),
        numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
-        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), forceBufferFlags(NULL) {
+        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL) {
    // Decide how many thread blocks and force buffers to use.

    deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
@@ -48,8 +48,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
        numForceBuffers = numForceThreadBlocks;
    }
    else if (context.getSIMDWidth() == 32) {
-        numForceThreadBlocks = 2*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
-        forceThreadBlockSize = 256;
+        numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+        forceThreadBlockSize = 128;
        numForceBuffers = numForceThreadBlocks;
    }
    else {
@@ -82,8 +82,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
        delete blockCenter;
    if (blockBoundingBox != NULL)
        delete blockBoundingBox;
-    if (forceBufferFlags != NULL)
-        delete forceBufferFlags;
 }

 void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel) {
@@ -239,12 +237,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
        interactionCount->upload();
    }

-    // Create the flags for reserving force buffers.
-    
-    forceBufferFlags = new OpenCLArray<cl_uint>(context, numAtomBlocks*numForceThreadBlocks, "forceBufferFlags", false);
-    vector<cl_uint> forceBufferFlagsVec(forceBufferFlags->getSize(), 0);
-    forceBufferFlags->upload(forceBufferFlagsVec);
-
    // Create kernels.

    forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
@@ -320,8 +312,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
 void OpenCLNonbondedUtilities::computeInteractions() {
    if (cutoff != -1.0) {
        if (useCutoff) {
-            forceKernel.setArg<mm_float4>(13, context.getPeriodicBoxSize());
-            forceKernel.setArg<mm_float4>(14, context.getInvPeriodicBoxSize());
+            forceKernel.setArg<mm_float4>(12, context.getPeriodicBoxSize());
+            forceKernel.setArg<mm_float4>(13, context.getInvPeriodicBoxSize());
        }
        context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
@@ -343,14 +335,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
        newSize = numTiles;
    delete interactingTiles;
    interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles");
-    forceKernel.setArg<cl::Buffer>(11, interactingTiles->getDeviceBuffer());
-    forceKernel.setArg<cl_uint>(15, newSize);
+    forceKernel.setArg<cl::Buffer>(10, interactingTiles->getDeviceBuffer());
+    forceKernel.setArg<cl_uint>(14, newSize);
    findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
    findInteractingBlocksKernel.setArg<cl_uint>(9, newSize);
    if (context.getSIMDWidth() == 32 || deviceIsCpu) {
        delete interactionFlags;
        interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags");
-        forceKernel.setArg<cl::Buffer>(16, interactionFlags->getDeviceBuffer());
+        forceKernel.setArg<cl::Buffer>(15, interactionFlags->getDeviceBuffer());
        findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
        findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
        findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
@@ -503,7 +495,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    kernel.setArg(index++, 4*forceThreadBlockSize*sizeof(cl_float), NULL);
    kernel.setArg<cl_uint>(index++, startTileIndex);
    kernel.setArg<cl_uint>(index++, startTileIndex+numTiles);
-    kernel.setArg<cl::Buffer>(index++, forceBufferFlags->getDeviceBuffer());
    if (useCutoff) {
        kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer());
        kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer());

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.h
@@ -196,12 +196,6 @@ public:
    OpenCLArray<cl_uint>& getExclusionRowIndices() {
        return *exclusionRowIndices;
    }
-    /**
-     * Get the array which contains flags for reserving force buffers.
-     */
-    OpenCLArray<cl_uint>& getForceBufferFlags() {
-        return *forceBufferFlags;
-    }
    /**
     * Get the index of the first tile this context is responsible for processing.
     */
@@ -245,7 +239,6 @@ private:
    OpenCLArray<cl_uint>* interactionCount;
    OpenCLArray<mm_float4>* blockCenter;
    OpenCLArray<mm_float4>* blockBoundingBox;
-    OpenCLArray<cl_uint>* forceBufferFlags;
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
    std::vector<ParameterInfo> arguments;

--- a/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
@@ -8,7 +8,7 @@

 __kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
 	__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
-        __global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
+        __global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else

--- a/platforms/opencl/src/kernels/customGBEnergyN2_default.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_default.cl
@@ -9,7 +9,7 @@
 __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
 void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
 	__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
-        __global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer, __global unsigned int* forceBufferFlags,
+        __global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
 #else

--- a/platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #define TILE_SIZE 32
-#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1;
-#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[get_local_id(0)];
-
-/**
- * Mark that a block in the force buffer is in use.
- */
-void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
-            ;
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-}
-
-/**
- * Mark that a block in the force buffer is no longer in use.
- */
-void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
-}
+#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
+#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)];

 /**
 * Compute a force based on pair interactions.
 */
 __kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
 	__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
-        __global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
+        __global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else
@@ -48,10 +29,17 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
    unsigned int lasty = 0xFFFFFFFF;
    __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __local int exclusionIndex[WARPS_PER_GROUP];
+    __local int2* reservedBlocks = (__local int2*) exclusionRange;
    
-    while (pos < end) {
+    do {
        // Extract the coordinates of this tile
+        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
+        const unsigned int tbx = get_local_id(0) - tgx;
+        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
        unsigned int x, y;
+        float4 force = 0.0f;
+        DECLARE_ATOM1_DERIVATIVES
+        if (pos < end) {
 #ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
                ushort2 tileIndices = tiles[pos];
@@ -68,11 +56,7 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                }
            }
-        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        const unsigned int tbx = get_local_id(0) - tgx;
-        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
            unsigned int atom1 = x*TILE_SIZE + tgx;
-        float4 force = 0.0f;
            float4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS

@@ -90,7 +74,9 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
 #else
            bool hasExclusions = false;
 #endif
-        if (x == y) {
+            if (pos >= end)
+                ; // This warp is done.
+            else if (x == y) {
                // This tile is on the diagonal.

                const unsigned int localAtomIndex = get_local_id(0);
@@ -134,14 +120,6 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
                    excl >>= 1;
 #endif
                }
-
-            // Write results
-
-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset1].xyz += force.xyz;
-            STORE_DERIVATIVES_1
-            releaseBuffer(x, forceBufferFlags);
            }
            else {
                // This is an off-diagonal tile.
@@ -209,22 +187,65 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
                }
+            }
+        }
+        lasty = y;
+        
+        // Write results.  We need to coordinate between warps to make sure no two of them
+        // ever try to write to the same piece of memory at the same time.
+        
+        int writeX = (pos < end ? x : -1);
+        int writeY = (pos < end && x != y ? y : -1);
+        if (tgx == 0)
+            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
+        bool done = false;
+        int doneIndex = 0;
+        int checkIndex = 0;
+        while (true) {
+            // See if any warp still needs to write its data.
+
+            bool allDone = true;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            while (doneIndex < WARPS_PER_GROUP && allDone) {
+                if (reservedBlocks[doneIndex].x != -1)
+                    allDone = false;
+                else
+                    doneIndex++;
+            }
+            if (allDone)
+                break;
+            if (!done) {
+                // See whether this warp can write its data.  This requires that no previous warp
+                // is trying to write to the same block of the buffer.

-            // Write results
+                bool canWrite = (writeX != -1);
+                while (checkIndex < localGroupIndex && canWrite) {
+                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
+                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
+                        canWrite = false;
+                    else
+                        checkIndex++;
+                }
+                if (canWrite) {
+                    // Write the data to global memory, then mark this warp as done.

-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset1].xyz += force.xyz;
+                    if (writeX > -1) {
+                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        forceBuffers[offset].xyz += force.xyz;
                        STORE_DERIVATIVES_1
-            releaseBuffer(x, forceBufferFlags);
-            reserveBuffer(y, forceBufferFlags);
-            unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz;
+                    }
+                    if (writeY > -1) {
+                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        forceBuffers[offset].xyz += local_force[get_local_id(0)].xyz;
                        STORE_DERIVATIVES_2
-            releaseBuffer(y, forceBufferFlags);
                    }
-        lasty = y;
-        pos++;
+                    done = true;
+                    if (tgx == 0)
+                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
+                }
            }
+        }
+        pos++;
+    } while (pos < end);
    energyBuffer[get_global_id(0)] += energy;
 }
--- a/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
@@ -6,7 +6,7 @@

 __kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
-        __local float* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else

--- a/platforms/opencl/src/kernels/customGBValueN2_default.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_default.cl
@@ -7,7 +7,7 @@
 __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
 void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
-        __local float* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
 #else

--- a/platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #define TILE_SIZE 32

-/**
- * Mark that a block in the value buffer is in use.
- */
-void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
-            ;
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-}
-
-/**
- * Mark that a block in the value buffer is no longer in use.
- */
-void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
-}
-
 /**
 * Compute a value based on pair interactions.
 */
 __kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
-        __local float* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else
@@ -46,10 +27,16 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
    unsigned int lasty = 0xFFFFFFFF;
    __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __local int exclusionIndex[WARPS_PER_GROUP];
+    __local int2* reservedBlocks = (__local int2*) exclusionRange;
    
-    while (pos < end) {
+    do {
        // Extract the coordinates of this tile
+        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
+        const unsigned int tbx = get_local_id(0) - tgx;
+        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
        unsigned int x, y;
+        float value = 0.0f;
+        if (pos < end) {
 #ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
                ushort2 tileIndices = tiles[pos];
@@ -66,11 +53,7 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                }
            }
-        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        const unsigned int tbx = get_local_id(0) - tgx;
-        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
            unsigned int atom1 = x*TILE_SIZE + tgx;
-        float value = 0.0f;
            float4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS

@@ -88,7 +71,9 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
 #else
            bool hasExclusions = false;
 #endif
-        if (x == y) {
+            if (pos >= end)
+                ; // This warp is done.
+            else if (x == y) {
                // This tile is on the diagonal.

                const unsigned int localAtomIndex = get_local_id(0);
@@ -133,13 +118,6 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
                    excl >>= 1;
 #endif
                }
-
-            // Write results
-
-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            global_value[offset] += value;
-            releaseBuffer(x, forceBufferFlags);
            }
            else {
                // This is an off-diagonal tile.
@@ -249,19 +227,62 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
                }
+            }
+        }
+        
+        // Write results.  We need to coordinate between warps to make sure no two of them
+        // ever try to write to the same piece of memory at the same time.
+        
+        int writeX = (pos < end ? x : -1);
+        int writeY = (pos < end && x != y ? y : -1);
+        if (tgx == 0)
+            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
+        bool done = false;
+        int doneIndex = 0;
+        int checkIndex = 0;
+        while (true) {
+            // See if any warp still needs to write its data.
+
+            bool allDone = true;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            while (doneIndex < WARPS_PER_GROUP && allDone) {
+                if (reservedBlocks[doneIndex].x != -1)
+                    allDone = false;
+                else
+                    doneIndex++;
+            }
+            if (allDone)
+                break;
+            if (!done) {
+                // See whether this warp can write its data.  This requires that no previous warp
+                // is trying to write to the same block of the buffer.

-            // Write results
+                bool canWrite = (writeX != -1);
+                while (checkIndex < localGroupIndex && canWrite) {
+                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
+                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
+                        canWrite = false;
+                    else
+                        checkIndex++;
+                }
+                if (canWrite) {
+                    // Write the data to global memory, then mark this warp as done.

-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            global_value[offset1] += value;
-            releaseBuffer(x, forceBufferFlags);
-            reserveBuffer(y, forceBufferFlags);
-            unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            global_value[offset2] += local_value[get_local_id(0)];
-            releaseBuffer(y, forceBufferFlags);
+                    if (writeX > -1) {
+                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        global_value[offset] += value;
+                    }
+                    if (writeY > -1) {
+                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        global_value[offset] += local_value[get_local_id(0)];
+                    }
+                    done = true;
+                    if (tgx == 0)
+                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
+                }
+            }
        }
        lasty = y;
        pos++;
-    }
+    } while (pos < end);
 }
--- a/platforms/opencl/src/kernels/gbsaObc_cpu.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_cpu.cl
@@ -15,7 +15,7 @@ typedef struct {
 */

 __kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
-        __local AtomData* localData, __local float* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local AtomData* localData, __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
 #else
@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po

 __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
        __global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
-        __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local AtomData* localData, __local float4* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
 #else

--- a/platforms/opencl/src/kernels/gbsaObc_default.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_default.cl
@@ -16,7 +16,7 @@ typedef struct {

 __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
 void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
-        __local AtomData* localData, __local float* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local AtomData* localData, __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
 #else
@@ -203,7 +203,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
 __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
 void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
        __global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
-        __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local AtomData* localData, __local float4* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
 #else

--- a/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
@@ -11,30 +11,11 @@ typedef struct {
    float bornForce;
 } AtomData;

-/**
- * Mark that a block in the force buffer is in use.
- */
-void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
-            ;
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-}
-
-/**
- * Mark that a block in the force buffer is no longer in use.
- */
-void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
-}
-
 /**
 * Compute the Born sum.
 */
 __kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
-        __local AtomData* localData, __local float* tempBuffer,  __global unsigned int* forceBufferFlags,
+        __local AtomData* localData, __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
 #else
@@ -51,10 +32,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
    unsigned int end = (warp+1)*numTiles/totalWarps;
 #endif
    unsigned int lasty = 0xFFFFFFFF;
+    __local int2 reservedBlocks[WARPS_PER_GROUP];
    
-    while (pos < end) {
+    do {
        // Extract the coordinates of this tile
+        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
+        const unsigned int tbx = get_local_id(0) - tgx;
+        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
        unsigned int x, y;
+        float bornSum = 0.0f;
+        if (pos < end) {
 #ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
                ushort2 tileIndices = tiles[pos];
@@ -71,14 +58,12 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                }
            }
-        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        const unsigned int tbx = get_local_id(0) - tgx;
-        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
            unsigned int atom1 = x*TILE_SIZE + tgx;
-        float bornSum = 0.0f;
            float4 posq1 = posq[atom1];
            float2 params1 = global_params[atom1];
-        if (x == y) {
+            if (pos >= end)
+                ; // This warp is done.
+            else if (x == y) {
                // This tile is on the diagonal.

                localData[get_local_id(0)].x = posq1.x;
@@ -117,13 +102,6 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
                        }
                    }
                }
-
-            // Write results
-
-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            global_bornSum[offset] += bornSum;
-            releaseBuffer(x, forceBufferFlags);
            }
            else {
                // This is an off-diagonal tile.
@@ -261,21 +239,64 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
                }
+            }
+        }
        
-            // Write results
+        // Write results.  We need to coordinate between warps to make sure no two of them
+        // ever try to write to the same piece of memory at the same time.
        
-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            global_bornSum[offset1] += bornSum;
-            releaseBuffer(x, forceBufferFlags);
-            reserveBuffer(y, forceBufferFlags);
-            unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
-            releaseBuffer(y, forceBufferFlags);
+        int writeX = (pos < end ? x : -1);
+        int writeY = (pos < end && x != y ? y : -1);
+        if (tgx == 0)
+            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
+        bool done = false;
+        int doneIndex = 0;
+        int checkIndex = 0;
+        while (true) {
+            // See if any warp still needs to write its data.
+
+            bool allDone = true;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            while (doneIndex < WARPS_PER_GROUP && allDone) {
+                if (reservedBlocks[doneIndex].x != -1)
+                    allDone = false;
+                else
+                    doneIndex++;
+            }
+            if (allDone)
+                break;
+            if (!done) {
+                // See whether this warp can write its data.  This requires that no previous warp
+                // is trying to write to the same block of the buffer.
+
+                bool canWrite = (writeX != -1);
+                while (checkIndex < localGroupIndex && canWrite) {
+                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
+                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
+                        canWrite = false;
+                    else
+                        checkIndex++;
+                }
+                if (canWrite) {
+                    // Write the data to global memory, then mark this warp as done.
+
+                    if (writeX > -1) {
+                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        global_bornSum[offset] += bornSum;
+                    }
+                    if (writeY > -1) {
+                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        global_bornSum[offset] += localData[get_local_id(0)].bornSum;
+                    }
+                    done = true;
+                    if (tgx == 0)
+                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
+                }
+            }
        }
        lasty = y;
        pos++;
-    }
+    } while (pos < end);
 }

 /**
@@ -284,7 +305,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po

 __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
        __global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
-        __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
+        __local AtomData* localData, __local float4* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
 #else
@@ -302,10 +323,16 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
 #endif
    float energy = 0.0f;
    unsigned int lasty = 0xFFFFFFFF;
+    __local int2 reservedBlocks[WARPS_PER_GROUP];
    
-    while (pos < end) {
+    do {
        // Extract the coordinates of this tile
+        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
+        const unsigned int tbx = get_local_id(0) - tgx;
+        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
        unsigned int x, y;
+        float4 force = 0.0f;
+        if (pos < end) {
 #ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
                ushort2 tileIndices = tiles[pos];
@@ -322,11 +349,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                }
            }
-        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        const unsigned int tbx = get_local_id(0) - tgx;
-        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
            unsigned int atom1 = x*TILE_SIZE + tgx;
-        float4 force = 0.0f;
            float4 posq1 = posq[atom1];
            float bornRadius1 = global_bornRadii[atom1];
            if (x == y) {
@@ -372,14 +395,6 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
                        force.xyz -= delta.xyz;
                    }
                }
-
-            // Write results
-
-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset].xyz += force.xyz;
-            global_bornForce[offset] += force.w;
-            releaseBuffer(x, forceBufferFlags);
            }
            else {
                // This is an off-diagonal tile.
@@ -511,22 +526,65 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
                }
+            }
+        }
        
-            // Write results
+        // Write results.  We need to coordinate between warps to make sure no two of them
+        // ever try to write to the same piece of memory at the same time.
        
-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset1].xyz += force.xyz;
-            global_bornForce[offset1] += force.w;
-            releaseBuffer(x, forceBufferFlags);
-            reserveBuffer(y, forceBufferFlags);
-            unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
-            global_bornForce[offset2] += localData[get_local_id(0)].fw;
-            releaseBuffer(y, forceBufferFlags);
+        int writeX = (pos < end ? x : -1);
+        int writeY = (pos < end && x != y ? y : -1);
+        if (tgx == 0)
+            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
+        bool done = false;
+        int doneIndex = 0;
+        int checkIndex = 0;
+        while (true) {
+            // See if any warp still needs to write its data.
+
+            bool allDone = true;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            while (doneIndex < WARPS_PER_GROUP && allDone) {
+                if (reservedBlocks[doneIndex].x != -1)
+                    allDone = false;
+                else
+                    doneIndex++;
+            }
+            if (allDone)
+                break;
+            if (!done) {
+                // See whether this warp can write its data.  This requires that no previous warp
+                // is trying to write to the same block of the buffer.
+
+                bool canWrite = (writeX != -1);
+                while (checkIndex < localGroupIndex && canWrite) {
+                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
+                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
+                        canWrite = false;
+                    else
+                        checkIndex++;
+                }
+                if (canWrite) {
+                    // Write the data to global memory, then mark this warp as done.
+
+                    if (writeX > -1) {
+                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        forceBuffers[offset].xyz += force.xyz;
+                        global_bornForce[offset] += force.w;
+                    }
+                    if (writeY > -1) {
+                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        forceBuffers[offset] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
+                        global_bornForce[offset] += localData[get_local_id(0)].fw;
+                    }
+                    done = true;
+                    if (tgx == 0)
+                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
+                }
+            }
        }
        lasty = y;
        pos++;
-    }
+    } while (pos < end);
    energyBuffer[get_global_id(0)] += energy;
 }
--- a/platforms/opencl/src/kernels/nonbonded_cpu.cl
+++ b/platforms/opencl/src/kernels/nonbonded_cpu.cl
@@ -13,7 +13,7 @@ typedef struct {

 __kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
-        unsigned int startTileIndex, unsigned int endTileIndex, __global unsigned int* forceBufferFlags,
+        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else

--- a/platforms/opencl/src/kernels/nonbonded_default.cl
+++ b/platforms/opencl/src/kernels/nonbonded_default.cl
@@ -14,7 +14,7 @@ typedef struct {
 __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
 void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
-        unsigned int startTileIndex, unsigned int endTileIndex, __global unsigned int* forceBufferFlags,
+        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else

--- a/platforms/opencl/src/kernels/nonbonded_nvidia.cl
+++ b/platforms/opencl/src/kernels/nonbonded_nvidia.cl
@@ -8,31 +8,12 @@ typedef struct {
    ATOM_PARAMETER_DATA
 } AtomData;

-/**
- * Mark that a block in the force buffer is in use.
- */
-void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
-            ;
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-}
-
-/**
- * Mark that a block in the force buffer is no longer in use.
- */
-void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-    if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
-        forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
-}
-
 /**
 * Compute nonbonded interactions.
 */
 __kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float* tempBuffer,
-        unsigned int startTileIndex, unsigned int endTileIndex, __global unsigned int* forceBufferFlags,
+        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else
@@ -53,10 +34,16 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
    unsigned int lasty = 0xFFFFFFFF;
    __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __local int exclusionIndex[WARPS_PER_GROUP];
+    __local int2* reservedBlocks = (__local int2*) exclusionRange;
    
-    while (pos < end) {
+    do {
        // Extract the coordinates of this tile
+        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
+        const unsigned int tbx = get_local_id(0) - tgx;
+        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
        unsigned int x, y;
+        float4 force = 0.0f;
+        if (pos < end) {
 #ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
                ushort2 tileIndices = tiles[pos];
@@ -73,11 +60,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                }
            }
-        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        const unsigned int tbx = get_local_id(0) - tgx;
-        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
            unsigned int atom1 = x*TILE_SIZE + tgx;
-        float4 force = 0.0f;
            float4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS

@@ -95,7 +78,9 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
 #else
            bool hasExclusions = false;
 #endif
-        if (x == y) {
+            if (pos >= end)
+                ; // This warp is done.
+            else if (x == y) {
                // This tile is on the diagonal.

                const unsigned int localAtomIndex = get_local_id(0);
@@ -138,15 +123,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
 #else
                    force.xyz -= dEdR1.xyz;
 #endif
+#ifdef USE_EXCLUSIONS
                    excl >>= 1;
+#endif
                }
-
-            // Write results
-
-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset].xyz += force.xyz;
-            releaseBuffer(x, forceBufferFlags);
            }
            else {
                // This is an off-diagonal tile.
@@ -297,20 +277,63 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
                }
+            }
+        }
+        
+        // Write results.  We need to coordinate between warps to make sure no two of them
+        // ever try to write to the same piece of memory at the same time.
        
-            // Write results
+        int writeX = (pos < end ? x : -1);
+        int writeY = (pos < end && x != y ? y : -1);
+        if (tgx == 0)
+            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
+        bool done = false;
+        int doneIndex = 0;
+        int checkIndex = 0;
+        while (true) {
+            // See if any warp still needs to write its data.

-            reserveBuffer(x, forceBufferFlags);
-            unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset1].xyz += force.xyz;
-            releaseBuffer(x, forceBufferFlags);
-            reserveBuffer(y, forceBufferFlags);
-            unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-            forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
-            releaseBuffer(y, forceBufferFlags);
+            bool allDone = true;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            while (doneIndex < WARPS_PER_GROUP && allDone) {
+                if (reservedBlocks[doneIndex].x != -1)
+                    allDone = false;
+                else
+                    doneIndex++;
+            }
+            if (allDone)
+                break;
+            if (!done) {
+                // See whether this warp can write its data.  This requires that no previous warp
+                // is trying to write to the same block of the buffer.
+
+                bool canWrite = (writeX != -1);
+                while (checkIndex < localGroupIndex && canWrite) {
+                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
+                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
+                        canWrite = false;
+                    else
+                        checkIndex++;
+                }
+                if (canWrite) {
+                    // Write the data to global memory, then mark this warp as done.
+
+                    if (writeX > -1) {
+                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        forceBuffers[offset].xyz += force.xyz;
+                    }
+                    if (writeY > -1) {
+                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                        forceBuffers[offset] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
+                    }
+                    done = true;
+                    if (tgx == 0)
+                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
+                }
+            }
        }
        lasty = y;
        pos++;
-    }
+    } while (pos < end);
    energyBuffer[get_global_id(0)] += energy;
 }