On Fermi, store forces as 64 bit fixed point and update with atomic...

On Fermi, store forces as 64 bit fixed point and update with atomic operations. Also various other minor optimizations.

On Fermi, store forces as 64 bit fixed point and update with atomic...
On Fermi, store forces as 64 bit fixed point and update with atomic operations. Also various other minor optimizations.
d9029f61 · Peter Eastman · 8ade176c · d9029f61 · d9029f61 · d9029f61
Commit d9029f61 authored Sep 02, 2011 by Peter Eastman
15 changed files
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -61,7 +61,8 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i

 OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::PlatformData& platformData) :
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), posq(NULL), velm(NULL),
-        forceBuffers(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL), nonbonded(NULL), thread(NULL) {
+        forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
+        nonbonded(NULL), thread(NULL) {
    try {
        contextIndex = platformData.contexts.size();
        std::vector<cl::Platform> platforms;
@@ -148,6 +149,7 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
    clearThreeBuffersKernel = cl::Kernel(utilities, "clearThreeBuffers");
    clearFourBuffersKernel = cl::Kernel(utilities, "clearFourBuffers");
    reduceFloat4Kernel = cl::Kernel(utilities, "reduceFloat4Buffer");
+    reduceForcesKernel = cl::Kernel(utilities, "reduceForces");

    // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.

@@ -195,6 +197,8 @@ OpenCLContext::~OpenCLContext() {
        delete force;
    if (forceBuffers != NULL)
        delete forceBuffers;
+    if (longForceBuffer != NULL)
+        delete longForceBuffer;
    if (energyBuffer != NULL)
        delete energyBuffer;
    if (atomIndex != NULL)
@@ -215,6 +219,14 @@ void OpenCLContext::initialize(const System& system) {
    for (int i = 0; i < (int) forces.size(); i++)
        numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
    forceBuffers = new OpenCLArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers", false);
+    if (supports64BitGlobalAtomics) {
+        longForceBuffer = new OpenCLArray<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer", false);
+        reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
+        reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
+        reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
+        reduceForcesKernel.setArg<cl_int>(3, numForceBuffers);
+        addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
+    }
    addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
    force = new OpenCLArray<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force", true);
    energyBuffer = new OpenCLArray<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer", true);
@@ -328,7 +340,7 @@ void OpenCLContext::clearBuffer(OpenCLArray<mm_float4>& array) {
 void OpenCLContext::clearBuffer(cl::Memory& memory, int size) {
    clearBufferKernel.setArg<cl::Memory>(0, memory);
    clearBufferKernel.setArg<cl_int>(1, size);
-    executeKernel(clearBufferKernel, size);
+    executeKernel(clearBufferKernel, size, 128);
 }

 void OpenCLContext::addAutoclearBuffer(cl::Memory& memory, int size) {
@@ -348,7 +360,7 @@ void OpenCLContext::clearAutoclearBuffers() {
        clearFourBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
        clearFourBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
        clearFourBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
-        executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]));
+        executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
        base += 4;
    }
    if (total-base == 3) {
@@ -358,26 +370,33 @@ void OpenCLContext::clearAutoclearBuffers() {
        clearThreeBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
        clearThreeBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
        clearThreeBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
-        executeKernel(clearThreeBuffersKernel, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]));
+        executeKernel(clearThreeBuffersKernel, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
    }
    else if (total-base == 2) {
        clearTwoBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
        clearTwoBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
        clearTwoBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
        clearTwoBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
-        executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]));
+        executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
    }
    else if (total-base == 1) {
        clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
    }
 }

+void OpenCLContext::reduceForces() {
+    if (supports64BitGlobalAtomics)
+        executeKernel(reduceForcesKernel, paddedNumAtoms, 128);
+    else
+        reduceBuffer(*forceBuffers, numForceBuffers);
+}
+
 void OpenCLContext::reduceBuffer(OpenCLArray<mm_float4>& array, int numBuffers) {
    int bufferSize = array.getSize()/numBuffers;
    reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
    reduceFloat4Kernel.setArg<cl_int>(1, bufferSize);
    reduceFloat4Kernel.setArg<cl_int>(2, numBuffers);
-    executeKernel(reduceFloat4Kernel, bufferSize);
+    executeKernel(reduceFloat4Kernel, bufferSize, 128);
 }

 void OpenCLContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {

--- a/platforms/opencl/src/OpenCLContext.h
+++ b/platforms/opencl/src/OpenCLContext.h
@@ -215,6 +215,12 @@ public:
    OpenCLArray<mm_float4>& getForceBuffers() {
        return *forceBuffers;
    }
+    /**
+     * Get the array which contains a contribution to each force represented as 64 bit fixed point.
+     */
+    OpenCLArray<cl_long>& getLongForceBuffer() {
+        return *longForceBuffer;
+    }
    /**
     * Get the array which contains the buffer in which energy is computed.
     */
@@ -311,6 +317,10 @@ public:
     * @param numBuffers  the number of buffers packed into the array
     */
    void reduceBuffer(OpenCLArray<mm_float4>& array, int numBuffers);
+    /**
+     * Sum the buffesr containing forces.
+     */
+    void reduceForces();
    /**
     * Get the current simulation time.
     */
@@ -463,6 +473,7 @@ private:
    cl::Kernel clearThreeBuffersKernel;
    cl::Kernel clearFourBuffersKernel;
    cl::Kernel reduceFloat4Kernel;
+    cl::Kernel reduceForcesKernel;
    std::vector<OpenCLForceInfo*> forces;
    std::vector<MoleculeGroup> moleculeGroups;
    std::vector<mm_int4> posCellOffsets;
@@ -470,6 +481,7 @@ private:
    OpenCLArray<mm_float4>* velm;
    OpenCLArray<mm_float4>* force;
    OpenCLArray<mm_float4>* forceBuffers;
+    OpenCLArray<cl_long>* longForceBuffer;
    OpenCLArray<cl_float>* energyBuffer;
    OpenCLArray<cl_int>* atomIndex;
    std::vector<cl::Memory*> autoclearBuffers;

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
@@ -572,7 +572,8 @@ private:
 class OpenCLCalcGBSAOBCForceKernel : public CalcGBSAOBCForceKernel {
 public:
    OpenCLCalcGBSAOBCForceKernel(std::string name, const Platform& platform, OpenCLContext& cl) : CalcGBSAOBCForceKernel(name, platform), cl(cl),
-            hasCreatedKernels(false), params(NULL), bornSum(NULL), bornRadii(NULL), bornForce(NULL), obcChain(NULL) {
+            hasCreatedKernels(false), params(NULL), bornSum(NULL), longBornSum(NULL), bornRadii(NULL), bornForce(NULL),
+            longBornForce(NULL), obcChain(NULL) {
    }
    ~OpenCLCalcGBSAOBCForceKernel();
    /**
@@ -598,8 +599,10 @@ private:
    OpenCLContext& cl;
    OpenCLArray<mm_float2>* params;
    OpenCLArray<cl_float>* bornSum;
+    OpenCLArray<cl_long>* longBornSum;
    OpenCLArray<cl_float>* bornRadii;
    OpenCLArray<cl_float>* bornForce;
+    OpenCLArray<cl_long>* longBornForce;
    OpenCLArray<cl_float>* obcChain;
    cl::Kernel computeBornSumKernel;
    cl::Kernel reduceBornSumKernel;
@@ -613,8 +616,8 @@ private:
 class OpenCLCalcCustomGBForceKernel : public CalcCustomGBForceKernel {
 public:
    OpenCLCalcCustomGBForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, System& system) : CalcCustomGBForceKernel(name, platform),
-            hasInitializedKernels(false), cl(cl), params(NULL), computedValues(NULL), energyDerivs(NULL), globals(NULL), valueBuffers(NULL),
-            tabulatedFunctionParams(NULL), system(system) {
+            hasInitializedKernels(false), cl(cl), params(NULL), computedValues(NULL), energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL),
+            valueBuffers(NULL), longValueBuffers(NULL), tabulatedFunctionParams(NULL), system(system) {
    }
    ~OpenCLCalcCustomGBForceKernel();
    /**
@@ -635,13 +638,15 @@ public:
    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
 private:
    bool hasInitializedKernels, needParameterGradient;
-    int maxTiles;
+    int maxTiles, numComputedValues;
    OpenCLContext& cl;
    OpenCLParameterSet* params;
    OpenCLParameterSet* computedValues;
    OpenCLParameterSet* energyDerivs;
+    OpenCLArray<cl_long>* longEnergyDerivs;
    OpenCLArray<cl_float>* globals;
    OpenCLArray<cl_float>* valueBuffers;
+    OpenCLArray<cl_long>* longValueBuffers;
    OpenCLArray<mm_float4>* tabulatedFunctionParams;
    std::vector<std::string> globalParamNames;
    std::vector<cl_float> globalParamValues;

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -48,9 +48,16 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
        numForceBuffers = numForceThreadBlocks;
    }
    else if (context.getSIMDWidth() == 32) {
-        numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
-        forceThreadBlockSize = 128;
-        numForceBuffers = numForceThreadBlocks;
+        if (context.getSupports64BitGlobalAtomics()) {
+            numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+            forceThreadBlockSize = 256;
+            numForceBuffers = 2;
+        }
+        else {
+            numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+            forceThreadBlockSize = 128;
+            numForceBuffers = numForceThreadBlocks;
+        }
    }
    else {
        numForceThreadBlocks = context.getNumThreadBlocks();
@@ -282,7 +289,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
            findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(6, blockBoundingBox->getDeviceBuffer());
            findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
            findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(8, interactionCount->getDeviceBuffer());
-            findInteractionsWithinBlocksKernel.setArg(9, OpenCLContext::ThreadBlockSize*sizeof(cl_uint), NULL);
+            findInteractionsWithinBlocksKernel.setArg(9, 128*sizeof(cl_uint), NULL);
            findInteractionsWithinBlocksKernel.setArg<cl_uint>(10, interactingTiles->getSize());
        }
    }
@@ -312,7 +319,7 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
    if (context.getSIMDWidth() == 32 && !deviceIsCpu) {
        findInteractionsWithinBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
        findInteractionsWithinBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
-        context.executeKernel(findInteractionsWithinBlocksKernel, context.getNumAtoms());
+        context.executeKernel(findInteractionsWithinBlocksKernel, context.getNumAtoms(), 128);
    }
 }

@@ -492,7 +499,10 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    // Set arguments to the Kernel.

    int index = 0;
-    kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
+    if (context.getSupports64BitGlobalAtomics())
+        kernel.setArg<cl::Memory>(index++, context.getLongForceBuffer().getDeviceBuffer());
+    else
+        kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
    kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
    kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
    kernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer());

--- a/platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#define TILE_SIZE 32
+#ifdef SUPPORTS_64_BIT_ATOMICS
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0xFFFFFFFF));
+#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0xFFFFFFFF));
+#else
 #define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
 #define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)];
+#endif
+#define TILE_SIZE 32

 /**
 * Compute a force based on pair interactions.
 */
-__kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
+__kernel void computeN2Energy(
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* forceBuffers,
+#else
+        __global float4* forceBuffers,
+#endif
+        __global float* energyBuffer, __local float4* local_force,
 	__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
        __global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
 #ifdef USE_CUTOFF
@@ -194,6 +206,22 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
        // Write results.  We need to coordinate between warps to make sure no two of them
        // ever try to write to the same piece of memory at the same time.
        
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        if (pos < end) {
+            const unsigned int offset = x*TILE_SIZE + tgx;
+            atom_add(&forceBuffers[offset], (long) (force.x*0xFFFFFFFF));
+            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0xFFFFFFFF));
+            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0xFFFFFFFF));
+            STORE_DERIVATIVES_1
+        }
+        if (pos < end && x != y) {
+            const unsigned int offset = y*TILE_SIZE + tgx;
+            atom_add(&forceBuffers[offset], (long) (local_force[get_local_id(0)].x*0xFFFFFFFF));
+            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0xFFFFFFFF));
+            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0xFFFFFFFF));
+            STORE_DERIVATIVES_2
+        }
+#else
        int writeX = (pos < end ? x : -1);
        int writeY = (pos < end && x != y ? y : -1);
        if (tgx == 0)
@@ -245,6 +273,7 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
                }
            }
        }
+#endif
        pos++;
    } while (pos < end);
    energyBuffer[get_global_id(0)] += energy;

--- a/platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#ifdef SUPPORTS_64_BIT_ATOMICS
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#endif
 #define TILE_SIZE 32

 /**
 * Compute a value based on pair interactions.
 */
 __kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
-        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
-        __local float* tempBuffer,
+        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices,
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* global_value,
+#else
+        __global float* global_value,
+#endif
+        __local float* local_value, __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
 #else
@@ -164,16 +172,10 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,

                                // Sum the forces on atom2.

-                                if (tgx % 2 == 0)
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
                                if (tgx % 4 == 0)
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
-                                if (tgx % 8 == 0)
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
-                                if (tgx % 16 == 0)
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]+tempBuffer[get_local_id(0)+2]+tempBuffer[get_local_id(0)+3];
                                if (tgx == 0)
-                                    local_value[tbx+j] += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
+                                    local_value[tbx+j] += tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
                            }
                        }
                    }
@@ -233,6 +235,16 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
        // Write results.  We need to coordinate between warps to make sure no two of them
        // ever try to write to the same piece of memory at the same time.
        
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        if (pos < end) {
+            const unsigned int offset = x*TILE_SIZE + tgx;
+            atom_add(&global_value[offset], (long) (value*0xFFFFFFFF));
+        }
+        if (pos < end && x != y) {
+            const unsigned int offset = y*TILE_SIZE + tgx;
+            atom_add(&global_value[offset], (long) (local_value[get_local_id(0)]*0xFFFFFFFF));
+        }
+#else
        int writeX = (pos < end ? x : -1);
        int writeY = (pos < end && x != y ? y : -1);
        if (tgx == 0)
@@ -282,6 +294,7 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
                }
            }
        }
+#endif
        lasty = y;
        pos++;
    } while (pos < end);

--- a/platforms/opencl/src/kernels/customGBValuePerParticle.cl
+++ b/platforms/opencl/src/kernels/customGBValuePerParticle.cl
@@ -2,17 +2,26 @@
 * Reduce a pairwise computed value, and compute per-particle values.
 */

-__kernel void computePerParticleValues(int bufferSize, int numBuffers, __global float* valueBuffers, __global float4* posq
+__kernel void computePerParticleValues(int bufferSize, int numBuffers, __global float4* posq,
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* valueBuffers
+#else
+        __global float* valueBuffers
+#endif
        PARAMETER_ARGUMENTS) {
    unsigned int index = get_global_id(0);
    while (index < NUM_ATOMS) {
        // Reduce the pairwise value

+#ifdef SUPPORTS_64_BIT_ATOMICS
+        float sum = (1.0f/0xFFFFFFFF)*valueBuffers[index];
+#else
        int totalSize = bufferSize*numBuffers;
        float sum = valueBuffers[index];
        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
            sum += valueBuffers[i];
-
+#endif
+        
        // Now calculate other values

        float4 pos = posq[index];

--- a/platforms/opencl/src/kernels/findInteractingBlocks.cl
+++ b/platforms/opencl/src/kernels/findInteractingBlocks.cl
@@ -267,9 +267,9 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB

            float4 delta = apos-center;
 #ifdef USE_PERIODIC
-            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
            delta = max((float4) 0.0f, fabs(delta)-boxSize);
            int thread = get_local_id(0);
@@ -278,31 +278,16 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB
            // Sum the flags.

 #ifdef WARPS_ARE_ATOMIC
-            if (index % 2 == 0)
-                flags[thread] += flags[thread+1];
            if (index % 4 == 0)
-                flags[thread] += flags[thread+2];
-            if (index % 8 == 0)
-                flags[thread] += flags[thread+4];
-            if (index % 16 == 0)
-                flags[thread] += flags[thread+8];
+                flags[thread] += flags[thread+1]+flags[thread+2]+flags[thread+3];
 #else
            barrier(CLK_LOCAL_MEM_FENCE);
-            if (index % 2 == 0)
-                flags[thread] += flags[thread+1];
-            barrier(CLK_LOCAL_MEM_FENCE);
            if (index % 4 == 0)
-                flags[thread] += flags[thread+2];
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (index % 8 == 0)
-                flags[thread] += flags[thread+4];
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (index % 16 == 0)
-                flags[thread] += flags[thread+8];
+                flags[thread] += flags[thread+1]+flags[thread+2]+flags[thread+3];
            barrier(CLK_LOCAL_MEM_FENCE);
 #endif
            if (index == 0) {
-                unsigned int allFlags = flags[thread] + flags[thread+16];
+                unsigned int allFlags = flags[thread]+flags[thread+4]+flags[thread+8]+flags[thread+12]+flags[thread+16]+flags[thread+20]+flags[thread+24]+flags[thread+28];

                // Count how many flags are set, and based on that decide whether to compute all interactions
                // or only a fraction of them.

--- a/platforms/opencl/src/kernels/gbsaObcReductions.cl
+++ b/platforms/opencl/src/kernels/gbsaObcReductions.cl
@@ -7,15 +7,24 @@
 */

 __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma,
-            __global float* bornSum, __global float2* params, __global float* bornRadii, __global float* obcChain) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
+            __global long* bornSum,
+#else
+            __global float* bornSum,
+#endif
+            __global float2* params, __global float* bornRadii, __global float* obcChain) {
    unsigned int index = get_global_id(0);
    while (index < NUM_ATOMS) {
        // Get summed Born data

        int totalSize = bufferSize*numBuffers;
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        float sum = (1.0f/0xFFFFFFFF)*bornSum[index];
+#else
        float sum = bornSum[index];
        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
            sum += bornSum[i];
+#endif

        // Now calculate Born radius and OBC term.

@@ -38,18 +47,24 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
 * Reduce the Born force.
 */

-__kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce, __global float* energyBuffer,
-            __global float2* params, __global float* bornRadii, __global float* obcChain) {
+__kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce,
+#ifdef SUPPORTS_64_BIT_ATOMICS
+            __global long* bornForceIn,
+#endif
+            __global float* energyBuffer, __global float2* params, __global float* bornRadii, __global float* obcChain) {
    float energy = 0.0f;
    unsigned int index = get_global_id(0);
    while (index < NUM_ATOMS) {
        // Sum the Born force

        int totalSize = bufferSize*numBuffers;
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        float force = (1.0f/0xFFFFFFFF)*bornForceIn[index];
+#else
        float force = bornForce[index];
        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
            force += bornForce[i];
-
+#endif
        // Now calculate the actual force

        float offsetRadius = params[index].x;

--- a/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#ifdef SUPPORTS_64_BIT_ATOMICS
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#endif
 #define TILE_SIZE 32

 typedef struct {
@@ -14,7 +17,13 @@ typedef struct {
 /**
 * Compute the Born sum.
 */
-__kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
+__kernel void computeBornSum(
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* global_bornSum,
+#else
+        __global float* global_bornSum,
+#endif
+        __global float4* posq, __global float2* global_params,
        __local AtomData* localData, __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
@@ -174,16 +183,10 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po

                                // Sum the forces on atom j.

-                                if (tgx % 2 == 0)
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
                                if (tgx % 4 == 0)
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
-                                if (tgx % 8 == 0)
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
-                                if (tgx % 16 == 0)
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]+tempBuffer[get_local_id(0)+2]+tempBuffer[get_local_id(0)+3];
                                if (tgx == 0)
-                                    localData[tbx+j].bornSum += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
+                                    localData[tbx+j].bornSum += tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
                            }
                        }
                    }
@@ -245,6 +248,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
        // Write results.  We need to coordinate between warps to make sure no two of them
        // ever try to write to the same piece of memory at the same time.
        
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        if (pos < end) {
+            const unsigned int offset = x*TILE_SIZE + tgx;
+            atom_add(&global_bornSum[offset], (long) (bornSum*0xFFFFFFFF));
+        }
+        if (pos < end && x != y) {
+            const unsigned int offset = y*TILE_SIZE + tgx;
+            atom_add(&global_bornSum[offset], (long) (localData[get_local_id(0)].bornSum*0xFFFFFFFF));
+        }
+#else
        int writeX = (pos < end ? x : -1);
        int writeY = (pos < end && x != y ? y : -1);
        if (tgx == 0)
@@ -294,6 +307,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
                }
            }
        }
+#endif
        lasty = y;
        pos++;
    } while (pos < end);
@@ -303,8 +317,13 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
 * First part of computing the GBSA interaction.
 */

-__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
-        __global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
+__kernel void computeGBSAForce1(
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* forceBuffers, __global long* global_bornForce,
+#else
+        __global float4* forceBuffers, __global float* global_bornForce,
+#endif
+        __global float* energyBuffer, __global float4* posq, __global float* global_bornRadii,
        __local AtomData* localData, __local float4* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
@@ -460,16 +479,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e

                                // Sum the forces on atom j.

-                                if (tgx % 2 == 0)
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
                                if (tgx % 4 == 0)
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
-                                if (tgx % 8 == 0)
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
-                                if (tgx % 16 == 0)
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
+                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]+tempBuffer[get_local_id(0)+2]+tempBuffer[get_local_id(0)+3];
                                if (tgx == 0) {
-                                    float4 sum = tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
+                                    float4 sum = tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
                                    localData[tbx+j].fx += sum.x;
                                    localData[tbx+j].fy += sum.y;
                                    localData[tbx+j].fz += sum.z;
@@ -532,6 +545,22 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
        // Write results.  We need to coordinate between warps to make sure no two of them
        // ever try to write to the same piece of memory at the same time.
        
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        if (pos < end) {
+            const unsigned int offset = x*TILE_SIZE + tgx;
+            atom_add(&forceBuffers[offset], (long) (force.x*0xFFFFFFFF));
+            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0xFFFFFFFF));
+            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0xFFFFFFFF));
+            atom_add(&global_bornForce[offset], (long) (force.w*0xFFFFFFFF));
+        }
+        if (pos < end && x != y) {
+            const unsigned int offset = y*TILE_SIZE + tgx;
+            atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0xFFFFFFFF));
+            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0xFFFFFFFF));
+            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0xFFFFFFFF));
+            atom_add(&global_bornForce[offset], (long) (localData[get_local_id(0)].fw*0xFFFFFFFF));
+        }
+#else
        int writeX = (pos < end ? x : -1);
        int writeY = (pos < end && x != y ? y : -1);
        if (tgx == 0)
@@ -583,6 +612,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
                }
            }
        }
+#endif
        lasty = y;
        pos++;
    } while (pos < end);

--- a/platforms/opencl/src/kernels/nonbonded_nvidia.cl
+++ b/platforms/opencl/src/kernels/nonbonded_nvidia.cl
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#ifdef SUPPORTS_64_BIT_ATOMICS
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#endif
 #define TILE_SIZE 32

 typedef struct {
@@ -11,7 +14,13 @@ typedef struct {
 /**
 * Compute nonbonded interactions.
 */
-__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
+__kernel void computeNonbonded(
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* forceBuffers,
+#else
+        __global float4* forceBuffers,
+#endif
+        __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float* tempBuffer,
        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
@@ -157,6 +166,14 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                            if ((flags&(1<<j)) != 0) {
                                bool isExcluded = false;
                                int atom2 = tbx+j;
+                                int bufferIndex = 3*get_local_id(0);
+#ifdef USE_SYMMETRIC
+                                float dEdR = 0.0f;
+#else
+                                float4 dEdR1 = (float4) 0.0f;
+                                float4 dEdR2 = (float4) 0.0f;
+#endif
+                                float tempEnergy = 0.0f;
                                float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
                                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
@@ -165,20 +182,18 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
                                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                                float invR = RSQRT(r2);
-                                float r = RECIP(invR);
-                                LOAD_ATOM2_PARAMETERS
-                                atom2 = y*TILE_SIZE+j;
-#ifdef USE_SYMMETRIC
-                                float dEdR = 0.0f;
-#else
-                                float4 dEdR1 = (float4) 0.0f;
-                                float4 dEdR2 = (float4) 0.0f;
+#ifdef USE_CUTOFF
+                                if (r2 < CUTOFF_SQUARED) {
+#endif
+                                    float invR = RSQRT(r2);
+                                    float r = RECIP(invR);
+                                    LOAD_ATOM2_PARAMETERS
+                                    atom2 = y*TILE_SIZE+j;
+                                    COMPUTE_INTERACTION
+                                    energy += tempEnergy;
+#ifdef USE_CUTOFF
+                                }
 #endif
-                                float tempEnergy = 0.0f;
-                                COMPUTE_INTERACTION
-                                energy += tempEnergy;
-                                int bufferIndex = 3*get_local_id(0);
 #ifdef USE_SYMMETRIC
                                delta.xyz *= dEdR;
                                force.xyz -= delta.xyz;
@@ -194,30 +209,15 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en

                                // Sum the forces on atom2.

-                                if (tgx % 2 == 0) {
-                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3];
-                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4];
-                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5];
-                                }
                                if (tgx % 4 == 0) {
-                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+6];
-                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+7];
-                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+8];
-                                }
-                                if (tgx % 8 == 0) {
-                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+12];
-                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+13];
-                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+14];
-                                }
-                                if (tgx % 16 == 0) {
-                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+24];
-                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+25];
-                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+26];
+                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
+                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
+                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
                                }
                                if (tgx == 0) {
-                                    localData[tbx+j].fx += tempBuffer[bufferIndex] + tempBuffer[bufferIndex+48];
-                                    localData[tbx+j].fy += tempBuffer[bufferIndex+1] + tempBuffer[bufferIndex+49];
-                                    localData[tbx+j].fz += tempBuffer[bufferIndex+2] + tempBuffer[bufferIndex+50];
+                                    localData[tbx+j].fx += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
+                                    localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
+                                    localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
                                }
                            }
                        }
@@ -246,30 +246,36 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
                        float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                        float invR = RSQRT(r2);
-                        float r = RECIP(invR);
-                        LOAD_ATOM2_PARAMETERS
-                        atom2 = y*TILE_SIZE+tj;
+#ifdef USE_CUTOFF
+                        if (r2 < CUTOFF_SQUARED) {
+#endif
+                            float invR = RSQRT(r2);
+                            float r = RECIP(invR);
+                            LOAD_ATOM2_PARAMETERS
+                            atom2 = y*TILE_SIZE+tj;
 #ifdef USE_SYMMETRIC
-                        float dEdR = 0.0f;
+                            float dEdR = 0.0f;
 #else
-                        float4 dEdR1 = (float4) 0.0f;
-                        float4 dEdR2 = (float4) 0.0f;
+                            float4 dEdR1 = (float4) 0.0f;
+                            float4 dEdR2 = (float4) 0.0f;
 #endif
-                        float tempEnergy = 0.0f;
-                        COMPUTE_INTERACTION
-                        energy += tempEnergy;
+                            float tempEnergy = 0.0f;
+                            COMPUTE_INTERACTION
+                            energy += tempEnergy;
 #ifdef USE_SYMMETRIC
-                        delta.xyz *= dEdR;
-                        force.xyz -= delta.xyz;
-                        localData[tbx+tj].fx += delta.x;
-                        localData[tbx+tj].fy += delta.y;
-                        localData[tbx+tj].fz += delta.z;
+                            delta.xyz *= dEdR;
+                            force.xyz -= delta.xyz;
+                            localData[tbx+tj].fx += delta.x;
+                            localData[tbx+tj].fy += delta.y;
+                            localData[tbx+tj].fz += delta.z;
 #else
-                        force.xyz -= dEdR1.xyz;
-                        localData[tbx+tj].fx += dEdR2.x;
-                        localData[tbx+tj].fy += dEdR2.y;
-                        localData[tbx+tj].fz += dEdR2.z;
+                            force.xyz -= dEdR1.xyz;
+                            localData[tbx+tj].fx += dEdR2.x;
+                            localData[tbx+tj].fy += dEdR2.y;
+                            localData[tbx+tj].fz += dEdR2.z;
+#endif
+#ifdef USE_CUTOFF
+                        }
 #endif
 #ifdef USE_EXCLUSIONS
                        excl >>= 1;
@@ -283,6 +289,20 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
        // Write results.  We need to coordinate between warps to make sure no two of them
        // ever try to write to the same piece of memory at the same time.
        
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        if (pos < end) {
+            const unsigned int offset = x*TILE_SIZE + tgx;
+            atom_add(&forceBuffers[offset], (long) (force.x*0xFFFFFFFF));
+            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0xFFFFFFFF));
+            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0xFFFFFFFF));
+        }
+        if (pos < end && x != y) {
+            const unsigned int offset = y*TILE_SIZE + tgx;
+            atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0xFFFFFFFF));
+            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0xFFFFFFFF));
+            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0xFFFFFFFF));
+        }
+#else
        int writeX = (pos < end ? x : -1);
        int writeY = (pos < end && x != y ? y : -1);
        if (tgx == 0)
@@ -332,6 +352,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                }
            }
        }
+#endif
        lasty = y;
        pos++;
    } while (pos < end);

--- a/platforms/opencl/src/kernels/utilities.cl
+++ b/platforms/opencl/src/kernels/utilities.cl
@@ -2,23 +2,23 @@
 * Fill a buffer with 0.
 */

-__kernel void clearBuffer(__global float* buffer, int size) {
+__kernel void clearBuffer(__global int* buffer, int size) {
    int index = get_global_id(0);
-    __global float4* buffer4 = (__global float4*) buffer;
+    __global int4* buffer4 = (__global int4*) buffer;
    int sizeDiv4 = size/4;
    while (index < sizeDiv4) {
-        buffer4[index] = (float4) (0.0f);
+        buffer4[index] = (int4) 0;
        index += get_global_size(0);
    }
    if (get_global_id(0) == 0)
        for (int i = sizeDiv4*4; i < size; i++)
-            buffer[i] = 0.0f;
+            buffer[i] = 0;
 }

 /**
 * Fill two buffers with 0.
 */
-__kernel void clearTwoBuffers(__global float* buffer1, int size1, __global float* buffer2, int size2) {
+__kernel void clearTwoBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2) {
    clearBuffer(buffer1, size1);
    clearBuffer(buffer2, size2);
 }
@@ -26,7 +26,7 @@ __kernel void clearTwoBuffers(__global float* buffer1, int size1, __global float
 /**
 * Fill three buffers with 0.
 */
-__kernel void clearThreeBuffers(__global float* buffer1, int size1, __global float* buffer2, int size2, __global float* buffer3, int size3) {
+__kernel void clearThreeBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3) {
    clearBuffer(buffer1, size1);
    clearBuffer(buffer2, size2);
    clearBuffer(buffer3, size3);
@@ -35,7 +35,7 @@ __kernel void clearThreeBuffers(__global float* buffer1, int size1, __global flo
 /**
 * Fill four buffers with 0.
 */
-__kernel void clearFourBuffers(__global float* buffer1, int size1, __global float* buffer2, int size2, __global float* buffer3, int size3, __global float* buffer4, int size4) {
+__kernel void clearFourBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3, __global int* buffer4, int size4) {
    clearBuffer(buffer1, size1);
    clearBuffer(buffer2, size2);
    clearBuffer(buffer3, size3);
@@ -58,6 +58,20 @@ __kernel void reduceFloat4Buffer(__global float4* buffer, int bufferSize, int nu
    }
 }

+/**
+ * Sum the various buffers containing forces.
+ */
+__kernel void reduceForces(__global long* longBuffer, __global float4* buffer, int bufferSize, int numBuffers) {
+    int totalSize = bufferSize*numBuffers;
+    float scale = 1.0f/(float) 0xFFFFFFFF;
+    for (int index = get_global_id(0); index < bufferSize; index += get_global_size(0)) {
+        float4 sum = (float4) (scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0.0f);
+        for (int i = index; i < totalSize; i += bufferSize)
+            sum += buffer[i];
+        buffer[index] = sum;
+    }
+}
+
 /**
 * This is called to determine the accuracy of various native functions.
 */

--- a/platforms/opencl/tests/TestOpenCLCustomNonbondedForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLCustomNonbondedForce.cpp
@@ -373,6 +373,12 @@ void testParallelComputation() {
    vector<Vec3> positions(numParticles);
    for (int i = 0; i < numParticles; i++)
        positions[i] = Vec3(5*genrand_real2(sfmt), 5*genrand_real2(sfmt), 5*genrand_real2(sfmt));
+    for (int i = 0; i < numParticles; ++i)
+        for (int j = 0; j < i; ++j) {
+            Vec3 delta = positions[i]-positions[j];
+            if (delta.dot(delta) < 0.1)
+                force->addExclusion(i, j);
+        }
    VerletIntegrator integrator1(0.01);
    map<string, string> props1;
    props1[OpenCLPlatform::OpenCLDeviceIndex()] = "0";

--- a/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
@@ -691,6 +691,12 @@ void testParallelComputation() {
    vector<Vec3> positions(numParticles);
    for (int i = 0; i < numParticles; i++)
        positions[i] = Vec3(5*genrand_real2(sfmt), 5*genrand_real2(sfmt), 5*genrand_real2(sfmt));
+    for (int i = 0; i < numParticles; ++i)
+        for (int j = 0; j < i; ++j) {
+            Vec3 delta = positions[i]-positions[j];
+            if (delta.dot(delta) < 0.1)
+                force->addException(i, j, 0, 1, 0);
+        }
    VerletIntegrator integrator1(0.01);
    map<string, string> props1;
    props1[OpenCLPlatform::OpenCLDeviceIndex()] = "0";