Merged 5.1Optimizations branch back to trunk

93c467b2 · Peter Eastman · f6d4557d · 93c467b2 · 93c467b2 · 93c467b2
Commit 93c467b2 authored Mar 22, 2013 by Peter Eastman
20 changed files
--- a/platforms/opencl/src/OpenCLSort.cpp
+++ b/platforms/opencl/src/OpenCLSort.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010-2013 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "OpenCLSort.h"
+#include "OpenCLKernelSources.h"
+#include <map>
+using namespace OpenMM;
+using namespace std;
+/**
+ * Create an OpenCLSort object for sorting arrays of a fixed length.
+ *
+ * The sort kernels are compiled with the trait's type and key definitions substituted
+ * into the kernel source, the work group sizes are chosen from the device limits, and,
+ * unless the data is short enough for the single-kernel local memory sort, the workspace
+ * arrays used by the bucket sort are allocated.
+ *
+ * If construction fails, everything acquired so far (including the trait) is released
+ * before the exception is rethrown.
+ *
+ * @param context    the context in which to perform calculations
+ * @param trait      a heap-allocated SortTrait defining the data to sort; this object takes ownership of it
+ * @param length     the length of the arrays this object will be used to sort
+ */
+OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
+            dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
+    try {
+        // Create kernels.
+        std::map<std::string, std::string> replacements;
+        replacements["DATA_TYPE"] = trait->getDataType();
+        replacements["KEY_TYPE"] =  trait->getKeyType();
+        replacements["SORT_KEY"] = trait->getSortKey();
+        replacements["MIN_KEY"] = trait->getMinKey();
+        replacements["MAX_KEY"] = trait->getMaxKey();
+        replacements["MAX_VALUE"] = trait->getMaxValue();
+        replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
+        cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
+        shortListKernel = cl::Kernel(program, "sortShortList");
+        computeRangeKernel = cl::Kernel(program, "computeRange");
+        assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
+        computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
+        copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets");
+        sortBucketsKernel = cl::Kernel(program, "sortBuckets");
+        // Work out the work group sizes for various kernels.
+        unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
+        int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
+        // Only half of the local memory is budgeted for the sort buffer.
+        unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
+        isShortList = (length <= maxLocalBuffer);
+        // Largest power of two that does not exceed maxGroupSize.
+        for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2)
+            ;
+        positionsKernelSize = rangeKernelSize;
+        sortKernelSize = (isShortList ? rangeKernelSize : rangeKernelSize/2);
+        if (rangeKernelSize > length)
+            rangeKernelSize = length;
+        if (sortKernelSize > maxLocalBuffer)
+            sortKernelSize = maxLocalBuffer;
+        // A device limited to very small work groups, or with very little local memory, can
+        // drive these sizes to zero.  Keep both at least 1 so that neither the bucket count
+        // below nor the work size computed in sort() divides by zero.
+        if (sortKernelSize < 1)
+            sortKernelSize = 1;
+        unsigned int targetBucketSize = sortKernelSize/2;
+        if (targetBucketSize < 1)
+            targetBucketSize = 1;
+        unsigned int numBuckets = length/targetBucketSize;
+        if (numBuckets < 1)
+            numBuckets = 1;
+        if (positionsKernelSize > numBuckets)
+            positionsKernelSize = numBuckets;
+        // Create workspace arrays.  They are only needed by the bucket sort.
+        if (!isShortList) {
+            dataRange = new OpenCLArray(context, 2, trait->getKeySize(), "sortDataRange");
+            bucketOffset = OpenCLArray::create<cl_uint>(context, numBuckets, "bucketOffset");
+            bucketOfElement = OpenCLArray::create<cl_uint>(context, length, "bucketOfElement");
+            offsetInBucket = OpenCLArray::create<cl_uint>(context, length, "offsetInBucket");
+            buckets = new OpenCLArray(context, length, trait->getDataSize(), "buckets");
+        }
+    }
+    catch (...) {
+        // The destructor is not run when a constructor throws, so release whatever was
+        // acquired before the failure.  Ownership of the trait has already passed to this
+        // object, so it is released here as well.
+        delete dataRange;
+        delete bucketOffset;
+        delete bucketOfElement;
+        delete offsetInBucket;
+        delete buckets;
+        delete trait;
+        throw;
+    }
+}
+/**
+ * Release the trait owned by this object along with any workspace arrays it allocated.
+ */
+OpenCLSort::~OpenCLSort() {
+    // Applying delete to a null pointer does nothing, so workspace arrays that were never
+    // allocated need no explicit check.
+    delete trait;
+    delete dataRange;
+    delete bucketOfElement;
+    delete offsetInBucket;
+    delete bucketOffset;
+    delete buckets;
+}
+/**
+ * Sort an array.
+ *
+ * @param data    the array to sort.  Its length and element size must match the length and
+ *                trait this object was created with.
+ * @throws OpenMMException if the array's length or element size does not match
+ */
+void OpenCLSort::sort(OpenCLArray& data) {
+    const bool sizeMatches = (data.getSize() == dataLength && data.getElementSize() == trait->getDataSize());
+    if (!sizeMatches)
+        throw OpenMMException("OpenCLSort called with different data size");
+    if (data.getSize() == 0)
+        return;
+
+    if (isShortList) {
+        // Short lists are handled by a single kernel that does the whole sort in local memory.
+        shortListKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+        shortListKernel.setArg<cl_uint>(1, dataLength);
+        shortListKernel.setArg(2, dataLength*trait->getDataSize(), NULL);
+        context.executeKernel(shortListKernel, sortKernelSize, sortKernelSize);
+        return;
+    }
+
+    // Step 1: find the range of data values.
+    computeRangeKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+    computeRangeKernel.setArg<cl_uint>(1, data.getSize());
+    computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer());
+    computeRangeKernel.setArg(3, rangeKernelSize*trait->getKeySize(), NULL);
+    context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);
+
+    // Step 2: decide which bucket each element belongs in.
+    unsigned int bucketCount = bucketOffset->getSize();
+    context.clearBuffer(*bucketOffset);
+    assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+    assignElementsKernel.setArg<cl_int>(1, data.getSize());
+    assignElementsKernel.setArg<cl_int>(2, bucketCount);
+    assignElementsKernel.setArg<cl::Buffer>(3, dataRange->getDeviceBuffer());
+    assignElementsKernel.setArg<cl::Buffer>(4, bucketOffset->getDeviceBuffer());
+    assignElementsKernel.setArg<cl::Buffer>(5, bucketOfElement->getDeviceBuffer());
+    assignElementsKernel.setArg<cl::Buffer>(6, offsetInBucket->getDeviceBuffer());
+    context.executeKernel(assignElementsKernel, data.getSize());
+
+    // Step 3: work out where each bucket starts.
+    computeBucketPositionsKernel.setArg<cl_int>(0, bucketCount);
+    computeBucketPositionsKernel.setArg<cl::Buffer>(1, bucketOffset->getDeviceBuffer());
+    computeBucketPositionsKernel.setArg(2, positionsKernelSize*sizeof(cl_int), NULL);
+    context.executeKernel(computeBucketPositionsKernel, positionsKernelSize, positionsKernelSize);
+
+    // Step 4: move the elements into their buckets.
+    copyToBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+    copyToBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
+    copyToBucketsKernel.setArg<cl_int>(2, data.getSize());
+    copyToBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
+    copyToBucketsKernel.setArg<cl::Buffer>(4, bucketOfElement->getDeviceBuffer());
+    copyToBucketsKernel.setArg<cl::Buffer>(5, offsetInBucket->getDeviceBuffer());
+    context.executeKernel(copyToBucketsKernel, data.getSize());
+
+    // Step 5: sort within each bucket.  The global work size is the data length rounded up
+    // to a whole number of work groups.
+    sortBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+    sortBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
+    sortBucketsKernel.setArg<cl_int>(2, bucketCount);
+    sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
+    sortBucketsKernel.setArg(4, sortKernelSize*trait->getDataSize(), NULL);
+    context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
+}
--- a/platforms/opencl/src/OpenCLSort.h
+++ b/platforms/opencl/src/OpenCLSort.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Portions copyright (c) 2010-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -28,9 +28,7 @@
 * -------------------------------------------------------------------------- */
 #include "OpenCLArray.h"
-#include "OpenCLKernelSources.h"
 #include "windowsExportOpenCL.h"
-#include <map>
 namespace OpenMM {
@@ -38,26 +36,19 @@ namespace OpenMM {
 * This class sorts arrays of values.  It supports any type of values, not just scalars,
 * so long as an appropriate sorting key can be defined by which to sort them.
 * 
- * The class is templatized by a "trait" class that defines the type of data to
+ * The sorting behavior is specified by a "trait" class that defines the type of data to
 * sort and the key for sorting it.  Here is an example of a trait class for
 * sorting floats:
 * 
- * struct FloatTrait {
+ * class FloatTrait : public OpenCLSort::SortTrait {
- *     // The name of the data and key types being sorted.
+ *     int getDataSize() const {return 4;}
- *     // Both the host type and OpenCL type is required.
+ *     int getKeySize() const {return 4;}
- *     // For primitive types they will be the same.
+ *     const char* getDataType() const {return "float";}
- *     typedef cl_float DataType;
+ *     const char* getKeyType() const {return "float";}
- *     typedef cl_float KeyType;
+ *     const char* getMinKey() const {return "-MAXFLOAT";}
- *     static const char* clDataType() {return "float";}
+ *     const char* getMaxKey() const {return "MAXFLOAT";}
- *     static const char* clKeyType() {return "float";}
+ *     const char* getMaxValue() const {return "MAXFLOAT";}
- *     // The minimum value a key can take.
+ *     const char* getSortKey() const {return "value";}
- *     static const char* clMinKey() {return "-MAXFLOAT";}
- *     // The maximum value a key can take.
- *     static const char* clMaxKey() {return "MAXFLOAT";}
- *     // A value whose key is guaranteed to equal clMaxKey().
- *     static const char* clMaxValue() {return "MAXFLOAT";}
- *     // The OpenCL code to select the key from the data value.
- *     static const char* clSortKey() {return "value";}
 * };
 *
 * The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
@@ -74,139 +65,76 @@ namespace OpenMM {
 * elements).
 */
-template <class TRAIT>
+class OPENMM_EXPORT_OPENCL OpenCLSort {
-class OpenCLSort {
 public:
+    class SortTrait;
    /**
     * Create an OpenCLSort object for sorting data of a particular type.
     *
     * @param context    the context in which to perform calculations
+     * @param trait      a SortTrait defining the type of data to sort.  It should have been allocated
+     *                   on the heap with the "new" operator.  This object takes over ownership of it,
+     *                   and deletes it when the OpenCLSort is deleted.
     * @param length     the length of the arrays this object will be used to sort
     */
-    OpenCLSort(OpenCLContext& context, unsigned int length) : context(context),
+    OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length);
-            dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
+    ~OpenCLSort();
-        // Create kernels.
-        std::map<std::string, std::string> replacements;
-        replacements["DATA_TYPE"] = TRAIT::clDataType();
-        replacements["KEY_TYPE"] =  TRAIT::clKeyType();
-        replacements["SORT_KEY"] = TRAIT::clSortKey();
-        replacements["MIN_KEY"] = TRAIT::clMinKey();
-        replacements["MAX_KEY"] = TRAIT::clMaxKey();
-        replacements["MAX_VALUE"] = TRAIT::clMaxValue();
-        replacements["VALUE_IS_INT2"] = (TRAIT::clDataType() == std::string("int2") ? "1" : "0");
-        cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
-        computeRangeKernel = cl::Kernel(program, "computeRange");
-        assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
-        computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
-        copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets");
-        sortBucketsKernel = cl::Kernel(program, "sortBuckets");
-        // Work out the work group sizes for various kernels.
-        unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
-        for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2)
-            ;
-        positionsKernelSize = rangeKernelSize;
-        sortKernelSize = rangeKernelSize/2;
-        if (rangeKernelSize > length)
-            rangeKernelSize = length;
-        unsigned int maxLocalBuffer = (unsigned int) ((context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()/sizeof(typename TRAIT::DataType))/2);
-        if (sortKernelSize > maxLocalBuffer)
-            sortKernelSize = maxLocalBuffer;
-        unsigned int targetBucketSize = sortKernelSize/2;
-        unsigned int numBuckets = length/targetBucketSize;
-        if (numBuckets < 1)
-            numBuckets = 1;
-        if (positionsKernelSize > numBuckets)
-            positionsKernelSize = numBuckets;
-        // Create workspace arrays.
-        dataRange = OpenCLArray::create<typename TRAIT::KeyType>(context, 2, "sortDataRange");
-        bucketOffset = OpenCLArray::create<cl_uint>(context, numBuckets, "bucketOffset");
-        bucketOfElement = OpenCLArray::create<cl_uint>(context, length, "bucketOfElement");
-        offsetInBucket = OpenCLArray::create<cl_uint>(context, length, "offsetInBucket");
-        buckets = OpenCLArray::create<typename TRAIT::DataType>(context, length, "buckets");
-    }
-    ~OpenCLSort() {
-        if (dataRange != NULL)
-            delete dataRange;
-        if (bucketOfElement != NULL)
-            delete bucketOfElement;
-        if (offsetInBucket != NULL)
-            delete offsetInBucket;
-        if (bucketOffset != NULL)
-            delete bucketOffset;
-        if (buckets != NULL)
-            delete buckets;
-    }
    /**
     * Sort an array.
     */
-    void sort(OpenCLArray& data) {
+    void sort(OpenCLArray& data);
-        if (data.getSize() != bucketOfElement->getSize())
-            throw OpenMMException("OpenCLSort called with different data size");
-        if (data.getSize() == 0)
-            return;
-        // Compute the range of data values.
-        computeRangeKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
-        computeRangeKernel.setArg<cl_uint>(1, data.getSize());
-        computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer());
-        computeRangeKernel.setArg(3, rangeKernelSize*sizeof(typename TRAIT::KeyType), NULL);
-        context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);
-        // Assign array elements to buckets.
-        unsigned int numBuckets = bucketOffset->getSize();
-        context.clearBuffer(*bucketOffset);
-        assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
-        assignElementsKernel.setArg<cl_int>(1, data.getSize());
-        assignElementsKernel.setArg<cl_int>(2, numBuckets);
-        assignElementsKernel.setArg<cl::Buffer>(3, dataRange->getDeviceBuffer());
-        assignElementsKernel.setArg<cl::Buffer>(4, bucketOffset->getDeviceBuffer());
-        assignElementsKernel.setArg<cl::Buffer>(5, bucketOfElement->getDeviceBuffer());
-        assignElementsKernel.setArg<cl::Buffer>(6, offsetInBucket->getDeviceBuffer());
-        context.executeKernel(assignElementsKernel, data.getSize());
-        // Compute the position of each bucket.
-        computeBucketPositionsKernel.setArg<cl_int>(0, numBuckets);
-        computeBucketPositionsKernel.setArg<cl::Buffer>(1, bucketOffset->getDeviceBuffer());
-        computeBucketPositionsKernel.setArg(2, positionsKernelSize*sizeof(cl_int), NULL);
-        context.executeKernel(computeBucketPositionsKernel, positionsKernelSize, positionsKernelSize);
-        // Copy the data into the buckets.
-        copyToBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
-        copyToBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
-        copyToBucketsKernel.setArg<cl_int>(2, data.getSize());
-        copyToBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
-        copyToBucketsKernel.setArg<cl::Buffer>(4, bucketOfElement->getDeviceBuffer());
-        copyToBucketsKernel.setArg<cl::Buffer>(5, offsetInBucket->getDeviceBuffer());
-        context.executeKernel(copyToBucketsKernel, data.getSize());
-        // Sort each bucket.
-        sortBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
-        sortBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
-        sortBucketsKernel.setArg<cl_int>(2, numBuckets);
-        sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
-        sortBucketsKernel.setArg(4, sortKernelSize*sizeof(typename TRAIT::DataType), NULL);
-        context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
-    }
 private:
    OpenCLContext& context;
+    SortTrait* trait;
    OpenCLArray* dataRange;
    OpenCLArray* bucketOfElement;
    OpenCLArray* offsetInBucket;
    OpenCLArray* bucketOffset;
    OpenCLArray* buckets;
-    cl::Kernel computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
+    cl::Kernel shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
-    unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize;
+    unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
+    bool isShortList;
+};
+/**
+ * A subclass of SortTrait defines the type of value to sort, and the key for sorting them.
+ * OpenCLSort substitutes the strings returned by the type, key and value methods into the
+ * source of its sort kernels, and uses the two sizes to size its buffers.
+ */
+class OpenCLSort::SortTrait {
+public:
+    virtual ~SortTrait() {
+    }
+    /**
+     * Get the size of each data value in bytes.
+     */
+    virtual int getDataSize() const = 0;
+    /**
+     * Get the size of each key value in bytes.
+     */
+    virtual int getKeySize() const = 0;
+    /**
+     * Get the data type of the values to sort, as the name of an OpenCL type
+     * (for example "float").
+     */
+    virtual const char* getDataType() const = 0;
+    /**
+     * Get the data type of the sorting key, as the name of an OpenCL type
+     * (for example "float").
+     */
+    virtual const char* getKeyType() const = 0;
+    /**
+     * Get the minimum value a key can take, written as OpenCL code
+     * (for example "-MAXFLOAT").
+     */
+    virtual const char* getMinKey() const = 0;
+    /**
+     * Get the maximum value a key can take, written as OpenCL code
+     * (for example "MAXFLOAT").
+     */
+    virtual const char* getMaxKey() const = 0;
+    /**
+     * Get a value whose key is guaranteed to equal getMaxKey().
+     */
+    virtual const char* getMaxValue() const = 0;
+    /**
+     * Get the OpenCL code to select the key from the data value.  When the
+     * value is its own key, this is simply "value".
+     */
+    virtual const char* getSortKey() const = 0;
 };
 } // namespace OpenMM

--- a/platforms/opencl/src/kernels/andersenThermostat.cl
+++ b/platforms/opencl/src/kernels/andersenThermostat.cl
@@ -4,14 +4,14 @@
 __kernel void applyAndersenThermostat(float collisionFrequency, float kT, __global mixed4* velm, __global const mixed2* restrict stepSize, __global const float4* restrict random,
        unsigned int randomIndex, __global const int* restrict atomGroups) {
-    float collisionProbability = 1.0f-exp(-collisionFrequency*stepSize[0].y);
+    float collisionProbability = 1.0f-EXP(-collisionFrequency*stepSize[0].y);
-    float randomRange = erf(collisionProbability/sqrt(2.0f));
+    float randomRange = erf(collisionProbability/SQRT(2.0f));
    for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
        mixed4 velocity = velm[index];
        float4 selectRand = random[randomIndex+atomGroups[index]];
        float4 velRand = random[randomIndex+index];
        real scale = (selectRand.w > -randomRange && selectRand.w < randomRange ? 0 : 1);
-        real add = (1-scale)*sqrt(kT*velocity.w);
+        real add = (1-scale)*SQRT(kT*velocity.w);
        velocity.x = scale*velocity.x + add*velRand.x;
        velocity.y = scale*velocity.y + add*velRand.y;
        velocity.z = scale*velocity.z + add*velRand.z;

--- a/platforms/opencl/src/kernels/brownian.cl
+++ b/platforms/opencl/src/kernels/brownian.cl
@@ -8,9 +8,9 @@ __kernel void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, __gl
    for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
        mixed invMass = velm[index].w;
        if (invMass != 0) {
-            posDelta[index] = (mixed4) (tauDeltaT*invMass*force[index].x + noiseAmplitude*sqrt(invMass)*random[randomIndex].x,
+            posDelta[index] = (mixed4) (tauDeltaT*invMass*force[index].x + noiseAmplitude*SQRT(invMass)*random[randomIndex].x,
-                                        tauDeltaT*invMass*force[index].y + noiseAmplitude*sqrt(invMass)*random[randomIndex].y,
+                                        tauDeltaT*invMass*force[index].y + noiseAmplitude*SQRT(invMass)*random[randomIndex].y,
-                                        tauDeltaT*invMass*force[index].z + noiseAmplitude*sqrt(invMass)*random[randomIndex].z, 0);
+                                        tauDeltaT*invMass*force[index].z + noiseAmplitude*SQRT(invMass)*random[randomIndex].z, 0);
        }
        randomIndex += get_global_size(0);
    }

--- a/platforms/opencl/src/kernels/ccma.cl
+++ b/platforms/opencl/src/kernels/ccma.cl
@@ -10,7 +10,8 @@ mixed4 loadPos(__global const real4* restrict posq, __global const real4* restri
 /**
 * Compute the direction each constraint is pointing in.  This is called once at the beginning of constraint evaluation.
 */
-__kernel void computeConstraintDirections(__global const int2* restrict constraintAtoms, __global mixed4* restrict constraintDistance, __global const real4* restrict atomPositions, __global const real4* restrict posCorrection) {
+__kernel void computeConstraintDirections(__global const int2* restrict constraintAtoms, __global mixed4* restrict constraintDistance,
+        __global const real4* restrict atomPositions, __global const real4* restrict posCorrection, __global int* restrict converged) {
    for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
        // Compute the direction for this constraint.
@@ -23,6 +24,10 @@ __kernel void computeConstraintDirections(__global const int2* restrict constrai
        dir.z = oldPos1.z-oldPos2.z;
        constraintDistance[index] = dir;
    }
+    if (get_global_id(0) == 0) {
+        converged[0] = 1;
+        converged[1] = 0;
+    }
 }
 /**

--- a/platforms/opencl/src/kernels/coulombLennardJones.cl
+++ b/platforms/opencl/src/kernels/coulombLennardJones.cl
 #if USE_EWALD
-bool needCorrection = isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
+bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
 if (!isExcluded || needCorrection) {
-    real tempForce = 0;
    if (r2 < CUTOFF_SQUARED || needCorrection) {
        const real alphaR = EWALD_ALPHA*r;
        const real expAlphaRSqr = EXP(-alphaR*alphaR);
@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
        t *= t;
        t *= t;
        const real erfcAlphaR = RECIP(t*t);
+        real tempForce = 0;
        if (needCorrection) {
            // Subtract off the part of this interaction that was included in the reciprocal space contribution.
@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
            tempEnergy += prefactor*erfcAlphaR;
 #endif
        }
+        dEdR += tempForce*invR*invR;
    }
-    dEdR += tempForce*invR*invR;
 }
 #else
 {

--- a/platforms/opencl/src/kernels/customGBEnergyN2.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2.cl
+#ifdef SUPPORTS_64_BIT_ATOMICS // Accumulate with atom_add: each derivative is multiplied by 0x100000000 and cast to long.
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000));
+#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0x100000000));
+#else // Without 64-bit atomics, add directly into the separate derivBuffers<INDEX> array for each derivative.
+#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
+#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)];
+#endif // SUPPORTS_64_BIT_ATOMICS
+/**
+ * Compute a force based on pair interactions.
+ */
+__kernel void computeN2Energy(
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* restrict forceBuffers,
+#else
+        __global real4* restrict forceBuffers,
+#endif
+        __global real* restrict energyBuffer, __local real4* restrict local_force,
+	__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
+        __global const ushort2* exclusionTiles,
+#ifdef USE_CUTOFF
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms
+#else
+        unsigned int numTiles
+#endif
+        PARAMETER_ARGUMENTS) {
+    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
+    const unsigned int warp = get_global_id(0)/TILE_SIZE;
+    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
+    const unsigned int tbx = get_local_id(0) - tgx;
+    real energy = 0;
+    // First loop: process tiles that contain exclusions.
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
+        real4 force = 0;
+        DECLARE_ATOM1_DERIVATIVES
+        unsigned int atom1 = x*TILE_SIZE + tgx;
+        real4 posq1 = posq[atom1];
+        LOAD_ATOM1_PARAMETERS
+#ifdef USE_EXCLUSIONS
+        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
+#endif
+        if (x == y) {
+            // This tile is on the diagonal.
+            const unsigned int localAtomIndex = get_local_id(0);
+            local_posq[localAtomIndex] = posq1;
+            LOAD_LOCAL_PARAMETERS_FROM_1
+            SYNC_WARPS;
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                int atom2 = tbx+j;
+                real4 posq2 = local_posq[atom2];
+                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                if (r2 < CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = RECIP(invR);
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+j;
+                    real dEdR = 0;
+                    real tempEnergy = 0;
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(excl & 0x1);
+#endif
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+                        COMPUTE_INTERACTION
+                        dEdR /= -r;
+                    }
+                    energy += 0.5f*tempEnergy;
+                    delta.xyz *= dEdR;
+                    force.xyz -= delta.xyz;
+#ifdef USE_CUTOFF
+                }
+#endif
+#ifdef USE_EXCLUSIONS
+                excl >>= 1;
+#endif
+                SYNC_WARPS;
+            }
+        }
+        else {
+            // This is an off-diagonal tile.
+            const unsigned int localAtomIndex = get_local_id(0);
+            unsigned int j = y*TILE_SIZE + tgx;
+            local_posq[localAtomIndex] = posq[j];
+            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+            local_force[localAtomIndex] = 0;
+            CLEAR_LOCAL_DERIVATIVES
+            SYNC_WARPS;
+#ifdef USE_EXCLUSIONS
+            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+#endif
+            unsigned int tj = tgx;
+            for (j = 0; j < TILE_SIZE; j++) {
+                int atom2 = tbx+tj;
+                real4 posq2 = local_posq[atom2];
+                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                if (r2 < CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = RECIP(invR);
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+tj;
+                    real dEdR = 0;
+                    real tempEnergy = 0;
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(excl & 0x1);
+#endif
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                        COMPUTE_INTERACTION
+                        dEdR /= -r;
+                    }
+                    energy += tempEnergy;
+                    delta.xyz *= dEdR;
+                    force.xyz -= delta.xyz;
+                    atom2 = tbx+tj;
+                    local_force[atom2].xyz += delta.xyz;
+                    RECORD_DERIVATIVE_2
+#ifdef USE_CUTOFF
+                }
+#endif
+#ifdef USE_EXCLUSIONS
+                excl >>= 1;
+#endif
+                tj = (tj + 1) & (TILE_SIZE - 1);
+                SYNC_WARPS;
+            }
+        }
+        // Write results.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        unsigned int offset = x*TILE_SIZE + tgx;
+        atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
+        atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+        atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+        STORE_DERIVATIVES_1
+        if (x != y) {
+            offset = y*TILE_SIZE + tgx;
+            atom_add(&forceBuffers[offset], (long) (local_force[get_local_id(0)].x*0x100000000));
+            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000));
+            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000));
+            STORE_DERIVATIVES_2
+        }
+#else
+        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
+        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
+        unsigned int offset = offset1;
+        forceBuffers[offset1].xyz += force.xyz;
+        STORE_DERIVATIVES_1
+        if (x != y) {
+            offset = offset2;
+            forceBuffers[offset2] += (real4) (local_force[get_local_id(0)].x, local_force[get_local_id(0)].y, local_force[get_local_id(0)].z, 0.0f);
+            STORE_DERIVATIVES_2
+        }
+#endif
+    }
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+    int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+#else
+    int pos = warp*numTiles/totalWarps;
+    int end = (warp+1)*numTiles/totalWarps;
+#endif
+    int skipBase = 0;
+    int currentSkipIndex = tbx;
+    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
+    __local int skipTiles[FORCE_WORK_GROUP_SIZE];
+    skipTiles[get_local_id(0)] = -1;
+    while (pos < end) {
+        const bool isExcluded = false;
+        real4 force = 0;
+        DECLARE_ATOM1_DERIVATIVES
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+            // Skip over tiles that have exclusions, since they were already processed.
+            SYNC_WARPS;
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+                SYNC_WARPS;
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                }
+                else
+                    skipTiles[get_local_id(0)] = end;
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
+                SYNC_WARPS;
+            }
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos);
+        }
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+            const unsigned int localAtomIndex = get_local_id(0);
+#ifdef USE_CUTOFF
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
+#endif
+            atomIndices[get_local_id(0)] = j;
+            if (j < PADDED_NUM_ATOMS) {
+                local_posq[localAtomIndex] = posq[j];
+                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                local_force[localAtomIndex] = 0;
+                CLEAR_LOCAL_DERIVATIVES
+            }
+            SYNC_WARPS;
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                local_posq[get_local_id(0)].x -= floor((local_posq[get_local_id(0)].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                local_posq[get_local_id(0)].y -= floor((local_posq[get_local_id(0)].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                local_posq[get_local_id(0)].z -= floor((local_posq[get_local_id(0)].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                SYNC_WARPS;
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj;
+                    real4 posq2 = local_posq[atom2];
+                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    if (r2 < CUTOFF_SQUARED) {
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj];
+                        real dEdR = 0;
+                        real tempEnergy = 0;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_INTERACTION
+                            dEdR /= -r;
+                        }
+                        energy += tempEnergy;
+                        delta.xyz *= dEdR;
+                        force.xyz -= delta.xyz;
+                        atom2 = tbx+tj;
+                        local_force[atom2].xyz += delta.xyz;
+                        RECORD_DERIVATIVE_2
+                    }
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                    SYNC_WARPS;
+                }
+            }
+            else
+#endif
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj;
+                    real4 posq2 = local_posq[atom2];
+                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj];
+                        real dEdR = 0;
+                        real tempEnergy = 0;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_INTERACTION
+                            dEdR /= -r;
+                        }
+                        energy += tempEnergy;
+                        delta.xyz *= dEdR;
+                        force.xyz -= delta.xyz;
+                        atom2 = tbx+tj;
+                        local_force[atom2].xyz += delta.xyz;
+                        RECORD_DERIVATIVE_2
+#ifdef USE_CUTOFF
+                    }
+#endif
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                    SYNC_WARPS;
+                }
+            }
+            // Write results.
+#ifdef USE_CUTOFF
+            unsigned int atom2 = atomIndices[get_local_id(0)];
+#else
+            unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+#ifdef SUPPORTS_64_BIT_ATOMICS
+            atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
+            atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+            atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+            unsigned int offset = atom1;
+            STORE_DERIVATIVES_1
+            if (atom2 < PADDED_NUM_ATOMS) {
+                atom_add(&forceBuffers[atom2], (long) (local_force[get_local_id(0)].x*0x100000000));
+                atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000));
+                atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000));
+                offset = atom2;
+                STORE_DERIVATIVES_2
+            }
+#else
+            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
+            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
+            forceBuffers[offset1].xyz += force.xyz;
+            unsigned int offset = offset1;
+            STORE_DERIVATIVES_1
+            if (atom2 < PADDED_NUM_ATOMS) {
+                forceBuffers[offset2] += (real4) (local_force[get_local_id(0)].x, local_force[get_local_id(0)].y, local_force[get_local_id(0)].z, 0.0f);
+                offset = offset2;
+                STORE_DERIVATIVES_2
+            }
+#endif
+        }
+        pos++;
+    }
+    energyBuffer[get_global_id(0)] += energy;
+}
--- a/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
-#define TILE_SIZE 32
+#ifdef SUPPORTS_64_BIT_ATOMICS // Atomic path: derivatives are accumulated with atom_add into a single derivBuffers array.
-#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1;
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[tgx];
+#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000)); // Adds deriv<INDEX>_1, scaled by 0x100000000 and cast to long, at element offset+(INDEX-1)*PADDED_NUM_ATOMS; expects `offset` to be in scope where the macro is expanded.
+#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[tgx]*0x100000000)); // Same scaling and indexing, but the value comes from local_deriv<INDEX>[tgx]; expects `offset` and `tgx` to be in scope.
+#else // Non-atomic path: each derivative uses its own derivBuffers<INDEX> array, updated with a plain +=.
+#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1; // Adds deriv<INDEX>_1 to derivBuffers<INDEX>[offset].
+#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[tgx]; // Adds local_deriv<INDEX>[tgx] to derivBuffers<INDEX>[offset].
+#endif // SUPPORTS_64_BIT_ATOMICS
 /**
 * Compute a force based on pair interactions.
 */
+__kernel void computeN2Energy(
-__kernel void computeN2Energy(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __local real4* restrict local_force,
+#ifdef SUPPORTS_64_BIT_ATOMICS
-	__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
+        __global long* restrict forceBuffers,
-        __global const unsigned int* restrict exclusionRowIndices, __local real4* restrict tempBuffer,
-#ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
 #else
-        unsigned int numTiles
+        __global real4* restrict forceBuffers,
 #endif
-        PARAMETER_ARGUMENTS) {
+        __global real* restrict energyBuffer, __local real4* restrict local_force,
+	__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
+        __global const ushort2* exclusionTiles,
 #ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms
-    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
-    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
 #else
-    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
+        unsigned int numTiles
-    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
 #endif
+        PARAMETER_ARGUMENTS) {
    real energy = 0;
-    unsigned int lasty = 0xFFFFFFFF;
-    while (pos < end) {
+    // First loop: process tiles that contain exclusions.
-        // Extract the coordinates of this tile
-        unsigned int x, y;
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
-#ifdef USE_CUTOFF
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
-        if (numTiles <= maxTiles) {
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-            ushort2 tileIndices = tiles[pos];
+        const ushort2 tileIndices = exclusionTiles[pos];
-            x = tileIndices.x;
+        const unsigned int x = tileIndices.x;
-            y = tileIndices.y;
+        const unsigned int y = tileIndices.y;
-        }
-        else
-#endif
-        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
-        }
-        // Locate the exclusion data for this tile.
+        // Load the data for this tile.
-#ifdef USE_EXCLUSIONS
+        for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
-        unsigned int exclusionStart = exclusionRowIndices[x];
+            unsigned int j = y*TILE_SIZE + localAtomIndex;
-        unsigned int exclusionEnd = exclusionRowIndices[x+1];
+            local_posq[localAtomIndex] = posq[j];
-        int exclusionIndex = -1;
+            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-        for (int i = exclusionStart; i < exclusionEnd; i++)
-            if (exclusionIndices[i] == y) {
-                exclusionIndex = i*TILE_SIZE;
-                break;
-            }
-        bool hasExclusions = (exclusionIndex > -1);
-#else
-        bool hasExclusions = false;
-#endif
-        // Load the data for this tile if we don't already have it cached.
-        if (lasty != y) {
-            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
-                unsigned int j = y*TILE_SIZE + localAtomIndex;
-                local_posq[localAtomIndex] = posq[j];
-                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-            }
        }
        if (x == y) {
            // This tile is on the diagonal.
            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
 #ifdef USE_EXCLUSIONS
-                unsigned int excl = exclusions[exclusionIndex+tgx];
+                unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
 #endif
                unsigned int atom1 = x*TILE_SIZE+tgx;
                real4 force = 0;
@@ -84,9 +56,6 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
                real4 posq1 = posq[atom1];
                LOAD_ATOM1_PARAMETERS
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1);
-#endif
                    real4 posq2 = local_posq[j];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
@@ -96,20 +65,23 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
 #ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
 #endif
-                    real invR = RSQRT(r2);
+                        real invR = RSQRT(r2);
-                    real r = RECIP(invR);
+                        real r = RECIP(invR);
-                    unsigned int atom2 = j;
+                        unsigned int atom2 = j;
-                    LOAD_ATOM2_PARAMETERS
+                        LOAD_ATOM2_PARAMETERS
-                    atom2 = y*TILE_SIZE+j;
+                        atom2 = y*TILE_SIZE+j;
-                    real dEdR = 0;
+                        real dEdR = 0;
-                    real tempEnergy = 0;
+                        real tempEnergy = 0;
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+#ifdef USE_EXCLUSIONS
-                        COMPUTE_INTERACTION
+                        bool isExcluded = !(excl & 0x1);
-                        dEdR /= -r;
+#endif
-                    }
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
-                    energy += 0.5f*tempEnergy;
+                            COMPUTE_INTERACTION
-                    delta.xyz *= dEdR;
+                            dEdR /= -r;
-                    force.xyz -= delta.xyz;
+                        }
+                        energy += 0.5f*tempEnergy;
+                        delta.xyz *= dEdR;
+                        force.xyz -= delta.xyz;
 #ifdef USE_CUTOFF
                    }
 #endif
@@ -118,11 +90,19 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
 #endif
                }
-                // Write results
+                // Write results.
-                unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+#ifdef SUPPORTS_64_BIT_ATOMICS
-                forceBuffers[offset1].xyz += force.xyz;
+                unsigned int offset = atom1;
+                atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
+                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
                STORE_DERIVATIVES_1
+#else
+                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                forceBuffers[offset].xyz += force.xyz;
+                STORE_DERIVATIVES_1
+#endif
            }
        }
        else {
@@ -132,60 +112,212 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
                local_force[localAtomIndex] = 0;
                CLEAR_LOCAL_DERIVATIVES
            }
-#if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS)
+            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
-            unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
+#ifdef USE_EXCLUSIONS
-            unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF);
+                unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
-            if (!hasExclusions && (flags1 != 0xFFFFFFFF || flags2 != 0xFFFFFFFF)) {
+#endif
-                // Compute only a subset of the interactions in this tile.
+                unsigned int atom1 = x*TILE_SIZE+tgx;
+                real4 force = 0;
-                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                DECLARE_ATOM1_DERIVATIVES
-                    if ((flags2&(1<<tgx)) != 0) {
+                real4 posq1 = posq[atom1];
-                        unsigned int atom1 = x*TILE_SIZE+tgx;
+                LOAD_ATOM1_PARAMETERS
-                        real value = 0;
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        DECLARE_ATOM1_DERIVATIVES
+                    real4 posq2 = local_posq[j];
-                        real4 posq1 = posq[atom1];
+                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-                        LOAD_ATOM1_PARAMETERS
-                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                            if ((flags&(1<<j)) != 0) {
-                                real4 posq2 = local_posq[j];
-                                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
-                                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
 #endif
-                                real r2 = dot(delta.xyz, delta.xyz);
+                    real r2 = dot(delta.xyz, delta.xyz);
-                                if (r2 < CUTOFF_SQUARED) {
+#ifdef USE_CUTOFF
-                                    real invR = RSQRT(r2);
+                    if (r2 < CUTOFF_SQUARED) {
-                                    real r = RECIP(invR);
+#endif
-                                    unsigned int atom2 = j;
+                        real invR = RSQRT(r2);
-                                    LOAD_ATOM2_PARAMETERS
+                        real r = RECIP(invR);
-                                    atom2 = y*TILE_SIZE+j;
+                        unsigned int atom2 = j;
-                                    real dEdR = 0;
+                        LOAD_ATOM2_PARAMETERS
-                                    real tempEnergy = 0;
+                        atom2 = y*TILE_SIZE+j;
-                                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                        real dEdR = 0;
-                                        COMPUTE_INTERACTION
+                        real tempEnergy = 0;
-                                        dEdR /= -r;
+#ifdef USE_EXCLUSIONS
-                                    }
+                        bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
-                                    energy += tempEnergy;
+                        if (!isExcluded) {
-                                    delta.xyz *= dEdR;
+#else
-                                    force.xyz -= delta.xyz;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                                    atom2 = j;
+#endif
-                                    local_force[atom2].xyz += delta.xyz;
+                            COMPUTE_INTERACTION
-                                    RECORD_DERIVATIVE_2
+                            dEdR /= -r;
-                                }
-                            }
                        }
+                        energy += tempEnergy;
+                        delta.xyz *= dEdR;
+                        force.xyz -= delta.xyz;
+                        atom2 = j;
+                        local_force[atom2].xyz += delta.xyz;
+                        RECORD_DERIVATIVE_2
+#ifdef USE_CUTOFF
+                    }
+#endif
+#ifdef USE_EXCLUSIONS
+                    excl >>= 1;
+#endif
+                }
+                // Write results for atom1.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                unsigned int offset = atom1;
+                atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
+                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+                STORE_DERIVATIVES_1
+#else
+                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                forceBuffers[offset].xyz += force.xyz;
+                STORE_DERIVATIVES_1
+#endif
+            }
+            // Write results.
+            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                unsigned int offset = y*TILE_SIZE+tgx;
+                atom_add(&forceBuffers[offset], (long) (local_force[tgx].x*0x100000000));
+                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[tgx].y*0x100000000));
+                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[tgx].z*0x100000000));
+                STORE_DERIVATIVES_2
+#else
+                unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                forceBuffers[offset].xyz += local_force[tgx].xyz;
+                STORE_DERIVATIVES_2
+#endif
+            }
+        }
+    }
-                        // Write results for atom1.
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
-                        unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+#ifdef USE_CUTOFF
-                        global_value[offset] += value;
+    const unsigned int numTiles = interactionCount[0];
+    int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
+    int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
+#else
+    int pos = get_group_id(0)*numTiles/get_num_groups(0);
+    int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
+#endif
+    int nextToSkip = -1;
+    int currentSkipIndex = 0;
+    __local int atomIndices[TILE_SIZE];
+    while (pos < end) {
+        const bool isExcluded = false;
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+            // Skip over tiles that have exclusions, since they were already processed.
+            while (nextToSkip < pos) {
+                if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[currentSkipIndex++];
+                    nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                }
+                else
+                    nextToSkip = end;
+            }
+            includeTile = (nextToSkip != pos);
+        }
+        if (includeTile) {
+            // Load the data for this tile.
+            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
+#ifdef USE_CUTOFF
+                unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+localAtomIndex] : y*TILE_SIZE+localAtomIndex);
+#else
+                unsigned int j = y*TILE_SIZE+localAtomIndex;
+#endif
+                atomIndices[localAtomIndex] = j;
+                if (j < PADDED_NUM_ATOMS) {
+                    local_posq[localAtomIndex] = posq[j];
+                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                    local_force[localAtomIndex] = 0;
+                    CLEAR_LOCAL_DERIVATIVES
+                }
+            }
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++)
+                    local_posq[tgx].xyz -= floor((local_posq[tgx].xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                    unsigned int atom1 = x*TILE_SIZE+tgx;
+                    real4 force = 0;
+                    DECLARE_ATOM1_DERIVATIVES
+                    real4 posq1 = posq[atom1];
+                    LOAD_ATOM1_PARAMETERS
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                        real4 posq2 = local_posq[j];
+                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real r2 = dot(delta.xyz, delta.xyz);
+                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+                            real invR = RSQRT(r2);
+                            real r = RECIP(invR);
+                            unsigned int atom2 = j;
+                            LOAD_ATOM2_PARAMETERS
+                            atom2 = atomIndices[j];
+                            real dEdR = 0;
+                            real tempEnergy = 0;
+                            COMPUTE_INTERACTION
+                            dEdR /= -r;
+                            energy += tempEnergy;
+                            delta.xyz *= dEdR;
+                            force.xyz -= delta.xyz;
+                            atom2 = j;
+                            local_force[atom2].xyz += delta.xyz;
+                            RECORD_DERIVATIVE_2
+                        }
                    }
+                    // Write results for atom1.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                    unsigned int offset = atom1;
+                    atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
+                    atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+                    atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+                    STORE_DERIVATIVES_1
+#else
+                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    forceBuffers[offset].xyz += force.xyz;
+                    STORE_DERIVATIVES_1
+#endif
                }
            }
            else
 #endif
            {
-                // Compute the full set of interactions in this tile.
+                // We need to apply periodic boundary conditions separately for each interaction.
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
@@ -193,13 +325,7 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
                    DECLARE_ATOM1_DERIVATIVES
                    real4 posq1 = posq[atom1];
                    LOAD_ATOM1_PARAMETERS
-#ifdef USE_EXCLUSIONS
-                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
-#endif
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                        bool isExcluded = !(excl & 0x1);
-#endif
                        real4 posq2 = local_posq[j];
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
@@ -207,50 +333,67 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
 #endif
                        real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
-                        if (r2 < CUTOFF_SQUARED) {
+                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+#else
+                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS) {
 #endif
-                        real invR = RSQRT(r2);
+                            real invR = RSQRT(r2);
-                        real r = RECIP(invR);
+                            real r = RECIP(invR);
-                        unsigned int atom2 = j;
+                            unsigned int atom2 = j;
-                        LOAD_ATOM2_PARAMETERS
+                            LOAD_ATOM2_PARAMETERS
-                        atom2 = y*TILE_SIZE+j;
+                            atom2 = atomIndices[j];
-                        real dEdR = 0;
+                            real dEdR = 0;
-                        real tempEnergy = 0;
+                            real tempEnergy = 0;
-                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
+                            energy += tempEnergy;
+                            delta.xyz *= dEdR;
+                            force.xyz -= delta.xyz;
+                            atom2 = j;
+                            local_force[atom2].xyz += delta.xyz;
+                            RECORD_DERIVATIVE_2
                        }
-                        energy += tempEnergy;
-                        delta.xyz *= dEdR;
-                        force.xyz -= delta.xyz;
-                        atom2 = j;
-                        local_force[atom2].xyz += delta.xyz;
-                        RECORD_DERIVATIVE_2
-#ifdef USE_CUTOFF
-                        }
-#endif
-#ifdef USE_EXCLUSIONS
-                        excl >>= 1;
-#endif
                    }
                    // Write results for atom1.
-                    unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+#ifdef SUPPORTS_64_BIT_ATOMICS
-                    forceBuffers[offset1].xyz += force.xyz;
+                    unsigned int offset = atom1;
+                    atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
+                    atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+                    atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+                    STORE_DERIVATIVES_1
+#else
+                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    forceBuffers[offset].xyz += force.xyz;
                    STORE_DERIVATIVES_1
+#endif
                }
            }
-            // Write results
+            // Write results.
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
-                unsigned int offset2 = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+#ifdef USE_CUTOFF
-                forceBuffers[offset2].xyz += local_force[tgx].xyz;
+                unsigned int atom2 = atomIndices[tgx];
-                STORE_DERIVATIVES_2
+#else
+                unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+                if (atom2 < PADDED_NUM_ATOMS) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                    atom_add(&forceBuffers[atom2], (long) (local_force[tgx].x*0x100000000));
+                    atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (local_force[tgx].y*0x100000000));
+                    atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (local_force[tgx].z*0x100000000));
+                    unsigned int offset = atom2;
+                    STORE_DERIVATIVES_2
+#else
+                    unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    forceBuffers[offset].xyz += local_force[tgx].xyz;
+                    STORE_DERIVATIVES_2
+#endif
+                }
            }
        }
-        lasty = y;
        pos++;
    }
    energyBuffer[get_global_id(0)] += energy;

--- a/platforms/opencl/src/kernels/customGBEnergyN2_default.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_default.cl
-#define TILE_SIZE 32
-#ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset1+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000));
-#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset2+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0x100000000));
-#else
-#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1+tempDerivBuffer##INDEX[get_local_id(0)+TILE_SIZE];
-#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[get_local_id(0)]+local_deriv##INDEX[get_local_id(0)+TILE_SIZE];
-#endif
-/**
- * Compute a force based on pair interactions.
- *
- * Each work-group processes a contiguous range of tiles [pos, end) chosen from its
- * group id.  Within a tile, a thread holds one "atom1" (block x) in private variables
- * and loops over "atom2" entries (block y) staged in local_posq.  Threads whose local
- * id is >= TILE_SIZE handle the second half of the atom2 entries, and their partial
- * results are merged through tempForceBuffer before the write-back.
- *
- * Output: forces are accumulated into forceBuffers (as 64-bit integers scaled by
- * 0x100000000 when SUPPORTS_64_BIT_ATOMICS is defined, otherwise as real4 values at
- * a per-group or per-block offset), and the energy accumulated by each thread is
- * added to energyBuffer[get_global_id(0)].
- *
- * The upper-case identifiers used without a visible definition (PARAMETER_ARGUMENTS,
- * LOAD_ATOM1_PARAMETERS, COMPUTE_INTERACTION, STORE_DERIVATIVES_1, ...) are
- * presumably substituted by the host code before compilation -- TODO confirm.
- */
-__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
-void computeN2Energy(
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict forceBuffers,
-#else
-        __global real4* restrict forceBuffers,
-#endif
-        __global real* restrict energyBuffer, __local real4* restrict local_force,
-        __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
-        __global const unsigned int* restrict exclusionRowIndices, __local real4* restrict tempForceBuffer,
-#ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles
-#else
-        unsigned int numTiles
-#endif
-        PARAMETER_ARGUMENTS) {
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0]; // With a cutoff, the tile count is read from the first element of interactionCount.
-    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0); // First tile of this group; if numTiles > maxTiles, all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are divided up instead.
-    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0); // One past the last tile of this group.
-#else
-    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0); // First tile of this group.
-    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); // One past the last tile of this group.
-#endif
-    real energy = 0; // Per-thread energy sum, written out once after the tile loop.
-    unsigned int lasty = 0xFFFFFFFF; // y of the previously processed tile; the initial value marks "no tile processed yet".
-    __local unsigned int exclusionRange[2]; // Filled below from exclusionRowIndices[x] and exclusionRowIndices[x+1].
-    __local int exclusionIndex[1]; // Offset into exclusions for the current tile, or -1 if no entry was found.
-    DECLARE_TEMP_BUFFERS
-    while (pos < end) {
-        // Extract the coordinates of this tile
-        unsigned int x, y;
-#ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
-            ushort2 tileIndices = tiles[pos]; // The tile list is only used when it did not overflow maxTiles.
-            x = tileIndices.x;
-            y = tileIndices.y;
-        }
-        else
-#endif
-        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); // Recover (x, y) from the linear tile index pos.
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1); // Step y by one in the appropriate direction and recompute x.
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
-        }
-        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); // Threads with local id < TILE_SIZE start at atom2 entry 0, the others at TILE_SIZE/2.
-        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // Local id masked to the range [0, TILE_SIZE), used as the atom index within the tile.
-        unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); // Selects which TILE_SIZE-sized half of local_force this thread accumulates into.
-        unsigned int atom1 = x*TILE_SIZE + tgx;
-        real4 force = 0; // Force on atom1 accumulated by this thread.
-        DECLARE_ATOM1_DERIVATIVES
-        real4 posq1 = posq[atom1];
-        LOAD_ATOM1_PARAMETERS
-        // Locate the exclusion data for this tile.
-#ifdef USE_EXCLUSIONS
-        if (get_local_id(0) < 2)
-            exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)];
-        if (tgx == 0)
-            exclusionIndex[0] = -1;
-        barrier(CLK_LOCAL_MEM_FENCE);
-        for (int i = exclusionRange[0]+tgx; i < exclusionRange[1]; i += TILE_SIZE) // Threads stride through the entries of row x looking for block y.
-            if (exclusionIndices[i] == y)
-                exclusionIndex[0] = i*TILE_SIZE;
-        barrier(CLK_LOCAL_MEM_FENCE);
-        bool hasExclusions = (exclusionIndex[0] > -1); // True if an entry for (x, y) was found above.
-#endif
-        if (x == y) {
-            // This tile is on the diagonal.
-            const unsigned int localAtomIndex = get_local_id(0);
-            local_posq[localAtomIndex] = posq1;
-            LOAD_LOCAL_PARAMETERS_FROM_1
-            barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef USE_EXCLUSIONS
-            unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom; // Shift so that bit 0 corresponds to this thread's first atom2.
-#endif
-            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
-#ifdef USE_EXCLUSIONS
-                bool isExcluded = !(excl & 0x1); // A cleared bit means excluded; not read in the visible code, presumably consumed by COMPUTE_INTERACTION.
-#endif
-                int atom2 = baseLocalAtom+j; // Local index first (used to read local_posq); reassigned to the global index below.
-                real4 posq2 = local_posq[atom2];
-                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; // Subtract the nearest whole multiple of the box size along each axis.
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                if (r2 < CUTOFF_SQUARED) {
-#endif
-                real invR = RSQRT(r2);
-                real r = RECIP(invR);
-                LOAD_ATOM2_PARAMETERS
-                atom2 = y*TILE_SIZE+baseLocalAtom+j; // Global atom index, used for the bounds and self-interaction checks.
-                real dEdR = 0;
-                real tempEnergy = 0;
-                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { // Skip padding atoms and the self pair.
-                    COMPUTE_INTERACTION
-                    dEdR /= -r;
-                }
-                energy += 0.5f*tempEnergy; // Halved: in a diagonal tile each pair is visited once from each of its two atoms.
-                delta.xyz *= dEdR;
-                force.xyz -= delta.xyz; // Only atom1 is updated here; atom2 gets its share when it is the atom1 of another thread.
-#ifdef USE_CUTOFF
-                }
-#endif
-#ifdef USE_EXCLUSIONS
-                excl >>= 1; // Advance to the flag for the next atom2.
-#endif
-            }
-            // Sum the forces and write results.
-            if (get_local_id(0) >= TILE_SIZE) {
-                tempForceBuffer[get_local_id(0)] = force; // Threads with local id >= TILE_SIZE publish their partial force for the thread TILE_SIZE below them.
-                SET_TEMP_BUFFERS
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (get_local_id(0) < TILE_SIZE) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
-                const unsigned int offset1 = x*TILE_SIZE + tgx;
-                atom_add(&forceBuffers[offset1], (long) ((force.x + tempForceBuffer[get_local_id(0)+TILE_SIZE].x)*0x100000000)); // x, y and z components are stored PADDED_NUM_ATOMS entries apart, scaled by 0x100000000.
-                atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + tempForceBuffer[get_local_id(0)+TILE_SIZE].y)*0x100000000));
-                atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + tempForceBuffer[get_local_id(0)+TILE_SIZE].z)*0x100000000));
-#else
-#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-                const unsigned int offset1 = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; // Buffer slice selected by the block index x.
-#else
-                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; // Buffer slice selected by the work-group index.
-#endif
-                forceBuffers[offset1].xyz += force.xyz + tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
-#endif
-                STORE_DERIVATIVES_1
-            }
-        }
-        else {
-            // This is an off-diagonal tile.
-            const unsigned int localAtomIndex = get_local_id(0);
-            if (lasty != y && get_local_id(0) < TILE_SIZE) { // Reload the block-y data only when y differs from the previous tile's y.
-                unsigned int j = y*TILE_SIZE + tgx;
-                local_posq[localAtomIndex] = posq[j];
-                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-            }
-            local_force[localAtomIndex] = 0; // Reset the accumulators for the block-y atoms.
-            CLEAR_LOCAL_DERIVATIVES
-            barrier(CLK_LOCAL_MEM_FENCE);
-            // Compute the full set of interactions in this tile.
-#ifdef USE_EXCLUSIONS
-            unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF); // All bits set (nothing excluded) when the tile has no exclusion entry.
-            excl = (excl >> baseLocalAtom) & 0xFFFF; // Keep the 16 flags that belong to this thread's half of the atom2 entries.
-            excl += excl << 16; // Duplicate them into the upper 16 bits so the rotation below wraps within the half.
-            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); // Rotate right by tgx so bit 0 matches the first atom2 this thread visits.
-#endif
-            unsigned int tj = tgx%(TILE_SIZE/2); // Each thread starts at a different atom2 and wraps around within its half.
-            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
-#ifdef USE_EXCLUSIONS
-                bool isExcluded = !(excl & 0x1); // A cleared bit means excluded; not read in the visible code, presumably consumed by COMPUTE_INTERACTION.
-#endif
-                int atom2 = baseLocalAtom+tj; // Local index first (used to read local_posq); reassigned below.
-                real4 posq2 = local_posq[atom2];
-                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; // Subtract the nearest whole multiple of the box size along each axis.
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                if (r2 < CUTOFF_SQUARED) {
-#endif
-                real invR = RSQRT(r2);
-                real r = RECIP(invR);
-                LOAD_ATOM2_PARAMETERS
-                atom2 = y*TILE_SIZE+baseLocalAtom+tj; // Global atom index, used for the bounds check.
-                real dEdR = 0;
-                real tempEnergy = 0;
-                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { // Skip padding atoms.
-                    COMPUTE_INTERACTION
-                    dEdR /= -r;
-                }
-                energy += tempEnergy;
-                delta.xyz *= dEdR;
-                force.xyz -= delta.xyz;
-                atom2 = baseLocalAtom+tj+forceBufferOffset; // Index into the local accumulators; presumably what RECORD_DERIVATIVE_2 indexes with.
-                local_force[baseLocalAtom+tj+forceBufferOffset].xyz += delta.xyz; // Opposite contribution for atom2, accumulated in local memory.
-                RECORD_DERIVATIVE_2
-#ifdef USE_CUTOFF
-                }
-#endif
-                barrier(CLK_LOCAL_MEM_FENCE); // Synchronize the work-group after every step of the atom2 rotation.
-#ifdef USE_EXCLUSIONS
-                excl >>= 1; // Advance to the flag for the next atom2.
-#endif
-                tj = (tj+1)%(TILE_SIZE/2); // Move on to the next atom2, wrapping within this thread's half.
-            }
-            // Sum the forces and write results.
-            if (get_local_id(0) >= TILE_SIZE) {
-                tempForceBuffer[get_local_id(0)] = force; // Threads with local id >= TILE_SIZE publish their partial atom1 force for the thread TILE_SIZE below them.
-                SET_TEMP_BUFFERS
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (get_local_id(0) < TILE_SIZE) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
-                const unsigned int offset1 = x*TILE_SIZE + tgx; // Destination for the atom1 (block x) force.
-                const unsigned int offset2 = y*TILE_SIZE + tgx; // Destination for the atom2 (block y) force.
-                atom_add(&forceBuffers[offset1], (long) ((force.x+tempForceBuffer[get_local_id(0)+TILE_SIZE].x)*0x100000000));
-                atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y+tempForceBuffer[get_local_id(0)+TILE_SIZE].y)*0x100000000));
-                atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z+tempForceBuffer[get_local_id(0)+TILE_SIZE].z)*0x100000000));
-                atom_add(&forceBuffers[offset2], (long) ((local_force[get_local_id(0)].x+local_force[get_local_id(0)+TILE_SIZE].x)*0x100000000)); // Both halves of local_force are summed for the block-y atom.
-                atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((local_force[get_local_id(0)].y+local_force[get_local_id(0)+TILE_SIZE].y)*0x100000000));
-                atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((local_force[get_local_id(0)].z+local_force[get_local_id(0)+TILE_SIZE].z)*0x100000000));
-#else
-#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-                const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS; // Block-x atoms are written to the buffer slice selected by y ...
-                const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; // ... and block-y atoms to the slice selected by x.
-#else
-                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; // Buffer slice selected by the work-group index.
-                const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-#endif
-                forceBuffers[offset1].xyz += force.xyz+tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
-                forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz;
-#endif
-                STORE_DERIVATIVES_1
-                STORE_DERIVATIVES_2
-            }
-        }
-        lasty = y; // Remembered so the next off-diagonal tile can skip reloading block y.
-        pos++;
-    }
-    energyBuffer[get_global_id(0)] += energy; // One energy slot per work-item.
-}
--- a/platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000));
-#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0x100000000));
-#else
-#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
-#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)];
-#endif
-#define TILE_SIZE 32
-/**
- * Compute a force based on pair interactions.
- *
- * Work is divided by "warp": groups of TILE_SIZE consecutive work-items, identified
- * by get_global_id(0)/TILE_SIZE.  Each warp processes a contiguous range of tiles
- * [pos, end).  Within a tile, a thread holds one "atom1" (block x) in private
- * variables and loops over the TILE_SIZE "atom2" entries (block y) staged in its
- * warp's slice of local_posq.
- *
- * Output: with SUPPORTS_64_BIT_ATOMICS, forces are added to forceBuffers with
- * atom_add as 64-bit integers scaled by 0x100000000.  Otherwise they are added to a
- * per-work-group slice of a real4 buffer, and the warps of a work-group take turns
- * (via reservedBlocks and barriers) so that no two of them write the same block at
- * the same time.  The energy accumulated by each thread is added to
- * energyBuffer[get_global_id(0)].
- *
- * The upper-case identifiers used without a visible definition (PARAMETER_ARGUMENTS,
- * LOAD_ATOM1_PARAMETERS, COMPUTE_INTERACTION, STORE_DERIVATIVES_1, ...) are
- * presumably substituted by the host code before compilation -- TODO confirm.
- */
-__kernel void computeN2Energy(
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict forceBuffers,
-#else
-        __global real4* restrict forceBuffers,
-#endif
-        __global real* restrict energyBuffer, __local real4* restrict local_force,
-	__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
-        __global const unsigned int* restrict exclusionRowIndices, __local real4* restrict tempBuffer,
-#ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
-#else
-        unsigned int numTiles
-#endif
-        PARAMETER_ARGUMENTS) {
-    unsigned int totalWarps = get_global_size(0)/TILE_SIZE; // Number of TILE_SIZE-wide groups of work-items across the whole launch.
-    unsigned int warp = get_global_id(0)/TILE_SIZE; // Index of the group this work-item belongs to.
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0]; // With a cutoff, the tile count is read from the first element of interactionCount.
-    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps; // First tile of this warp; if numTiles > maxTiles, all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are divided up instead.
-    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps; // One past the last tile of this warp.
-#else
-    unsigned int pos = warp*numTiles/totalWarps; // First tile of this warp.
-    unsigned int end = (warp+1)*numTiles/totalWarps; // One past the last tile of this warp.
-#endif
-    real energy = 0; // Per-thread energy sum, written out once after the tile loop.
-    unsigned int lasty = 0xFFFFFFFF; // y of the previously processed tile; the initial value marks "no tile processed yet".
-    __local unsigned int exclusionRange[2*WARPS_PER_GROUP]; // Two entries per warp: exclusionRowIndices[x] and exclusionRowIndices[x+1].
-    __local int exclusionIndex[WARPS_PER_GROUP]; // Per warp: offset into exclusions for the current tile, or -1 if no entry was found.
-    __local int2* reservedBlocks = (__local int2*) exclusionRange; // Reuses the storage of exclusionRange as one int2 per warp for the write arbitration below.
-    do { // The body runs at least once even if pos >= end; the tile work itself is guarded by "if (pos < end)".
-        // Extract the coordinates of this tile
-        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // Local id masked to the range [0, TILE_SIZE), used as the atom index within the tile.
-        const unsigned int tbx = get_local_id(0) - tgx; // Local id of the first thread in this warp; base offset into the local arrays.
-        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE; // Index of this warp within the work-group.
-        unsigned int x, y; // Only assigned when pos < end.
-        real4 force = 0; // Force on atom1 accumulated by this thread.
-        DECLARE_ATOM1_DERIVATIVES
-        if (pos < end) {
-#ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
-                ushort2 tileIndices = tiles[pos]; // The tile list is only used when it did not overflow maxTiles.
-                x = tileIndices.x;
-                y = tileIndices.y;
-            }
-            else
-#endif
-            {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); // Recover (x, y) from the linear tile index pos.
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                    y += (x < y ? -1 : 1); // Step y by one in the appropriate direction and recompute x.
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                }
-            }
-            unsigned int atom1 = x*TILE_SIZE + tgx;
-            real4 posq1 = posq[atom1];
-            LOAD_ATOM1_PARAMETERS
-            // Locate the exclusion data for this tile.
-#ifdef USE_EXCLUSIONS
-            if (tgx < 2)
-                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-            if (tgx == 0)
-                exclusionIndex[localGroupIndex] = -1;
-            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE) // No barrier here: NOTE(review) this appears to rely on the threads of a warp executing in lockstep -- confirm.
-                if (exclusionIndices[i] == y)
-                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1); // True if an entry for (x, y) was found above.
-#else
-            bool hasExclusions = false;
-#endif
-            if (pos >= end)
-                ; // This warp is done.  NOTE(review): the enclosing "if (pos < end)" means this branch cannot be taken here.
-            else if (x == y) {
-                // This tile is on the diagonal.
-                const unsigned int localAtomIndex = get_local_id(0);
-                local_posq[localAtomIndex] = posq1;
-                LOAD_LOCAL_PARAMETERS_FROM_1
-#ifdef USE_EXCLUSIONS
-                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx]; // One flag bit per atom2, starting at bit 0.
-#endif
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1); // A cleared bit means excluded; not read in the visible code, presumably consumed by COMPUTE_INTERACTION.
-#endif
-                    int atom2 = tbx+j; // Local index first (used to read local_posq); reassigned to the global index below.
-                    real4 posq2 = local_posq[atom2];
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; // Subtract the nearest whole multiple of the box size along each axis.
-                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
-#endif
-                    real invR = RSQRT(r2);
-                    real r = RECIP(invR);
-                    LOAD_ATOM2_PARAMETERS
-                    atom2 = y*TILE_SIZE+j; // Global atom index, used for the bounds and self-interaction checks.
-                    real dEdR = 0;
-                    real tempEnergy = 0;
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { // Skip padding atoms and the self pair.
-                        COMPUTE_INTERACTION
-                        dEdR /= -r;
-                    }
-                    energy += 0.5f*tempEnergy; // Halved: in a diagonal tile each pair is visited once from each of its two atoms.
-                    delta.xyz *= dEdR;
-                    force.xyz -= delta.xyz; // Only atom1 is updated here; atom2 gets its share when it is the atom1 of another thread.
-#ifdef USE_CUTOFF
-                    }
-#endif
-#ifdef USE_EXCLUSIONS
-                    excl >>= 1; // Advance to the flag for the next atom2.
-#endif
-                }
-            }
-            else {
-                // This is an off-diagonal tile.
-                const unsigned int localAtomIndex = get_local_id(0);
-                if (lasty != y) { // Reload the block-y data only when y differs from the previous tile's y.
-                    unsigned int j = y*TILE_SIZE + tgx;
-                    local_posq[localAtomIndex] = posq[j];
-                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-                }
-                local_force[localAtomIndex] = 0; // Reset the accumulator for this thread's block-y atom.
-                CLEAR_LOCAL_DERIVATIVES
-#ifdef USE_CUTOFF
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); // All bits set when the tile list overflowed and no per-tile flags are available.
-                if (!hasExclusions && flags == 0) {
-                    // No interactions in this tile.
-                }
-                else
-#endif
-                {
-                    // Compute the full set of interactions in this tile.
-#ifdef USE_EXCLUSIONS
-                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF); // All bits set (nothing excluded) when the tile has no exclusion entry.
-                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); // Rotate right by tgx so bit 0 matches the first atom2 this thread visits.
-#endif
-                    unsigned int tj = tgx; // Each thread starts at a different atom2 and wraps around the tile.
-                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                        bool isExcluded = !(excl & 0x1); // A cleared bit means excluded; not read in the visible code, presumably consumed by COMPUTE_INTERACTION.
-#endif
-                        int atom2 = tbx+tj; // Local index first (used to read local_posq); reassigned below.
-                        real4 posq2 = local_posq[atom2];
-                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; // Subtract the nearest whole multiple of the box size along each axis.
-                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                        if (r2 < CUTOFF_SQUARED) {
-#endif
-                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
-                        LOAD_ATOM2_PARAMETERS
-                        atom2 = y*TILE_SIZE+tj; // Global atom index, used for the bounds check.
-                        real dEdR = 0;
-                        real tempEnergy = 0;
-                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { // Skip padding atoms.
-                            COMPUTE_INTERACTION
-                            dEdR /= -r;
-                        }
-                        energy += tempEnergy;
-                        delta.xyz *= dEdR;
-                        force.xyz -= delta.xyz;
-                        atom2 = tbx+tj; // Back to the local index for the local accumulators.
-                        local_force[atom2].xyz += delta.xyz; // Opposite contribution for atom2, accumulated in local memory.
-                        RECORD_DERIVATIVE_2
-#ifdef USE_CUTOFF
-                        }
-#endif
-#ifdef USE_EXCLUSIONS
-                        excl >>= 1; // Advance to the flag for the next atom2.
-#endif
-                        tj = (tj + 1) & (TILE_SIZE - 1); // Move on to the next atom2, wrapping within the tile.
-                    }
-                }
-            }
-        }
-        lasty = y; // NOTE(review): y is not assigned when pos >= end; the loop exits right after in that case.
-        // Write results.  We need to coordinate between warps to make sure no two of them
-        // ever try to write to the same piece of memory at the same time.
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        if (pos < end) {
-            const unsigned int offset = x*TILE_SIZE + tgx; // Destination for the atom1 (block x) force.
-            atom_add(&forceBuffers[offset], (long) (force.x*0x100000000)); // x, y and z components are stored PADDED_NUM_ATOMS entries apart, scaled by 0x100000000.
-            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
-            STORE_DERIVATIVES_1
-        }
-        if (pos < end && x != y) { // Diagonal tiles have no separate block-y contribution.
-            const unsigned int offset = y*TILE_SIZE + tgx; // Destination for the atom2 (block y) force.
-            atom_add(&forceBuffers[offset], (long) (local_force[get_local_id(0)].x*0x100000000));
-            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000));
-            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000));
-            STORE_DERIVATIVES_2
-        }
-#else
-        int writeX = (pos < end ? x : -1); // Block this warp needs to write for atom1, or -1 if it has nothing to write.
-        int writeY = (pos < end && x != y ? y : -1); // Block this warp needs to write for atom2, or -1 for none (idle warp or diagonal tile).
-        if (tgx == 0)
-            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY); // Announce the blocks this warp still has to write.
-        bool done = false; // Set once this warp has written its data.
-        int doneIndex = 0; // Warps below this index are known to have finished writing.
-        int checkIndex = 0; // Lower-indexed warps below this index no longer conflict with this warp.
-        while (true) {
-            // See if any warp still needs to write its data.
-            bool allDone = true;
-            barrier(CLK_LOCAL_MEM_FENCE); // Every thread of the work-group reaches this barrier on every pass of the loop.
-            while (doneIndex < WARPS_PER_GROUP && allDone) {
-                if (reservedBlocks[doneIndex].x != -1)
-                    allDone = false;
-                else
-                    doneIndex++;
-            }
-            if (allDone)
-                break;
-            if (!done) {
-                // See whether this warp can write its data.  This requires that no previous warp
-                // is trying to write to the same block of the buffer.
-                bool canWrite = (writeX != -1);
-                while (checkIndex < localGroupIndex && canWrite) { // Only lower-indexed warps are checked, so they take priority.
-                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
-                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
-                        canWrite = false;
-                    else
-                        checkIndex++;
-                }
-                if (canWrite) {
-                    // Write the data to global memory, then mark this warp as done.
-                    if (writeX > -1) {
-                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; // Buffer slice selected by the work-group index.
-                        forceBuffers[offset].xyz += force.xyz;
-                        STORE_DERIVATIVES_1
-                    }
-                    if (writeY > -1) {
-                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                        forceBuffers[offset].xyz += local_force[get_local_id(0)].xyz;
-                        STORE_DERIVATIVES_2
-                    }
-                    done = true;
-                    if (tgx == 0)
-                        reservedBlocks[localGroupIndex] = (int2)(-1, -1); // Release the reservation so later warps can proceed.
-                }
-            }
-        }
-#endif
-        pos++;
-    } while (pos < end);
-    energyBuffer[get_global_id(0)] += energy; // One energy slot per work-item.
-}
--- a/platforms/opencl/src/kernels/customGBValueN2.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2.cl
+#ifdef SUPPORTS_64_BIT_ATOMICS
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#endif
+/**
+ * Compute a value based on pair interactions.
+ */
+__kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
+        __global const ushort2* exclusionTiles,
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* restrict global_value,
+#else
+        __global real* restrict global_value,
+#endif
+        __local real* restrict local_value,
+#ifdef USE_CUTOFF
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms
+#else
+        unsigned int numTiles
+#endif
+        PARAMETER_ARGUMENTS) {
+    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
+    const unsigned int warp = get_global_id(0)/TILE_SIZE;
+    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
+    const unsigned int tbx = get_local_id(0) - tgx;
+    // First loop: process tiles that contain exclusions.
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
+        real value = 0;
+        unsigned int atom1 = x*TILE_SIZE + tgx;
+        real4 posq1 = posq[atom1];
+        LOAD_ATOM1_PARAMETERS
+#ifdef USE_EXCLUSIONS
+        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
+#endif
+        if (x == y) {
+            // This tile is on the diagonal.
+            const unsigned int localAtomIndex = get_local_id(0);
+            local_posq[localAtomIndex] = posq1;
+            LOAD_LOCAL_PARAMETERS_FROM_1
+            SYNC_WARPS;
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                int atom2 = tbx+j;
+                real4 posq2 = local_posq[atom2];
+                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                if (r2 < CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = RECIP(invR);
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+j;
+                    real tempValue1 = 0;
+                    real tempValue2 = 0;
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
+                    if (!isExcluded && atom1 != atom2) {
+#else
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+#endif
+                        COMPUTE_VALUE
+                    }
+                    value += tempValue1;
+#ifdef USE_CUTOFF
+                }
+#endif
+#ifdef USE_EXCLUSIONS
+                excl >>= 1;
+#endif
+                SYNC_WARPS;
+            }
+        }
+        else {
+            // This is an off-diagonal tile.
+            const unsigned int localAtomIndex = get_local_id(0);
+            unsigned int j = y*TILE_SIZE + tgx;
+            local_posq[localAtomIndex] = posq[j];
+            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+            local_value[localAtomIndex] = 0;
+            SYNC_WARPS;
+#ifdef USE_EXCLUSIONS
+            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+#endif
+            unsigned int tj = tgx;
+            for (j = 0; j < TILE_SIZE; j++) {
+                int atom2 = tbx+tj;
+                real4 posq2 = local_posq[atom2];
+                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                if (r2 < CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = RECIP(invR);
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+tj;
+                    real tempValue1 = 0;
+                    real tempValue2 = 0;
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
+                    if (!isExcluded) {
+#else
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+#endif
+                        COMPUTE_VALUE
+                    }
+                    value += tempValue1;
+                    local_value[tbx+tj] += tempValue2;
+#ifdef USE_CUTOFF
+                }
+#endif
+#ifdef USE_EXCLUSIONS
+                excl >>= 1;
+#endif
+                tj = (tj + 1) & (TILE_SIZE - 1);
+                SYNC_WARPS;
+            }
+        }
+        // Write results.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        unsigned int offset = x*TILE_SIZE + tgx;
+        atom_add(&global_value[offset], (long) (value*0x100000000));
+        if (x != y) {
+            offset = y*TILE_SIZE + tgx;
+            atom_add(&global_value[offset], (long) (local_value[get_local_id(0)]*0x100000000));
+        }
+#else
+        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
+        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
+        global_value[offset1] += value;
+        if (x != y)
+            global_value[offset2] += local_value[get_local_id(0)];
+#endif
+    }
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+    int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+#else
+    int pos = warp*numTiles/totalWarps;
+    int end = (warp+1)*numTiles/totalWarps;
+#endif
+    int skipBase = 0;
+    int currentSkipIndex = tbx;
+    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
+    __local int skipTiles[FORCE_WORK_GROUP_SIZE];
+    skipTiles[get_local_id(0)] = -1;
+    while (pos < end) {
+        real value = 0;
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+            // Skip over tiles that have exclusions, since they were already processed.
+            SYNC_WARPS;
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+                SYNC_WARPS;
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                }
+                else
+                    skipTiles[get_local_id(0)] = end;
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
+                SYNC_WARPS;
+            }
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos);
+        }
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+            const unsigned int localAtomIndex = get_local_id(0);
+#ifdef USE_CUTOFF
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
+#endif
+            atomIndices[get_local_id(0)] = j;
+            if (j < PADDED_NUM_ATOMS) {
+                local_posq[localAtomIndex] = posq[j];
+                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                local_value[localAtomIndex] = 0;
+            }
+            SYNC_WARPS;
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                local_posq[get_local_id(0)].x -= floor((local_posq[get_local_id(0)].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                local_posq[get_local_id(0)].y -= floor((local_posq[get_local_id(0)].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                local_posq[get_local_id(0)].z -= floor((local_posq[get_local_id(0)].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                SYNC_WARPS;
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj;
+                    real4 posq2 = local_posq[atom2];
+                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    if (r2 < CUTOFF_SQUARED) {
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj];
+                        real tempValue1 = 0;
+                        real tempValue2 = 0;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_VALUE
+                        }
+                        value += tempValue1;
+                        local_value[tbx+tj] += tempValue2;
+                    }
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                    SYNC_WARPS;
+                }
+            }
+            else
+#endif
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj;
+                    real4 posq2 = local_posq[atom2];
+                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj];
+                        real tempValue1 = 0;
+                        real tempValue2 = 0;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_VALUE
+                        }
+                        value += tempValue1;
+                        local_value[tbx+tj] += tempValue2;
+#ifdef USE_CUTOFF
+                    }
+#endif
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                    SYNC_WARPS;
+                }
+            }
+            // Write results.
+#ifdef USE_CUTOFF
+            unsigned int atom2 = atomIndices[get_local_id(0)];
+#else
+            unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+#ifdef SUPPORTS_64_BIT_ATOMICS
+            atom_add(&global_value[atom1], (long) (value*0x100000000));
+            if (atom2 < PADDED_NUM_ATOMS)
+                atom_add(&global_value[atom2], (long) (local_value[get_local_id(0)]*0x100000000));
+#else
+            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
+            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
+            global_value[offset1] += value;
+            if (atom2 < PADDED_NUM_ATOMS)
+                global_value[offset2] += local_value[get_local_id(0)];
+#endif
+        }
+        pos++;
+    }
+}
--- a/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
-#define TILE_SIZE 32
+#ifdef SUPPORTS_64_BIT_ATOMICS  // Fixed-point accumulation path: global_value is a __global long* updated via atom_add().
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#endif  // SUPPORTS_64_BIT_ATOMICS
 /**
 * Compute a value based on pair interactions.
 */
 __kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
-        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real* restrict global_value, __local real* restrict local_value,
+        __global const ushort2* exclusionTiles,
-        __local real* restrict tempBuffer,
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* restrict global_value,
+#else
+        __global real* restrict global_value,
+#endif
+        __local real* restrict local_value,
 #ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms
 #else
        unsigned int numTiles
 #endif
        PARAMETER_ARGUMENTS) {
+    // First loop: process tiles that contain exclusions.
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
+        // Load the data for this tile.
+        for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
+            unsigned int j = y*TILE_SIZE + localAtomIndex;
+            local_posq[localAtomIndex] = posq[j];
+            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+        }
+        if (x == y) {
+            // This tile is on the diagonal.
+            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+#ifdef USE_EXCLUSIONS
+                unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
+#endif
+                unsigned int atom1 = x*TILE_SIZE+tgx;
+                real value = 0;
+                real4 posq1 = posq[atom1];
+                LOAD_ATOM1_PARAMETERS
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                    real4 posq2 = local_posq[j];
+                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                    real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
+                    if (r2 < CUTOFF_SQUARED) {
-    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
+#endif
-    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        unsigned int atom2 = j;
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = y*TILE_SIZE+j;
+                        real tempValue1 = 0;
+                        real tempValue2 = 0;
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
+                        if (!isExcluded && atom1 != atom2) {
 #else
-    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
-    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
 #endif
-    unsigned int lasty = 0xFFFFFFFF;
+                            COMPUTE_VALUE
+                        }
-    while (pos < end) {
+                        value += tempValue1;
-        // Extract the coordinates of this tile
-        unsigned int x, y;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
+                    }
-            ushort2 tileIndices = tiles[pos];
-            x = tileIndices.x;
-            y = tileIndices.y;
-        }
-        else
 #endif
-        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
-        }
-        // Locate the exclusion data for this tile.
 #ifdef USE_EXCLUSIONS
-        unsigned int exclusionStart = exclusionRowIndices[x];
+                    excl >>= 1;
-        unsigned int exclusionEnd = exclusionRowIndices[x+1];
-        int exclusionIndex = -1;
-        for (int i = exclusionStart; i < exclusionEnd; i++)
-            if (exclusionIndices[i] == y) {
-                exclusionIndex = i*TILE_SIZE;
-                break;
-            }
-        bool hasExclusions = (exclusionIndex > -1);
-#else
-        bool hasExclusions = false;
 #endif
+                }
-        // Load the data for this tile if we don't already have it cached.
+                // Write results.
-        if (lasty != y) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
-            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
+                atom_add(&global_value[atom1], (long) (value*0x100000000));
-                unsigned int j = y*TILE_SIZE + localAtomIndex;
+#else
-                local_posq[localAtomIndex] = posq[j];
+                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                global_value[offset] += value;
+#endif
            }
        }
-        if (x == y) {
+        else {
-            // This tile is on the diagonal.
+            // This is an off-diagonal tile.
+            for (int tgx = 0; tgx < TILE_SIZE; tgx++)
+                local_value[tgx] = 0;
            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
 #ifdef USE_EXCLUSIONS
-                unsigned int excl = exclusions[exclusionIndex+tgx];
+                unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
 #endif
                unsigned int atom1 = x*TILE_SIZE+tgx;
                real value = 0;
                real4 posq1 = posq[atom1];
                LOAD_ATOM1_PARAMETERS
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1);
-#endif
                    real4 posq2 = local_posq[j];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
@@ -92,21 +114,23 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
 #ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
 #endif
-                    real invR = RSQRT(r2);
+                        real invR = RSQRT(r2);
-                    real r = RECIP(invR);
+                        real r = RECIP(invR);
-                    unsigned int atom2 = j;
+                        unsigned int atom2 = j;
-                    LOAD_ATOM2_PARAMETERS
+                        LOAD_ATOM2_PARAMETERS
-                    atom2 = y*TILE_SIZE+j;
+                        atom2 = y*TILE_SIZE+j;
-                    real tempValue1 = 0;
+                        real tempValue1 = 0;
-                    real tempValue2 = 0;
+                        real tempValue2 = 0;
 #ifdef USE_EXCLUSIONS
-                    if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+                        bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
+                        if (!isExcluded) {
 #else
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
 #endif
-                        COMPUTE_VALUE
+                            COMPUTE_VALUE
-                    }
+                        }
-                    value += tempValue1;
+                        value += tempValue1;
+                        local_value[j] += tempValue2;
 #ifdef USE_CUTOFF
                    }
 #endif
@@ -115,78 +139,148 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
 #endif
                }
-                // Write results
+                // Write results for atom1.
-                unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                atom_add(&global_value[atom1], (long) (value*0x100000000));
+#else
+                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                global_value[offset] += value;
+#endif
+            }
+            // Write results.
+            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                unsigned int offset = y*TILE_SIZE+tgx;
+                atom_add(&global_value[offset], (long) (local_value[tgx]*0x100000000));
+#else
+                unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                global_value[offset] += local_value[tgx];
+#endif
            }
        }
-        else {
+    }
-            // This is an off-diagonal tile.
-            for (int tgx = 0; tgx < TILE_SIZE; tgx++)
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
-                local_value[tgx] = 0;
+    // of them (no cutoff).
-#if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS)
-            unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
-            unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF);
-            if (!hasExclusions && (flags1 != 0xFFFFFFFF || flags2 != 0xFFFFFFFF)) {
-                // Compute only a subset of the interactions in this tile.
-                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+#ifdef USE_CUTOFF
-                    if ((flags2&(1<<tgx)) != 0) {
+    const unsigned int numTiles = interactionCount[0];
-                        unsigned int atom1 = x*TILE_SIZE+tgx;
+    int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
-                        real value = 0;
+    int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
-                        real4 posq1 = posq[atom1];
+#else
-                        LOAD_ATOM1_PARAMETERS
+    int pos = get_group_id(0)*numTiles/get_num_groups(0);
-                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
+    int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
-                            if ((flags&(1<<j)) != 0) {
+#endif
-                                real4 posq2 = local_posq[j];
+    int nextToSkip = -1;
-                                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+    int currentSkipIndex = 0;
+    __local int atomIndices[TILE_SIZE];
+    while (pos < end) {
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+            // Skip over tiles that have exclusions, since they were already processed.
+            while (nextToSkip < pos) {
+                if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[currentSkipIndex++];
+                    nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                }
+                else
+                    nextToSkip = end;
+            }
+            includeTile = (nextToSkip != pos);
+        }
+        if (includeTile) {
+            // Load the data for this tile.
+            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
+#ifdef USE_CUTOFF
+                unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+localAtomIndex] : y*TILE_SIZE+localAtomIndex);
+#else
+                unsigned int j = y*TILE_SIZE+localAtomIndex;
+#endif
+                atomIndices[localAtomIndex] = j;
+                if (j < PADDED_NUM_ATOMS) {
+                    local_posq[localAtomIndex] = posq[j];
+                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                    local_value[localAtomIndex] = 0;
+                }
+            }
 #ifdef USE_PERIODIC
-                                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+            if (singlePeriodicCopy) {
-#endif
+                // The box is small enough that we can just translate all the atoms into a single periodic
-                                real r2 = dot(delta.xyz, delta.xyz);
+                // box, then skip having to apply periodic boundary conditions later.
-                                real tempValue1 = 0;
-                                real tempValue2 = 0;
+                real4 blockCenterX = blockCenter[x];
-                                if (r2 < CUTOFF_SQUARED) {
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++)
-                                    real invR = RSQRT(r2);
+                    local_posq[tgx].xyz -= floor((local_posq[tgx].xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
-                                    real r = RECIP(invR);
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
-                                    unsigned int atom2 = j;
+                    unsigned int atom1 = x*TILE_SIZE+tgx;
-                                    LOAD_ATOM2_PARAMETERS
+                    real value = 0;
-                                    atom2 = y*TILE_SIZE+j;
+                    real4 posq1 = posq[atom1];
-                                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    LOAD_ATOM1_PARAMETERS
-                                        COMPUTE_VALUE
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                                    }
+                        real4 posq2 = local_posq[j];
-                                    value += tempValue1;
+                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-                                    local_value[j] += tempValue2;
+                        real r2 = dot(delta.xyz, delta.xyz);
-                                }
+                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
-                            }
+                            real invR = RSQRT(r2);
+                            real r = RECIP(invR);
+                            unsigned int atom2 = j;
+                            LOAD_ATOM2_PARAMETERS
+                            atom2 = atomIndices[j];
+                            real tempValue1 = 0;
+                            real tempValue2 = 0;
+                            COMPUTE_VALUE
+                            value += tempValue1;
+                            local_value[j] += tempValue2;
                        }
+                    }
-                        // Write results for atom1.
+                    // Write results for atom1.
-                        unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+#ifdef SUPPORTS_64_BIT_ATOMICS
-                        global_value[offset] += value;
+                    atom_add(&global_value[atom1], (long) (value*0x100000000));
-                    }
+#else
+                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    global_value[offset] += value;
+#endif
                }
            }
            else
 #endif
            {
-                // Compute the full set of interactions in this tile.
+                // We need to apply periodic boundary conditions separately for each interaction.
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real value = 0;
                    real4 posq1 = posq[atom1];
                    LOAD_ATOM1_PARAMETERS
-#ifdef USE_EXCLUSIONS
-                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
-#endif
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                        bool isExcluded = !(excl & 0x1);
-#endif
                        real4 posq2 = local_posq[j];
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
@@ -194,47 +288,52 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
 #endif
                        real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
-                        if (r2 < CUTOFF_SQUARED) {
+                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
-#endif
-                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
-                        unsigned int atom2 = j;
-                        LOAD_ATOM2_PARAMETERS
-                        atom2 = y*TILE_SIZE+j;
-                        real tempValue1 = 0;
-                        real tempValue2 = 0;
-#ifdef USE_EXCLUSIONS
-                        if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
 #else
-                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS) {
 #endif
+                            real invR = RSQRT(r2);
+                            real r = RECIP(invR);
+                            unsigned int atom2 = j;
+                            LOAD_ATOM2_PARAMETERS
+                            atom2 = atomIndices[j];
+                            real tempValue1 = 0;
+                            real tempValue2 = 0;
                            COMPUTE_VALUE
+                            value += tempValue1;
+                            local_value[j] += tempValue2;
                        }
-                        value += tempValue1;
-                        local_value[j] += tempValue2;
-#ifdef USE_CUTOFF
-                        }
-#endif
-#ifdef USE_EXCLUSIONS
-                        excl >>= 1;
-#endif
                    }
                    // Write results for atom1.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                    atom_add(&global_value[atom1], (long) (value*0x100000000));
+#else
                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                    global_value[offset] += value;
+#endif
                }
            }
-            // Write results
+            // Write results.
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
-                unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+#ifdef USE_CUTOFF
-                global_value[offset] += local_value[tgx];
+                unsigned int atom2 = atomIndices[tgx];
+#else
+                unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+                if (atom2 < PADDED_NUM_ATOMS) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                    atom_add(&global_value[atom2], (long) (local_value[tgx]*0x100000000));
+#else
+                    unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    global_value[offset] += local_value[tgx];
+#endif
+                }
            }
        }
-        lasty = y;
        pos++;
    }
 }
--- a/platforms/opencl/src/kernels/customGBValueN2_default.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_default.cl
-#ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#endif
-#define TILE_SIZE 32
-/**
- * Compute a value based on pair interactions.
- *
- * Each work-group walks a contiguous range [pos, end) of atom-block tiles.  For a tile (x, y),
- * every work-item takes one atom of block x (atom1) and loops over half of the atoms of
- * block y, which are staged in local_posq.  COMPUTE_VALUE (a macro supplied elsewhere) is
- * expected to fill tempValue1 (contribution to atom1) and tempValue2 (contribution to atom2).
- *
- * Results are accumulated into global_value, either as 64-bit fixed point via atom_add
- * (scaled by 0x100000000) when SUPPORTS_64_BIT_ATOMICS is defined, or by += into a
- * PADDED_NUM_ATOMS-strided slice of the buffer otherwise.
- *
- * NOTE(review): the indexing (baseLocalAtom, tempBuffer[id+TILE_SIZE]) looks like it assumes
- * WORK_GROUP_SIZE == 2*TILE_SIZE -- confirm against the host code.
- */
-__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
-void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
-        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict global_value,
-#else
-        __global real* restrict global_value,
-#endif
-        __local real* restrict local_value,
-        __local real* restrict tempBuffer,
-#ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles
-#else
-        unsigned int numTiles
-#endif
-        PARAMETER_ARGUMENTS) {
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
-    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
-    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
-#else
-    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
-    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
-#endif
-    real energy = 0; // NOTE(review): not referenced by the code visible here; possibly used by the injected macros -- confirm.
-    unsigned int lasty = 0xFFFFFFFF; // y of the previously processed tile; the initial value is presumably never a valid block index.
-    __local unsigned int exclusionRange[2];
-    __local int exclusionIndex[1];
-    while (pos < end) {
-        // Extract the coordinates of this tile: read them from the tile list when numTiles <= maxTiles, otherwise derive them from the linear index pos.
-        unsigned int x, y;
-#ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
-            ushort2 tileIndices = tiles[pos];
-            x = tileIndices.x;
-            y = tileIndices.y;
-        }
-        else
-#endif
-        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
-        }
-        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); // work-items with local id < TILE_SIZE start at atom 0 of block y, the others at TILE_SIZE/2.
-        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        unsigned int valueBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); // selects which TILE_SIZE-sized half of local_value this work-item accumulates into.
-        unsigned int atom1 = x*TILE_SIZE + tgx;
-        real value = 0;
-        real4 posq1 = posq[atom1];
-        LOAD_ATOM1_PARAMETERS
-        // Locate the exclusion data for this tile.  Work-items scan this row's entries of exclusionIndices for block y; exclusionIndex[0] stays -1 if none matches.
-#ifdef USE_EXCLUSIONS
-        if (get_local_id(0) < 2)
-            exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)];
-        if (tgx == 0)
-            exclusionIndex[0] = -1;
-        barrier(CLK_LOCAL_MEM_FENCE);
-        for (int i = exclusionRange[0]+tgx; i < exclusionRange[1]; i += TILE_SIZE)
-            if (exclusionIndices[i] == y)
-                exclusionIndex[0] = i*TILE_SIZE;
-        barrier(CLK_LOCAL_MEM_FENCE);
-        bool hasExclusions = (exclusionIndex[0] > -1);
-#endif
-        if (x == y) {
-            // This tile is on the diagonal.  Both atoms come from block x, so posq1 is staged in local memory and self-pairs (atom1 == atom2) are skipped below.
-            const unsigned int localAtomIndex = get_local_id(0);
-            local_posq[localAtomIndex] = posq1;
-            LOAD_LOCAL_PARAMETERS_FROM_1
-            barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef USE_EXCLUSIONS
-            unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
-#endif
-            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
-#ifdef USE_EXCLUSIONS
-                bool isExcluded = !(excl & 0x1);
-#endif
-                int atom2 = baseLocalAtom+j;
-                real4 posq2 = local_posq[atom2];
-                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                if (r2 < CUTOFF_SQUARED) {
-#endif
-                real invR = RSQRT(r2);
-                real r = RECIP(invR);
-                LOAD_ATOM2_PARAMETERS
-                atom2 = y*TILE_SIZE+baseLocalAtom+j;
-                real tempValue1 = 0;
-                real tempValue2 = 0;
-#ifdef USE_EXCLUSIONS
-                if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
-#else
-                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
-#endif
-                    COMPUTE_VALUE
-                }
-                value += tempValue1;
-#ifdef USE_CUTOFF
-                }
-#endif
-#ifdef USE_EXCLUSIONS
-                excl >>= 1;
-#endif
-            }
-            // Sum the values and write results.  Work-items with local id >= TILE_SIZE pass their partial sum through tempBuffer; the others add it to their own and write.
-            if (get_local_id(0) >= TILE_SIZE)
-                tempBuffer[get_local_id(0)] = value;
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (get_local_id(0) < TILE_SIZE) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
-                const unsigned int offset = x*TILE_SIZE + tgx;
-                atom_add(&global_value[offset], (long) ((value + tempBuffer[get_local_id(0)+TILE_SIZE])*0x100000000));
-#else
-#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-                const unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
-#else
-                const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-#endif
-                global_value[offset] += value + tempBuffer[get_local_id(0)+TILE_SIZE];
-#endif
-            }
-        }
-        else {
-            // This is an off-diagonal tile.  Block y is reloaded into local memory only when it differs from the previous tile's y.
-            if (lasty != y && get_local_id(0) < TILE_SIZE) {
-                unsigned int j = y*TILE_SIZE + tgx;
-                local_posq[get_local_id(0)] = posq[j];
-                const unsigned int localAtomIndex = get_local_id(0);
-                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-            }
-            local_value[get_local_id(0)] = 0;
-            barrier(CLK_LOCAL_MEM_FENCE);
-            // Compute the full set of interactions in this tile.
-#ifdef USE_EXCLUSIONS
-            unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
-            excl = (excl >> baseLocalAtom) & 0xFFFF;
-            excl += excl << 16;
-            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
-#endif
-            unsigned int tj = tgx%(TILE_SIZE/2); // staggered starting index; advanced modulo TILE_SIZE/2 at the end of each iteration.
-            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
-#ifdef USE_EXCLUSIONS
-                bool isExcluded = !(excl & 0x1);
-#endif
-                int atom2 = baseLocalAtom+tj;
-                real4 posq2 = local_posq[atom2];
-                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                if (r2 < CUTOFF_SQUARED) {
-#endif
-                real invR = RSQRT(r2);
-                real r = RECIP(invR);
-                LOAD_ATOM2_PARAMETERS
-                atom2 = y*TILE_SIZE+baseLocalAtom+tj;
-                real tempValue1 = 0;
-                real tempValue2 = 0;
-#ifdef USE_EXCLUSIONS
-                if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-#else
-                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-#endif
-                    COMPUTE_VALUE
-                }
-                value += tempValue1;
-                local_value[baseLocalAtom+tj+valueBufferOffset] += tempValue2;
-#ifdef USE_CUTOFF
-                }
-#endif
-                barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef USE_EXCLUSIONS
-                excl >>= 1;
-#endif
-                tj = (tj+1)%(TILE_SIZE/2);
-            }
-            // Sum the values and write results.  The two halves of local_value (and the tempBuffer partial sum) are combined by the work-items with local id < TILE_SIZE.
-            if (get_local_id(0) >= TILE_SIZE)
-                tempBuffer[get_local_id(0)] = value;
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (get_local_id(0) < TILE_SIZE) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
-                const unsigned int offset1 = x*TILE_SIZE + tgx;
-                const unsigned int offset2 = y*TILE_SIZE + tgx;
-                atom_add(&global_value[offset1], (long) ((value + tempBuffer[get_local_id(0)+TILE_SIZE])*0x100000000));
-                atom_add(&global_value[offset2], (long) ((local_value[get_local_id(0)] + local_value[get_local_id(0)+TILE_SIZE])*0x100000000));
-#else
-#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-                const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
-                const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
-#else
-                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-#endif
-                global_value[offset1] += value + tempBuffer[get_local_id(0)+TILE_SIZE];
-                global_value[offset2] += local_value[get_local_id(0)] + local_value[get_local_id(0)+TILE_SIZE];
-#endif
-            }
-        }
-        lasty = y;
-        pos++;
-    }
-}
--- a/platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#endif
-#define TILE_SIZE 32
-/**
- * Compute a value based on pair interactions.
- *
- * Work is divided per group of TILE_SIZE consecutive work-items (called a warp below): each
- * warp processes the tiles in its own range [pos, end).  For a tile (x, y), each work-item
- * takes one atom of block x (atom1) and loops over the atoms of block y staged in local_posq.
- * COMPUTE_VALUE (a macro supplied elsewhere) is expected to fill tempValue1 (contribution to
- * atom1) and tempValue2 (contribution to atom2).
- *
- * With SUPPORTS_64_BIT_ATOMICS the results are added to global_value as 64-bit fixed point
- * (scaled by 0x100000000) using atom_add.  Otherwise the warps of a work-group take turns
- * writing, using reservedBlocks to avoid two warps updating the same block at once.
- *
- * NOTE(review): there are no barriers inside the tile computation, so this presumably relies
- * on the TILE_SIZE work-items of a warp executing in lockstep -- confirm for the target device.
- */
-__kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
-        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict global_value,
-#else
-        __global real* restrict global_value,
-#endif
-        __local real* restrict local_value, __local real* restrict tempBuffer,
-#ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
-#else
-        unsigned int numTiles
-#endif
-        PARAMETER_ARGUMENTS) {
-    unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
-    unsigned int warp = get_global_id(0)/TILE_SIZE;
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
-    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-#else
-    unsigned int pos = warp*numTiles/totalWarps;
-    unsigned int end = (warp+1)*numTiles/totalWarps;
-#endif
-    real energy = 0; // NOTE(review): not referenced by the code visible here; possibly used by the injected macros -- confirm.
-    unsigned int lasty = 0xFFFFFFFF; // y of the previously processed tile; the initial value is presumably never a valid block index.
-    __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
-    __local int exclusionIndex[WARPS_PER_GROUP];
-    __local int2* reservedBlocks = (__local int2*) exclusionRange; // reuses the exclusionRange storage, viewed as one int2 per warp.
-    do {
-        // Extract the coordinates of this tile: read them from the tile list when numTiles <= maxTiles, otherwise derive them from the linear index pos.
-        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        const unsigned int tbx = get_local_id(0) - tgx; // local id of the first work-item in this warp.
-        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
-        unsigned int x, y;
-        real value = 0;
-        if (pos < end) {
-#ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
-                ushort2 tileIndices = tiles[pos];
-                x = tileIndices.x;
-                y = tileIndices.y;
-            }
-            else
-#endif
-            {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                }
-            }
-            unsigned int atom1 = x*TILE_SIZE + tgx;
-            real4 posq1 = posq[atom1];
-            LOAD_ATOM1_PARAMETERS
-            // Locate the exclusion data for this tile.  The warp scans this row's entries of exclusionIndices for block y; exclusionIndex[localGroupIndex] stays -1 if none matches.
-#ifdef USE_EXCLUSIONS
-            if (tgx < 2)
-                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-            if (tgx == 0)
-                exclusionIndex[localGroupIndex] = -1;
-            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-                if (exclusionIndices[i] == y)
-                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
-#else
-            bool hasExclusions = false;
-#endif
-            if (pos >= end)
-                ; // This warp is done.  (This branch cannot be taken here, since we are already inside `if (pos < end)`.)
-            else if (x == y) {
-                // This tile is on the diagonal.  Both atoms come from block x, so posq1 is staged in local memory and self-pairs (atom1 == atom2) are skipped below.
-                const unsigned int localAtomIndex = get_local_id(0);
-                local_posq[localAtomIndex] = posq1;
-                LOAD_LOCAL_PARAMETERS_FROM_1
-#ifdef USE_EXCLUSIONS
-                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
-#endif
-                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1);
-#endif
-                    int atom2 = tbx+j;
-                    real4 posq2 = local_posq[atom2];
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
-#endif
-                    real invR = RSQRT(r2);
-                    real r = RECIP(invR);
-                    LOAD_ATOM2_PARAMETERS
-                    atom2 = y*TILE_SIZE+j;
-                    real tempValue1 = 0;
-                    real tempValue2 = 0;
-#ifdef USE_EXCLUSIONS
-                    if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
-#else
-                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
-#endif
-                        COMPUTE_VALUE
-                    }
-                    value += tempValue1;
-#ifdef USE_CUTOFF
-                    }
-#endif
-#ifdef USE_EXCLUSIONS
-                    excl >>= 1;
-#endif
-                }
-            }
-            else {
-                // This is an off-diagonal tile.  Block y is reloaded into local memory only when it differs from the previous tile's y.
-                if (lasty != y) {
-                    unsigned int j = y*TILE_SIZE + tgx;
-                    local_posq[get_local_id(0)] = posq[j];
-                    const unsigned int localAtomIndex = get_local_id(0);
-                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-                }
-                local_value[get_local_id(0)] = 0;
-#ifdef USE_CUTOFF
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-                if (!hasExclusions && flags != 0xFFFFFFFF) {
-                    if (flags == 0) {
-                        // No interactions in this tile.
-                    }
-                    else {
-                        // Compute only a subset of the interactions in this tile: bit j of flags selects atom j of block y, and every work-item of the warp processes that same atom2.
-                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                            if ((flags&(1<<j)) != 0) {
-                                int atom2 = tbx+j;
-                                real4 posq2 = local_posq[atom2];
-                                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                                real tempValue1 = 0;
-                                real tempValue2 = 0;
-                                if (r2 < CUTOFF_SQUARED) {
-                                    real invR = RSQRT(r2);
-                                    real r = RECIP(invR);
-                                    LOAD_ATOM2_PARAMETERS
-                                    atom2 = y*TILE_SIZE+j;
-                                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                                        COMPUTE_VALUE
-                                    }
-                                    value += tempValue1;
-                                }
-                                tempBuffer[get_local_id(0)] = tempValue2;
-                                // Sum the values for atom2 across the warp in two steps through tempBuffer: groups of 4 first, then the 8 partial sums.
-                                if (tgx % 4 == 0)
-                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]+tempBuffer[get_local_id(0)+2]+tempBuffer[get_local_id(0)+3];
-                                if (tgx == 0)
-                                    local_value[tbx+j] += tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
-                            }
-                        }
-                    }
-                }
-                else
-#endif
-                {
-                    // Compute the full set of interactions in this tile.
-#ifdef USE_EXCLUSIONS
-                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
-                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
-#endif
-                    unsigned int tj = tgx; // staggered starting index; advanced modulo TILE_SIZE at the end of each iteration.
-                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                        bool isExcluded = !(excl & 0x1);
-#endif
-                        int atom2 = tbx+tj;
-                        real4 posq2 = local_posq[atom2];
-                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                        if (r2 < CUTOFF_SQUARED) {
-#endif
-                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
-                        LOAD_ATOM2_PARAMETERS
-                        atom2 = y*TILE_SIZE+tj;
-                        real tempValue1 = 0;
-                        real tempValue2 = 0;
-#ifdef USE_EXCLUSIONS
-                        if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-#else
-                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-#endif
-                            COMPUTE_VALUE
-                        }
-                        value += tempValue1;
-                        local_value[tbx+tj] += tempValue2;
-#ifdef USE_CUTOFF
-                        }
-#endif
-#ifdef USE_EXCLUSIONS
-                        excl >>= 1;
-#endif
-                        tj = (tj + 1) & (TILE_SIZE - 1);
-                    }
-                }
-            }
-        }
-        // Write results.  We need to coordinate between warps to make sure no two of them
-        // ever try to write to the same piece of memory at the same time.
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        if (pos < end) {
-            const unsigned int offset = x*TILE_SIZE + tgx;
-            atom_add(&global_value[offset], (long) (value*0x100000000));
-        }
-        if (pos < end && x != y) {
-            const unsigned int offset = y*TILE_SIZE + tgx;
-            atom_add(&global_value[offset], (long) (local_value[get_local_id(0)]*0x100000000));
-        }
-#else
-        int writeX = (pos < end ? x : -1);
-        int writeY = (pos < end && x != y ? y : -1);
-        if (tgx == 0)
-            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
-        bool done = false;
-        int doneIndex = 0;
-        int checkIndex = 0;
-        while (true) {
-            // See if any warp still needs to write its data.  An entry whose x is -1 marks a warp with nothing (left) to write.
-            bool allDone = true;
-            barrier(CLK_LOCAL_MEM_FENCE);
-            while (doneIndex < WARPS_PER_GROUP && allDone) {
-                if (reservedBlocks[doneIndex].x != -1)
-                    allDone = false;
-                else
-                    doneIndex++;
-            }
-            if (allDone)
-                break;
-            if (!done) {
-                // See whether this warp can write its data.  This requires that no previous warp
-                // is trying to write to the same block of the buffer.
-                bool canWrite = (writeX != -1);
-                while (checkIndex < localGroupIndex && canWrite) {
-                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
-                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
-                        canWrite = false;
-                    else
-                        checkIndex++;
-                }
-                if (canWrite) {
-                    // Write the data to global memory, then mark this warp as done.
-                    if (writeX > -1) {
-                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                        global_value[offset] += value;
-                    }
-                    if (writeY > -1) {
-                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                        global_value[offset] += local_value[get_local_id(0)];
-                    }
-                    done = true;
-                    if (tgx == 0)
-                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
-                }
-            }
-        }
-#endif
-        lasty = y;
-        pos++;
-    } while (pos < end);
-}
--- a/platforms/opencl/src/kernels/fft.cl
+++ b/platforms/opencl/src/kernels/fft.cl
@@ -11,14 +11,14 @@ __kernel void execFFT(__global const real2* restrict in, __global real2* restric
    for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0))
        w[i] = (real2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE));
    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int index = get_group_id(0); index < XSIZE*YSIZE; index += get_num_groups(0)) {
+    for (int index = get_group_id(0)*BLOCKS_PER_GROUP+get_local_id(0)/ZSIZE; index < XSIZE*YSIZE; index += get_num_groups(0)*BLOCKS_PER_GROUP) {
        int x = index/YSIZE;
        int y = index-x*YSIZE;
 #if LOOP_REQUIRED
        for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
            data0[z] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+z];
 #else
-        data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)];
+        data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE];
 #endif
        barrier(CLK_LOCAL_MEM_FENCE);
        COMPUTE_FFT

--- a/platforms/opencl/src/kernels/findInteractingBlocks.cl
+++ b/platforms/opencl/src/kernels/findInteractingBlocks.cl
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
-#define TILE_SIZE 32
-#define GROUP_SIZE 64
-#define BUFFER_GROUPS 4
 #define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
+#define WARP_SIZE 32
+#define INVALID 0xFFFF
 /**
 * Find a bounding box for the atoms in each block.
 */
-__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) {
+__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq,
+        __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict rebuildNeighborList,
+        __global real2* restrict sortedBlocks) {
    int index = get_global_id(0);
    int base = index*TILE_SIZE;
    while (base < numAtoms) {
        real4 pos = posq[base];
 #ifdef USE_PERIODIC
-        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        pos.xyz -= floor(pos.xyz*invPeriodicBoxSize.xyz)*periodicBoxSize.xyz;
-        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
-        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
        real4 firstPoint = pos;
 #endif
        real4 minPos = pos;
@@ -25,146 +24,229 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
 #ifdef USE_PERIODIC
-            pos.x -= floor((pos.x-firstPoint.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+            pos.xyz -= floor((pos.xyz-firstPoint.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
-            pos.y -= floor((pos.y-firstPoint.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-            pos.z -= floor((pos.z-firstPoint.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
            minPos = min(minPos, pos);
            maxPos = max(maxPos, pos);
        }
-        blockBoundingBox[index] = 0.5f*(maxPos-minPos);
+        real4 blockSize = 0.5f*(maxPos-minPos);
+        blockBoundingBox[index] = blockSize;
        blockCenter[index] = 0.5f*(maxPos+minPos);
+        sortedBlocks[index] = (real2) (blockSize.x+blockSize.y+blockSize.z, index);
        index += get_global_size(0);
        base = index*TILE_SIZE;
    }
    if (get_global_id(0) == 0)
-        interactionCount[0] = 0;
+        rebuildNeighborList[0] = 0;
 }
 /**
- * This is called by findBlocksWithInteractions().  It compacts the list of blocks and writes them
+ * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
- * to global memory.
 */
-void storeInteractionData(__local ushort2* buffer, __local int* valid, __local short* sum, __local ushort2* temp, __local int* baseIndex,
+__kernel void sortBoxData(__global const real2* restrict sortedBlock, __global const real4* restrict blockCenter,
-            __global unsigned int* interactionCount, __global ushort2* interactingTiles, real cutoffSquared, real4 periodicBoxSize,
+        __global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter,
-            real4 invPeriodicBoxSize, __global const real4* posq, __global const real4* blockCenter, __global const real4* blockBoundingBox, unsigned int maxTiles) {
+        __global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions,
-    // The buffer is full, so we need to compact it and write out results.  Start by doing a parallel prefix sum.
+        __global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList) {
+    for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
+        int index = (int) sortedBlock[i].y;
+        sortedBlockCenter[i] = blockCenter[index];
+        sortedBlockBoundingBox[i] = blockBoundingBox[index];
+    }
+    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
-    for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
+    bool rebuild = false;
-        temp[i].x = (valid[i] ? 1 : 0);
+    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
+        real4 delta = oldPositions[i]-posq[i];
+        if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
+            rebuild = true;
+    }
+    if (rebuild) {
+        rebuildNeighborList[0] = 1;
+        interactionCount[0] = 0;
+    }
+}
+/**
+ * Perform a parallel prefix sum over an array.  The input values are all assumed to be 0 or 1.
+ */
+void prefixSum(__local short* sum, __local ushort2* temp) {
+    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+        temp[i].x = sum[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    int whichBuffer = 0;
    for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
        if (whichBuffer == 0)
-            for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
+            for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
                temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
        else
-            for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
+            for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
                temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
        whichBuffer = 1-whichBuffer;
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (whichBuffer == 0)
-        for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
+        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
            sum[i] = temp[i].x;
    else
-        for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
+        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
            sum[i] = temp[i].y;
    barrier(CLK_LOCAL_MEM_FENCE);
-    int numValid = sum[BUFFER_SIZE-1];
+}
-    barrier(CLK_LOCAL_MEM_FENCE);
-    // Compact the buffer.
-    for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
+/**
-        if (valid[i]) {
+ * This is called by findBlocksWithInteractions().  It compacts the list of blocks, identifies interactions
-            temp[sum[i]-1] = buffer[i];
+ * in them, and writes the result to global memory.
-            sum[i] = valid[i];
+ */
-            valid[i] = false;
+void storeInteractionData(unsigned short x, __local unsigned short* buffer, __local short* sum, __local ushort2* temp, __local int* atoms, __local int* numAtoms,
-            buffer[i] = (ushort2) 1;
+            __local int* baseIndex, __global unsigned int* interactionCount, __global ushort2* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize,
+            real4 invPeriodicBoxSize, __global const real4* posq, __local real4* posBuffer, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
+    const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
+                                     0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
+                                     0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
+    if (get_local_id(0) < TILE_SIZE) {
+        real4 pos = posq[x*TILE_SIZE+get_local_id(0)];
+#ifdef USE_PERIODIC
+        if (singlePeriodicCopy) {
+            // The box is small enough that we can just translate all the atoms into a single periodic
+            // box, then skip having to apply periodic boundary conditions later.
+            pos.xyz -= floor((pos.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
        }
+#endif
+        posBuffer[get_local_id(0)] = pos;
+    }
+    // The buffer is full, so we need to compact it and write out results.  Start by doing a parallel prefix sum.
    barrier(CLK_LOCAL_MEM_FENCE);
+    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+        sum[i] = (buffer[i] == INVALID ? 0 : 1);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    prefixSum(sum, temp);
+    int numValid = sum[BUFFER_SIZE-1];
-#ifndef WARPS_ARE_ATOMIC
+    // Compact the buffer.
-    // Filter the list of tiles by comparing the distance from each atom to the other bounding box.
-    // We only do this if we aren't already optimizing the computation using flags.
-    int index = get_local_id(0)&(TILE_SIZE-1);
+    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
-    int group = get_local_id(0)/TILE_SIZE;
+        if (buffer[i] != INVALID)
-    real4 center, boxSize, pos;
+            temp[sum[i]-1].x = buffer[i];
-    for (int tile = 0; tile < numValid; tile++) {
+    barrier(CLK_LOCAL_MEM_FENCE);
-        int x = temp[tile].x;
+    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
-        int y = temp[tile].y;
+        buffer[i] = temp[i].x;
-        if (x == y)
+    barrier(CLK_LOCAL_MEM_FENCE);
-            continue;
-        // Load an atom position and the bounding box the other block.
+    // Loop over the tiles and find specific interactions in them.
-#ifdef MAC_AMD_WORKAROUND
+    const int indexInWarp = get_local_id(0)%WARP_SIZE;
-        int box = (group == 0 ? x : y);
+    for (int base = 0; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
-        int atom = (group == 0 ? y : x)*TILE_SIZE+index;
+        for (int i = get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
-        __global real* bc = (__global real*) blockCenter;
+            // Check each atom in block Y for interactions.
-        __global real* bb = (__global real*) blockBoundingBox;
-        __global real* ps = (__global real*) posq;
+            real4 pos = posq[buffer[base+i]*TILE_SIZE+indexInWarp];
-        center = (real4) (bc[4*box], bc[4*box+1], bc[4*box+2], 0);
+#ifdef USE_PERIODIC
-        boxSize = (real4) (bb[4*box], bb[4*box+1], bb[4*box+2], 0);
+            if (singlePeriodicCopy)
-        pos = (real4) (ps[4*atom], ps[4*atom+1], ps[4*atom+2], 0);
+                pos.xyz -= floor((pos.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
-#else
-        center = blockCenter[(group == 0 ? x : y)];
-        boxSize = blockBoundingBox[(group == 0 ? x : y)];
-        pos = posq[(group == 0 ? y : x)*TILE_SIZE+index];
 #endif
+            bool interacts = false;
-        // Find the distance of the atom from the bounding box.
-        real4 delta = pos-center;
 #ifdef USE_PERIODIC
-        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+            if (!singlePeriodicCopy) {
-        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                for (int j = 0; j < TILE_SIZE; j++) {
-        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                    real4 delta = pos-posBuffer[j];
+                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                    interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
+                }
+            }
+            else {
 #endif
-        delta = max((real4) 0, fabs(delta)-boxSize);
+                for (int j = 0; j < TILE_SIZE; j++) {
-        __local ushort* flag = (__local ushort*) &buffer[tile];
+                    real4 delta = pos-posBuffer[j];
-        if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared)
+                    interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
-            flag[group] = false;
+                }
+#ifdef USE_PERIODIC
+            }
+#endif
+            sum[i*WARP_SIZE+indexInWarp] = (interacts ? 1 : 0);
+        }
+        for (int i = numValid-base+get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
+            sum[i*WARP_SIZE+indexInWarp] = 0;
+        // Compact the list of atoms.
        barrier(CLK_LOCAL_MEM_FENCE);
-        if (flag[0] || flag[1]) {
+        prefixSum(sum, temp);
-            // This tile contains no interactions.
+        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+            if (sum[i] != (i == 0 ? 0 : sum[i-1]))
+                atoms[*numAtoms+sum[i]-1] = buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp;
+        // Store them to global memory.
-            numValid--;
+        int atomsToStore = *numAtoms+sum[BUFFER_SIZE-1];
+        bool storePartialTile = (finish && base >= numValid-BUFFER_SIZE/WARP_SIZE);
+        int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
+        if (tilesToStore > 0) {
+            if (get_local_id(0) == 0)
+                *baseIndex = atom_add(interactionCount, tilesToStore);
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if (get_local_id(0) == 0)
+                *numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
+            if (*baseIndex+tilesToStore <= maxTiles) {
+                if (get_local_id(0) < tilesToStore)
+                    interactingTiles[*baseIndex+get_local_id(0)] = (ushort2) (x, singlePeriodicCopy);
+                for (int i = get_local_id(0); i < tilesToStore*TILE_SIZE; i += get_local_size(0))
+                    interactingAtoms[*baseIndex*TILE_SIZE+i] = (i < atomsToStore ? atoms[i] : NUM_ATOMS);
+            }
+        }
+        else {
+            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) == 0)
-                temp[tile] = temp[numValid];
+                *numAtoms += sum[BUFFER_SIZE-1];
-            tile--;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
+        if (get_local_id(0) < *numAtoms && !storePartialTile)
+            atoms[get_local_id(0)] = atoms[tilesToStore*TILE_SIZE+get_local_id(0)];
    }
-#endif
-    // Store it to global memory.
+    if (numValid == 0 && *numAtoms > 0 && finish) {
+        // We didn't have any more tiles to process, but there were some atoms left over from a
+        // previous call to this function.  Save them now.
-    if (get_local_id(0) == 0)
+        if (get_local_id(0) == 0)
-        *baseIndex = atom_add(interactionCount, numValid);
+            *baseIndex = atom_add(interactionCount, 1);
-    barrier(CLK_LOCAL_MEM_FENCE);
+        barrier(CLK_LOCAL_MEM_FENCE);
-    if (*baseIndex+numValid <= maxTiles)
+        if (*baseIndex < maxTiles) {
-        for (int i = get_local_id(0); i < numValid; i += GROUP_SIZE)
+            if (get_local_id(0) == 0)
-            interactingTiles[*baseIndex+i] = temp[i];
+                interactingTiles[*baseIndex] = (ushort2) (x, singlePeriodicCopy);
-    barrier(CLK_LOCAL_MEM_FENCE);
+            if (get_local_id(0) < TILE_SIZE)
+                interactingAtoms[*baseIndex*TILE_SIZE+get_local_id(0)] = (get_local_id(0) < *numAtoms ? atoms[get_local_id(0)] : NUM_ATOMS);
+        }
+    }
+    // Reset the buffer for processing more tiles.
+    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+        buffer[i] = INVALID;
 }
 /**
 * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
 * mark them as non-interacting.
 */
-__kernel void findBlocksWithInteractions(real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
+__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
        __global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
-        __global unsigned int* restrict interactionFlags, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
+        __global unsigned int* restrict interactingAtoms, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex,
-        unsigned int endTileIndex) {
+        unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
-    __local ushort2 buffer[BUFFER_SIZE];
+        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
-    __local int valid[BUFFER_SIZE];
+        __global const int* restrict rebuildNeighborList) {
+    __local unsigned short buffer[BUFFER_SIZE];
    __local short sum[BUFFER_SIZE];
    __local ushort2 temp[BUFFER_SIZE];
+    __local int atoms[BUFFER_SIZE+TILE_SIZE];
+    __local real4 posBuffer[TILE_SIZE];
+    __local int exclusionsForX[MAX_EXCLUSIONS];
    __local int bufferFull;
    __local int globalIndex;
+    __local int numAtoms;
 #ifdef AMD_ATOMIC_WORK_AROUND
    // Do a byte write to force all memory accesses to interactionCount to use the complete path.
    // This keeps the atomic access from forcing all word accesses to other buffers onto the slow complete path.
@@ -173,142 +255,79 @@ __kernel void findBlocksWithInteractions(real cutoffSquared, real4 periodicBoxSi
    if (get_global_id(0) == get_local_id(0)+1)
        ((__global char*)interactionCount)[sizeof(unsigned int)+1] = 0;
 #endif
+    if (rebuildNeighborList[0] == 0)
+        return; // The neighbor list doesn't need to be rebuilt.
    int valuesInBuffer = 0;
    if (get_local_id(0) == 0)
        bufferFull = false;
    for (int i = 0; i < BUFFER_GROUPS; ++i)
-        valid[i*GROUP_SIZE+get_local_id(0)] = false;
+        buffer[i*GROUP_SIZE+get_local_id(0)] = INVALID;
    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int baseIndex = startTileIndex+get_group_id(0)*get_local_size(0); baseIndex < endTileIndex; baseIndex += get_global_size(0)) {
-        // Identify the pair of blocks to compare.
+    // Loop over blocks sorted by size.
-        int index = baseIndex+get_local_id(0);
+    for (int i = startBlockIndex+get_group_id(0); i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
-        if (index < endTileIndex) {
+        if (get_local_id(0) == get_local_size(0)-1)
-            unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
+            numAtoms = 0;
-            unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
+        real2 sortedKey = sortedBlocks[i];
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+        unsigned short x = (unsigned short) sortedKey.y;
-                y += (x < y ? -1 : 1);
+        real4 blockCenterX = blockCenter[x];
-                x = (index-y*NUM_BLOCKS+y*(y+1)/2);
+        real4 blockSizeX = blockBoundingBox[x];
-            }
-            // Find the distance between the bounding boxes of the two cells.
-#ifdef MAC_AMD_WORKAROUND
+        // Load exclusion data for block x.
-            __global real* bc = (__global real*) blockCenter;
-            __global real* bb = (__global real*) blockBoundingBox;
+        const int exclusionStart = exclusionRowIndices[x];
-            real4 bcx = (real4) (bc[4*x], bc[4*x+1], bc[4*x+2], 0);
+        const int exclusionEnd = exclusionRowIndices[x+1];
-            real4 bcy = (real4) (bc[4*y], bc[4*y+1], bc[4*y+2], 0);
+        const int numExclusions = exclusionEnd-exclusionStart;
-            real4 delta = bcx-bcy;
+        for (int j = get_local_id(0); j < numExclusions; j += get_local_size(0))
-            real4 boxSizea = (real4) (bb[4*x], bb[4*x+1], bb[4*x+2], 0);
+            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
-            real4 boxSizeb = (real4) (bb[4*y], bb[4*y+1], bb[4*y+2], 0);
+        barrier(CLK_LOCAL_MEM_FENCE);
-#else
-            real4 delta = blockCenter[x]-blockCenter[y];
+        // Compare it to other blocks after this one in sorted order.
-            real4 boxSizea = blockBoundingBox[x];
-            real4 boxSizeb = blockBoundingBox[y];
+        for (int base = i+1; base < NUM_BLOCKS; base += get_local_size(0)) {
-#endif
+            int j = base+get_local_id(0);
+            real2 sortedKey2 = (j < NUM_BLOCKS ? sortedBlocks[j] : (real2) 0);
+            real4 blockCenterY = (j < NUM_BLOCKS ? sortedBlockCenter[j] : (real4) 0);
+            real4 blockSizeY = (j < NUM_BLOCKS ? sortedBlockBoundingBox[j] : (real4) 0);
+            unsigned short y = (unsigned short) sortedKey2.y;
+            real4 delta = blockCenterX-blockCenterY;
 #ifdef USE_PERIODIC
            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-            delta.x = max((real) 0, fabs(delta.x)-boxSizea.x-boxSizeb.x);
+            delta.x = max((real) 0, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
-            delta.y = max((real) 0, fabs(delta.y)-boxSizea.y-boxSizeb.y);
+            delta.y = max((real) 0, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
-            delta.z = max((real) 0, fabs(delta.z)-boxSizea.z-boxSizeb.z);
+            delta.z = max((real) 0, fabs(delta.z)-blockSizeX.z-blockSizeY.z);
-            if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared) {
+            bool hasExclusions = false;
+            for (int k = 0; k < numExclusions; k++)
+                hasExclusions |= (exclusionsForX[k] == y);
+            if (j < NUM_BLOCKS && delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED && !hasExclusions) {
                // Add this tile to the buffer.
                int bufferIndex = valuesInBuffer*GROUP_SIZE+get_local_id(0);
-                valid[bufferIndex] = true;
+                buffer[bufferIndex] = y;
-                buffer[bufferIndex] = (ushort2) (x, y);
                valuesInBuffer++;
                if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
                    bufferFull = true;
            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if (bufferFull) {
-            storeInteractionData(buffer, valid, sum, temp, &globalIndex, interactionCount, interactingTiles, cutoffSquared, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles);
-            valuesInBuffer = 0;
-            if (get_local_id(0) == 0)
-                bufferFull = false;
            barrier(CLK_LOCAL_MEM_FENCE);
-        }
+            if (bufferFull) {
-    }
+                storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, false);
-    storeInteractionData(buffer, valid, sum, temp, &globalIndex, interactionCount, interactingTiles, cutoffSquared, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles);
+                valuesInBuffer = 0;
-}
+                if (get_local_id(0) == 0)
+                    bufferFull = false;
-/**
+                barrier(CLK_LOCAL_MEM_FENCE);
- * Compare each atom in one block to the bounding box of another block, and set
- * flags for which ones are interacting.
- */
-__kernel void findInteractionsWithinBlocks(real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq, __global const ushort2* restrict tiles, __global const real4* restrict blockCenter,
-            __global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionFlags, __global const unsigned int* restrict interactionCount, __local volatile unsigned int* restrict flags, unsigned int maxTiles) {
-    unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
-    unsigned int warp = get_global_id(0)/TILE_SIZE;
-    unsigned int numTiles = interactionCount[0];
-    unsigned int pos = warp*numTiles/totalWarps;
-    unsigned int end = (warp+1)*numTiles/totalWarps;
-    unsigned int index = get_local_id(0) & (TILE_SIZE - 1);
-    if (numTiles > maxTiles)
-        return;
-    unsigned int lasty = 0xFFFFFFFF;
-    real4 apos;
-    while (pos < end) {
-        // Extract the coordinates of this tile
-        ushort2 tileIndices = tiles[pos];
-        unsigned int x = tileIndices.x;
-        unsigned int y = tileIndices.y;
-        if (x == y) {
-            if (index == 0)
-                interactionFlags[pos] = 0xFFFFFFFF;
-        }
-        else {
-            // Load the bounding box for x and the atom positions for y.
-            real4 center = blockCenter[x];
-            real4 boxSize = blockBoundingBox[x];
-            if (y != lasty)
-                apos = posq[y*TILE_SIZE+index];
-            // Find the distance of the atom from the bounding box.
-            real4 delta = apos-center;
-#ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-            delta = max((real4) 0, fabs(delta)-boxSize);
-            int thread = get_local_id(0);
-            flags[thread] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > cutoffSquared ? 0 : 1 << index);
-            // Sum the flags.
-#ifdef WARPS_ARE_ATOMIC
-            if (index % 4 == 0)
-                flags[thread] += flags[thread+1]+flags[thread+2]+flags[thread+3];
-#else
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (index % 4 == 0)
-                flags[thread] += flags[thread+1]+flags[thread+2]+flags[thread+3];
-            barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-            if (index == 0) {
-                unsigned int allFlags = flags[thread]+flags[thread+4]+flags[thread+8]+flags[thread+12]+flags[thread+16]+flags[thread+20]+flags[thread+24]+flags[thread+28];
-                // Count how many flags are set, and based on that decide whether to compute all interactions
-                // or only a fraction of them.
-                unsigned int bits = (allFlags&0x55555555) + ((allFlags>>1)&0x55555555);
-                bits = (bits&0x33333333) + ((bits>>2)&0x33333333);
-                bits = (bits&0x0F0F0F0F) + ((bits>>4)&0x0F0F0F0F);
-                bits = (bits&0x00FF00FF) + ((bits>>8)&0x00FF00FF);
-                bits = (bits&0x0000FFFF) + ((bits>>16)&0x0000FFFF);
-                interactionFlags[pos] = (bits > 12 ? 0xFFFFFFFF : allFlags);
            }
-            lasty = y;
        }
-        pos++;
+        storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, true);
    }
+    // Record the positions the neighbor list is based on.
+    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0))
+        oldPositions[i] = posq[i];
 }
--- a/platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
+++ b/platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
-#define TILE_SIZE 32
-#define GROUP_SIZE 64
-#define BUFFER_GROUPS 4
 #define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
 /**
 * Find a bounding box for the atoms in each block.
+ *
+ * Each work-item starts at block get_global_id(0) and advances by get_global_size(0).  For every
+ * block it visits, the added (+) code stores half of (maxPos-minPos) in blockBoundingBox, half of
+ * (maxPos+minPos) in blockCenter, and the pair (blockSize.x+blockSize.y+blockSize.z, index) in
+ * sortedBlocks.  Work-item 0 finally writes 0 to rebuildNeighborList[0].
 */
-__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) {
+__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq,
+        __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict rebuildNeighborList,
+        __global real2* restrict sortedBlocks) {
    int index = get_global_id(0);
    int base = index*TILE_SIZE;
    while (base < numAtoms) {
        real4 pos = posq[base];
 #ifdef USE_PERIODIC
-        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        pos.xyz -= floor(pos.xyz*invPeriodicBoxSize.xyz)*periodicBoxSize.xyz;  // Wrap the block's first atom into the periodic box.
-        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
-        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
        real4 firstPoint = pos;
 #endif
        real4 minPos = pos;
@@ -25,143 +22,211 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
 #ifdef USE_PERIODIC
-            pos.x -= floor((pos.x-firstPoint.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+            pos.xyz -= floor((pos.xyz-firstPoint.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;  // Use the periodic image nearest to firstPoint.
-            pos.y -= floor((pos.y-firstPoint.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-            pos.z -= floor((pos.z-firstPoint.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
            minPos = min(minPos, pos);
            maxPos = max(maxPos, pos);
        }
-        blockBoundingBox[index] = 0.5f*(maxPos-minPos);
+        real4 blockSize = 0.5f*(maxPos-minPos);  // Half-extent of the box along each axis.
+        blockBoundingBox[index] = blockSize;
        blockCenter[index] = 0.5f*(maxPos+minPos);
+        sortedBlocks[index] = (real2) (blockSize.x+blockSize.y+blockSize.z, index);  // (sum of half-extents, block index).
        index += get_global_size(0);
        base = index*TILE_SIZE;
    }
    if (get_global_id(0) == 0)
+        rebuildNeighborList[0] = 0;
+}
+/**
+ * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
+ *
+ * Entry i of sortedBlockCenter and sortedBlockBoundingBox is copied from the block whose index is
+ * stored in sortedBlock[i].y.  The kernel then compares every atom's entry in posq with its entry
+ * in oldPositions; if any work-item finds a squared displacement above 0.25*PADDING*PADDING it
+ * sets rebuildNeighborList[0] to 1 and resets interactionCount[0] to 0.
+ */
+__kernel void sortBoxData(__global const real2* restrict sortedBlock, __global const real4* restrict blockCenter,
+        __global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter,
+        __global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions,
+        __global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList) {
+    for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
+        int index = (int) sortedBlock[i].y;  // The block index is carried in the .y component as a real.
+        sortedBlockCenter[i] = blockCenter[index];
+        sortedBlockBoundingBox[i] = blockBoundingBox[index];
+    }
+    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
+    bool rebuild = false;
+    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
+        real4 delta = oldPositions[i]-posq[i];
+        if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)  // i.e. moved farther than PADDING/2.
+            rebuild = true;
+    }
+    if (rebuild) {
+        rebuildNeighborList[0] = 1;
        interactionCount[0] = 0;
+    }
 }
 /**
 * This is called by findBlocksWithInteractions().  It compacts the list of blocks and writes them
 * to global memory.
+ *
+ * Added (+) version: the positions of the TILE_SIZE atoms of block x are loaded into posBuffer.
+ * buffer holds numValid candidate block indices; every atom of those blocks whose squared distance
+ * to some atom in posBuffer is below PADDED_CUTOFF_SQUARED is appended to atoms.  Whenever atoms
+ * holds BUFFER_SIZE entries it is written out as BUFFER_SIZE/TILE_SIZE tiles and *numAtoms is reset
+ * to 0.  When finish is true, any remaining atoms are written too, with the unused slots of the
+ * last tile set to NUM_ATOMS.  interactionCount is always incremented by the number of tiles, but
+ * the tiles themselves are only written if they fit within maxTiles.
 */
-void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
+void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms, int* numAtoms, int numValid, __global unsigned int* interactionCount,
-            __global unsigned int* interactionFlags, real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+            __global ushort2* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-            __global real4* posq, __global real4* blockCenter, __global real4* blockBoundingBox, unsigned int maxTiles) {
+            __global real4* posq, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
-    // Filter the list of tiles by comparing the distance from each atom to the other bounding box.
+    real4 posBuffer[TILE_SIZE];  // Positions of the atoms in block x.
+    const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
-    unsigned int flagsBuffer[2*BUFFER_SIZE];
+                                     0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
-    real4 atomPositions[TILE_SIZE];
+                                     0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
-    int lasty = -1;
+    for (int i = 0; i < TILE_SIZE; i++) {
-    real4 centery, boxSizey;
+        real4 pos = posq[x*TILE_SIZE+i];
-    for (int tile = 0; tile < numValid; ) {
+#ifdef USE_PERIODIC
-        int x = buffer[tile].x;
+        if (singlePeriodicCopy) {
-        int y = buffer[tile].y;
+            // The box is small enough that we can just translate all the atoms into a single periodic
-        if (x == y) {
+            // box, then skip having to apply periodic boundary conditions later.
-            tile++;
-            continue;
+            pos.xyz -= floor((pos.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
-        }
-        // Load the atom positions and bounding boxes.
-        real4 centerx = blockCenter[x];
-        real4 boxSizex = blockBoundingBox[x];
-        if (y != lasty) {
-            for (int atom = 0; atom < TILE_SIZE; atom++)
-                atomPositions[atom] = posq[y*TILE_SIZE+atom];
-            centery = blockCenter[y];
-            boxSizey = blockBoundingBox[y];
-            lasty = y;
        }
+#endif
+        posBuffer[i] = pos;
+    }
-        // Find the distance of each atom from the bounding box.
+    // Loop over the tiles and find specific interactions in them.
-        unsigned int flags1 = 0, flags2 = 0;
+    for (int tile = 0; tile < numValid; tile++) {
-        for (int atom = 0; atom < TILE_SIZE; atom++) {
+        for (int indexInTile = 0; indexInTile < TILE_SIZE; indexInTile++) {
-            real4 delta = atomPositions[atom]-centerx;
+            // Check each atom in block Y for interactions.
+            int atom = buffer[tile]*TILE_SIZE+indexInTile;
+            real4 pos = posq[atom];
 #ifdef USE_PERIODIC
-            delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+            if (singlePeriodicCopy)
+                pos.xyz -= floor((pos.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
 #endif
-            delta = max((real4) 0, fabs(delta)-boxSizex);
+            bool interacts = false;
-            if (dot(delta.xyz, delta.xyz) < cutoffSquared)
-                flags1 += 1 << atom;
-            delta = posq[x*TILE_SIZE+atom]-centery;
 #ifdef USE_PERIODIC
-            delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+            if (!singlePeriodicCopy) {
+                for (int j = 0; j < TILE_SIZE && !interacts; j++) {
+                    real4 delta = pos-posBuffer[j];
+                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                    interacts = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
+                }
+            }
+            else {
 #endif
-            delta = max((real4) 0, fabs(delta)-boxSizey);
+                for (int j = 0; j < TILE_SIZE && !interacts; j++) {
-            if (dot(delta.xyz, delta.xyz) < cutoffSquared)
+                    real4 delta = pos-posBuffer[j];
-                flags2 += 1 << atom;
+                    interacts = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
-        }
+                }
-        if (flags1 == 0 || flags2 == 0) {
+#ifdef USE_PERIODIC
-            // This tile contains no interactions.
+            }
+#endif
-            numValid--;
+            if (interacts)
-            buffer[tile] = buffer[numValid];
+                atoms[(*numAtoms)++] = atom;
-        }
+            if (*numAtoms == BUFFER_SIZE) {
-        else {
+                // The atoms buffer is full, so store it to global memory.
-            flagsBuffer[2*tile] = flags1;
-            flagsBuffer[2*tile+1] = flags2;
+                int tilesToStore = BUFFER_SIZE/TILE_SIZE;
-            tile++;
+                int baseIndex = atom_add(interactionCount, tilesToStore);  // Previous count: first output slot for these tiles.
+                if (baseIndex+tilesToStore <= maxTiles) {  // Only write when the tiles fit; the count was already incremented.
+                    for (int i = 0; i < tilesToStore; i++) {
+                        interactingTiles[baseIndex+i] = (ushort2) (x, singlePeriodicCopy);  // .y stores the singlePeriodicCopy flag.
+                        for (int j = 0; j < TILE_SIZE; j++)
+                            interactingAtoms[(baseIndex+i)*TILE_SIZE+j] = atoms[i*TILE_SIZE+j];
+                    }
+                }
+                *numAtoms = 0;
+            }
        }
    }
-    // Store it to global memory.
+    if (*numAtoms > 0 && finish) {
+        // There are some leftover atoms, so save them now.
-    int baseIndex = atom_add(interactionCount, numValid);
-    if (baseIndex+numValid <= maxTiles)
+        int tilesToStore = (*numAtoms+TILE_SIZE-1)/TILE_SIZE;  // Round up to a whole number of tiles.
-        for (int i = 0; i < numValid; i++) {
+        int baseIndex = atom_add(interactionCount, tilesToStore);
-            interactingTiles[baseIndex+i] = buffer[i];
+        if (baseIndex+tilesToStore <= maxTiles) {
-            interactionFlags[2*(baseIndex+i)] = flagsBuffer[2*i];
+            for (int i = 0; i < tilesToStore; i++) {
-            interactionFlags[2*(baseIndex+i)+1] = flagsBuffer[2*i+1];
+                interactingTiles[baseIndex+i] = (ushort2) (x, singlePeriodicCopy);
+                for (int j = 0; j < TILE_SIZE; j++) {
+                    int index = i*TILE_SIZE+j;
+                    interactingAtoms[(baseIndex+i)*TILE_SIZE+j] = (index < *numAtoms ? atoms[index] : NUM_ATOMS);  // Unused slots are filled with NUM_ATOMS.
+                }
+            }
        }
+    }
 }
 /**
 * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
 * mark them as non-interacting.
+ *
+ * Added (+) version: returns immediately when rebuildNeighborList[0] is 0.  Otherwise the outer
+ * loop starts at entry startBlockIndex+get_group_id(0) of sortedBlocks and advances by
+ * get_num_groups(0) until startBlockIndex+numBlocks.  Each block x taken from that list is
+ * compared with every later entry of sortedBlocks, skipping blocks that appear in the exclusion
+ * list for x.  Blocks whose bounding boxes are closer than the padded cutoff are collected in
+ * buffer and passed to storeInteractionData().  Finally posq is copied into oldPositions.
 */
-__kernel void findBlocksWithInteractions(real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
+__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
        __global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
-        __global unsigned int* restrict interactionFlags, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
+        __global unsigned int* restrict interactingAtoms, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex,
-        unsigned int endTileIndex) {
+        unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
-    ushort2 buffer[BUFFER_SIZE];
+        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
-    int valuesInBuffer = 0;
+        __global const int* restrict rebuildNeighborList) {
-    const int numTiles = endTileIndex-startTileIndex;
+    if (rebuildNeighborList[0] == 0)
-    unsigned int start = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
+        return; // The neighbor list doesn't need to be rebuilt.
-    unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
+    unsigned short buffer[BUFFER_SIZE];
-    for (int index = start; index < end; index++) {
+    int atoms[BUFFER_SIZE];
-        // Identify the pair of blocks to compare.
+    int exclusionsForX[MAX_EXCLUSIONS];
+    int valuesInBuffer;
-        unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
+    int numAtoms;
-        unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
-        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+    // Loop over blocks sorted by size.
-            y += (x < y ? -1 : 1);
-            x = (index-y*NUM_BLOCKS+y*(y+1)/2);
+    for (int i = startBlockIndex+get_group_id(0); i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
-        }
+        valuesInBuffer = 0;
+        numAtoms = 0;
-        // Find the distance between the bounding boxes of the two cells.
+        real2 sortedKey = sortedBlocks[i];
+        unsigned short x = (unsigned short) sortedKey.y;  // The block index is carried in the .y component.
+        real4 blockCenterX = blockCenter[x];
+        real4 blockSizeX = blockBoundingBox[x];
-        real4 delta = blockCenter[x]-blockCenter[y];
+        // Load exclusion data for block x.
+        const int exclusionStart = exclusionRowIndices[x];
+        const int exclusionEnd = exclusionRowIndices[x+1];
+        const int numExclusions = exclusionEnd-exclusionStart;
+        for (int j = 0; j < numExclusions; j++)
+            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
+        // Compare it to other blocks after this one in sorted order.
+        for (int j = i+1; j < NUM_BLOCKS; j++) {
+            real2 sortedKey2 = sortedBlocks[j];
+            unsigned short y = (unsigned short) sortedKey2.y;
+            bool hasExclusions = false;
+            for (int k = 0; k < numExclusions; k++)
+                hasExclusions |= (exclusionsForX[k] == y);
+            if (hasExclusions)
+                continue;
+            real4 blockCenterY = sortedBlockCenter[j];
+            real4 blockSizeY = sortedBlockBoundingBox[j];
+            real4 delta = blockCenterX-blockCenterY;
 #ifdef USE_PERIODIC
-        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-        real4 boxSizea = blockBoundingBox[x];
+            delta.x = max((real) 0, fabs(delta.x)-blockSizeX.x-blockSizeY.x);  // Gap between the two boxes along x, clamped at 0.
-        real4 boxSizeb = blockBoundingBox[y];
+            delta.y = max((real) 0, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
-        delta.x = max((real) 0, fabs(delta.x)-boxSizea.x-boxSizeb.x);
+            delta.z = max((real) 0, fabs(delta.z)-blockSizeX.z-blockSizeY.z);
-        delta.y = max((real) 0, fabs(delta.y)-boxSizea.y-boxSizeb.y);
+            if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED) {
-        delta.z = max((real) 0, fabs(delta.z)-boxSizea.z-boxSizeb.z);
+                // Add this tile to the buffer.
-        if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared) {
-            // Add this tile to the buffer.
-            buffer[valuesInBuffer++] = (ushort2) (x, y);
+                buffer[valuesInBuffer++] = y;
-            if (valuesInBuffer == BUFFER_SIZE) {
+                if (valuesInBuffer == BUFFER_SIZE) {
-                storeInteractionData(buffer, valuesInBuffer, interactionCount, interactingTiles, interactionFlags, cutoffSquared, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles);
+                    storeInteractionData(x, buffer, atoms, &numAtoms, valuesInBuffer, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, blockCenterX, blockSizeX, maxTiles, false);  // finish = false: leftover atoms stay in atoms.
-                valuesInBuffer = 0;
+                    valuesInBuffer = 0;
+                }
            }
        }
+        storeInteractionData(x, buffer, atoms, &numAtoms, valuesInBuffer, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, blockCenterX, blockSizeX, maxTiles, true);  // finish = true: also write any leftover atoms for block x.
    }
-    storeInteractionData(buffer, valuesInBuffer, interactionCount, interactingTiles, interactionFlags, cutoffSquared, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles);
+    // Record the positions the neighbor list is based on.
+    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0))
+        oldPositions[i] = posq[i];
 }
--- a/platforms/opencl/src/kernels/gbsaObc.cl
+++ b/platforms/opencl/src/kernels/gbsaObc.cl
+#ifdef SUPPORTS_64_BIT_ATOMICS
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#endif
+#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
+typedef struct {  // Per-work-item record that computeBornSum keeps in a __local array.
+    real x, y, z;  // Filled from the .x, .y, .z components of a posq entry.
+    real q;  // Filled from the .w component of a posq entry.
+    float radius, scaledRadius;  // Filled from the .x and .y components of a global_params entry.
+    real bornSum;  // Accumulates the Born sum terms computed for this atom by other work-items.
+} AtomData1;
+/**
+ * Compute the Born sum.
+ *
+ * Work-items are grouped into "warps" of TILE_SIZE (warp = get_global_id(0)/TILE_SIZE), and each
+ * warp processes one tile at a time.
+ *
+ * The first loop handles the tiles listed in exclusionTiles between FIRST_EXCLUSION_TILE and
+ * LAST_EXCLUSION_TILE, divided among the warps.  The second loop handles the remaining tiles.
+ * With USE_CUTOFF and interactionCount[0] <= maxTiles they are read from tiles and
+ * interactingAtoms; in that case tiles[pos].y is used as the singlePeriodicCopy flag.  Otherwise
+ * all NUM_BLOCKS*(NUM_BLOCKS+1)/2 block pairs are enumerated (numTiles of them without USE_CUTOFF)
+ * and the pairs that appear in exclusionTiles are skipped.
+ *
+ * Results are added to global_bornSum.  With SUPPORTS_64_BIT_ATOMICS the values are multiplied by
+ * 0x100000000, converted to long and added with atom_add; otherwise each warp adds into its own
+ * section of the buffer starting at warp*PADDED_NUM_ATOMS.
+ */
+__kernel void computeBornSum(
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* restrict global_bornSum,
+#else
+        __global real* restrict global_bornSum,
+#endif
+        __global const real4* restrict posq, __global const float2* restrict global_params,
+#ifdef USE_CUTOFF
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms,
+#else
+        unsigned int numTiles,
+#endif
+        __global const ushort2* exclusionTiles) {
+    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
+    const unsigned int warp = get_global_id(0)/TILE_SIZE;
+    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);  // Position within the warp (assumes TILE_SIZE is a power of two).
+    const unsigned int tbx = get_local_id(0) - tgx;  // Local id of the first work-item of this warp.
+    __local AtomData1 localData[FORCE_WORK_GROUP_SIZE];
+    // First loop: process tiles that contain exclusions.
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
+        real bornSum = 0.0f;
+        unsigned int atom1 = x*TILE_SIZE + tgx;
+        real4 posq1 = posq[atom1];
+        float2 params1 = global_params[atom1];
+        if (x == y) {
+            // This tile is on the diagonal.
+            localData[get_local_id(0)].x = posq1.x;
+            localData[get_local_id(0)].y = posq1.y;
+            localData[get_local_id(0)].z = posq1.z;
+            localData[get_local_id(0)].q = posq1.w;
+            localData[get_local_id(0)].radius = params1.x;
+            localData[get_local_id(0)].scaledRadius = params1.y;
+            SYNC_WARPS;
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                real4 delta = (real4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0);
+#ifdef USE_PERIODIC
+                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+#else
+                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = RECIP(invR);
+                    float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
+                    real rScaledRadiusJ = r+params2.y;
+                    if ((j != tgx) && (params1.x < rScaledRadiusJ)) {  // j != tgx skips the atom's interaction with itself.
+                        real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
+                        real u_ij = RECIP(rScaledRadiusJ);
+                        real l_ij2 = l_ij*l_ij;
+                        real u_ij2 = u_ij*u_ij;
+                        real ratio = LOG(u_ij * RECIP(l_ij));
+                        bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
+                                         (params2.y*params2.y*invR)*(l_ij2-u_ij2));
+                        bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
+                    }
+                }
+                SYNC_WARPS;
+            }
+        }
+        else {
+            // This is an off-diagonal tile.
+            unsigned int j = y*TILE_SIZE + tgx;
+            real4 tempPosq = posq[j];
+            localData[get_local_id(0)].x = tempPosq.x;
+            localData[get_local_id(0)].y = tempPosq.y;
+            localData[get_local_id(0)].z = tempPosq.z;
+            localData[get_local_id(0)].q = tempPosq.w;
+            float2 tempParams = global_params[j];
+            localData[get_local_id(0)].radius = tempParams.x;
+            localData[get_local_id(0)].scaledRadius = tempParams.y;
+            localData[get_local_id(0)].bornSum = 0.0f;
+            SYNC_WARPS;
+            // Compute the full set of interactions in this tile.
+            unsigned int tj = tgx;  // Each work-item starts at a different slot of the tile.
+            for (j = 0; j < TILE_SIZE; j++) {
+                real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
+#ifdef USE_PERIODIC
+                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+#else
+                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = RECIP(invR);
+                    float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
+                    real rScaledRadiusJ = r+params2.y;
+                    if (params1.x < rScaledRadiusJ) {
+                        real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
+                        real u_ij = RECIP(rScaledRadiusJ);
+                        real l_ij2 = l_ij*l_ij;
+                        real u_ij2 = u_ij*u_ij;
+                        real ratio = LOG(u_ij * RECIP(l_ij));
+                        bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
+                                         (params2.y*params2.y*invR)*(l_ij2-u_ij2));
+                        bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
+                    }
+                    real rScaledRadiusI = r+params1.y;
+                    if (params2.x < rScaledRadiusI) {  // Same formula with the two atoms' roles swapped.
+                        real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
+                        real u_ij = RECIP(rScaledRadiusI);
+                        real l_ij2 = l_ij*l_ij;
+                        real u_ij2 = u_ij*u_ij;
+                        real ratio = LOG(u_ij * RECIP(l_ij));
+                        real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
+                                         (params1.y*params1.y*invR)*(l_ij2-u_ij2));
+                        term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
+                        localData[tbx+tj].bornSum += term;
+                    }
+                }
+                tj = (tj + 1) & (TILE_SIZE - 1);  // Move to the next slot, wrapping around the tile.
+                SYNC_WARPS;
+            }
+        }
+        // Write results.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        unsigned int offset = x*TILE_SIZE + tgx;
+        atom_add(&global_bornSum[offset], (long) (bornSum*0x100000000));  // Scale by 2^32 and add as a 64-bit integer.
+        if (x != y) {
+            offset = y*TILE_SIZE + tgx;
+            atom_add(&global_bornSum[offset], (long) (localData[get_local_id(0)].bornSum*0x100000000));
+        }
+#else
+        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
+        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
+        global_bornSum[offset1] += bornSum;
+        if (x != y)
+            global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
+#endif
+    }
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;  // If the list overflowed maxTiles, fall back to all block pairs.
+    int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+#else
+    int pos = warp*numTiles/totalWarps;
+    int end = (warp+1)*numTiles/totalWarps;
+#endif
+    int skipBase = 0;
+    int currentSkipIndex = tbx;
+    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
+    __local int skipTiles[FORCE_WORK_GROUP_SIZE];
+    skipTiles[get_local_id(0)] = -1;
+    while (pos < end) {
+        real bornSum = 0;
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;  // .y is read as a flag here; y itself is not set on this path.
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+            // Skip over tiles that have exclusions, since they were already processed.
+            SYNC_WARPS;
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+                SYNC_WARPS;
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;  // Linear tile index: inverse of the pos -> (x, y) mapping above.
+                }
+                else
+                    skipTiles[get_local_id(0)] = end;
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
+                SYNC_WARPS;
+            }
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos);
+        }
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            real4 posq1 = posq[atom1];
+            float2 params1 = global_params[atom1];
+#ifdef USE_CUTOFF
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
+#endif
+            atomIndices[get_local_id(0)] = j;
+            if (j < PADDED_NUM_ATOMS) {  // Out-of-range indices are not loaded.
+                real4 tempPosq = posq[j];
+                localData[get_local_id(0)].x = tempPosq.x;
+                localData[get_local_id(0)].y = tempPosq.y;
+                localData[get_local_id(0)].z = tempPosq.z;
+                localData[get_local_id(0)].q = tempPosq.w;
+                float2 tempParams = global_params[j];
+                localData[get_local_id(0)].radius = tempParams.x;
+                localData[get_local_id(0)].scaledRadius = tempParams.y;
+                localData[get_local_id(0)].bornSum = 0.0f;
+            }
+            SYNC_WARPS;
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                localData[get_local_id(0)].x -= floor((localData[get_local_id(0)].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                localData[get_local_id(0)].y -= floor((localData[get_local_id(0)].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                localData[get_local_id(0)].z -= floor((localData[get_local_id(0)].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                SYNC_WARPS;
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    int atom2 = atomIndices[tbx+tj];
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
+                        real rScaledRadiusJ = r+params2.y;
+                        if (params1.x < rScaledRadiusJ) {
+                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
+                            real u_ij = RECIP(rScaledRadiusJ);
+                            real l_ij2 = l_ij*l_ij;
+                            real u_ij2 = u_ij*u_ij;
+                            real ratio = LOG(u_ij * RECIP(l_ij));
+                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
+                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
+                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
+                        }
+                        real rScaledRadiusI = r+params1.y;
+                        if (params2.x < rScaledRadiusI) {
+                            real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
+                            real u_ij = RECIP(rScaledRadiusI);
+                            real l_ij2 = l_ij*l_ij;
+                            real u_ij2 = u_ij*u_ij;
+                            real ratio = LOG(u_ij * RECIP(l_ij));
+                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
+                                             (params1.y*params1.y*invR)*(l_ij2-u_ij2));
+                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
+                            localData[tbx+tj].bornSum += term;
+                        }
+                    }
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                    SYNC_WARPS;
+                }
+            }
+            else
+#endif
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
+#ifdef USE_PERIODIC
+                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    int atom2 = atomIndices[tbx+tj];
+#ifdef USE_CUTOFF
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+#else
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
+                        real rScaledRadiusJ = r+params2.y;
+                        if (params1.x < rScaledRadiusJ) {
+                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
+                            real u_ij = RECIP(rScaledRadiusJ);
+                            real l_ij2 = l_ij*l_ij;
+                            real u_ij2 = u_ij*u_ij;
+                            real ratio = LOG(u_ij * RECIP(l_ij));
+                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
+                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
+                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
+                        }
+                        real rScaledRadiusI = r+params1.y;
+                        if (params2.x < rScaledRadiusI) {
+                            real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
+                            real u_ij = RECIP(rScaledRadiusI);
+                            real l_ij2 = l_ij*l_ij;
+                            real u_ij2 = u_ij*u_ij;
+                            real ratio = LOG(u_ij * RECIP(l_ij));
+                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
+                                             (params1.y*params1.y*invR)*(l_ij2-u_ij2));
+                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
+                            localData[tbx+tj].bornSum += term;
+                        }
+                    }
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                    SYNC_WARPS;
+                }
+            }
+            // Write results.
+#ifdef USE_CUTOFF
+            unsigned int atom2 = atomIndices[get_local_id(0)];
+#else
+            unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+#ifdef SUPPORTS_64_BIT_ATOMICS
+            atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
+            if (atom2 < PADDED_NUM_ATOMS)
+                atom_add(&global_bornSum[atom2], (long) (localData[get_local_id(0)].bornSum*0x100000000));
+#else
+            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
+            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
+            global_bornSum[offset1] += bornSum;
+            if (atom2 < PADDED_NUM_ATOMS)
+                global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
+#endif
+        }
+        pos++;
+    }
+}
+// Per-atom record that computeGBSAForce1 stages in local memory: the atom's position and
+// charge, the force components accumulated for it while a tile is processed, and its Born radius.
+typedef struct {
+    real x;          // position, x component
+    real y;          // position, y component
+    real z;          // position, z component
+    real q;          // charge (taken from posq.w)
+    real fx;         // accumulated force, x component
+    real fy;         // accumulated force, y component
+    real fz;         // accumulated force, z component
+    real fw;         // accumulated term that is added to global_bornForce
+    real bornRadius; // Born radius read from global_bornRadii
+} AtomData2;
+/**
+ * First part of computing the GBSA interaction.
+ *
+ * Each group of TILE_SIZE consecutive work-items (called a "warp" below) processes tiles of
+ * TILE_SIZE x TILE_SIZE atom pairs.  For every pair that passes the bounds test (and, when
+ * USE_CUTOFF is defined, the cutoff test) the kernel accumulates the pair energy, the force on
+ * both atoms, and a fourth force component that is written to global_bornForce.
+ *
+ * The work is split into two loops: first the tiles listed in exclusionTiles, then the remaining
+ * tiles, which come from the neighbor list (USE_CUTOFF and the list did not overflow) or are
+ * enumerated directly.
+ *
+ * Results are accumulated with 64-bit fixed-point atomics (values scaled by 0x100000000) when
+ * SUPPORTS_64_BIT_ATOMICS is defined; otherwise each warp adds into its own slice of the
+ * buffers, located at offset warp*PADDED_NUM_ATOMS.  Each work-item adds its energy to
+ * energyBuffer[get_global_id(0)].
+ *
+ * Synchronization: localData and atomIndices entries are written by one work-item and read by
+ * the other work-items of the same warp, so SYNC_WARPS is issued after staging and after every
+ * loop step, always outside per-work-item conditionals so all work-items of the warp reach it.
+ */
+__kernel void computeGBSAForce1(
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* restrict forceBuffers, __global long* restrict global_bornForce,
+#else
+        __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
+#endif
+        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
+#ifdef USE_CUTOFF
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms,
+#else
+        unsigned int numTiles,
+#endif
+        __global const ushort2* exclusionTiles) {
+    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
+    const unsigned int warp = get_global_id(0)/TILE_SIZE;
+    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index of this work-item within its warp
+    const unsigned int tbx = get_local_id(0) - tgx;           // local index of the first work-item of this warp
+    real energy = 0.0f;
+    __local AtomData2 localData[FORCE_WORK_GROUP_SIZE];
+    // First loop: process tiles that contain exclusions.
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
+        real4 force = 0.0f;
+        unsigned int atom1 = x*TILE_SIZE + tgx;
+        real4 posq1 = posq[atom1];
+        real bornRadius1 = global_bornRadii[atom1];
+        if (x == y) {
+            // This tile is on the diagonal.
+            const unsigned int localAtomIndex = get_local_id(0);
+            localData[localAtomIndex].x = posq1.x;
+            localData[localAtomIndex].y = posq1.y;
+            localData[localAtomIndex].z = posq1.z;
+            localData[localAtomIndex].q = posq1.w;
+            localData[get_local_id(0)].bornRadius = bornRadius1;
+            // The loop below reads entries staged by the other work-items of this warp.
+            SYNC_WARPS;
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
+                    real4 posq2 = (real4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
+                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        real bornRadius2 = localData[tbx+j].bornRadius;
+                        real alpha2_ij = bornRadius1*bornRadius2;
+                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
+                        real expTerm = EXP(-D_ij);
+                        real denominator2 = r2 + alpha2_ij*expTerm;
+                        real denominator = SQRT(denominator2);
+                        real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
+                        real Gpol = tempEnergy*RECIP(denominator2);
+                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
+                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
+                        force.w += dGpol_dalpha2_ij*bornRadius2;
+                        // In a diagonal tile every pair is visited from both of its atoms, so
+                        // only half of the pair energy is added per visit.
+                        energy += 0.5f*tempEnergy;
+                        delta.xyz *= dEdR;
+                        force.xyz -= delta.xyz;
+#ifdef USE_CUTOFF
+                    }
+#endif
+                }
+                // Issued outside the bounds test so that work-items handling padding atoms
+                // reach it too, matching the other tile loops in this kernel.
+                SYNC_WARPS;
+            }
+        }
+        else {
+            // This is an off-diagonal tile.
+            unsigned int j = y*TILE_SIZE + tgx;
+            real4 tempPosq = posq[j];
+            localData[get_local_id(0)].x = tempPosq.x;
+            localData[get_local_id(0)].y = tempPosq.y;
+            localData[get_local_id(0)].z = tempPosq.z;
+            localData[get_local_id(0)].q = tempPosq.w;
+            localData[get_local_id(0)].bornRadius = global_bornRadii[j];
+            localData[get_local_id(0)].fx = 0.0f;
+            localData[get_local_id(0)].fy = 0.0f;
+            localData[get_local_id(0)].fz = 0.0f;
+            localData[get_local_id(0)].fw = 0.0f;
+            // The loop below reads and updates entries staged by the other work-items of this warp.
+            SYNC_WARPS;
+            // tj starts at tgx and advances by one (mod TILE_SIZE) per step, so at any step the
+            // work-items of a warp each address a different localData entry.
+            unsigned int tj = tgx;
+            for (j = 0; j < TILE_SIZE; j++) {
+                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
+                    real4 posq2 = (real4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
+                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        real bornRadius2 = localData[tbx+tj].bornRadius;
+                        real alpha2_ij = bornRadius1*bornRadius2;
+                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
+                        real expTerm = EXP(-D_ij);
+                        real denominator2 = r2 + alpha2_ij*expTerm;
+                        real denominator = SQRT(denominator2);
+                        real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
+                        real Gpol = tempEnergy*RECIP(denominator2);
+                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
+                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
+                        force.w += dGpol_dalpha2_ij*bornRadius2;
+                        energy += tempEnergy;
+                        delta.xyz *= dEdR;
+                        force.xyz -= delta.xyz;
+                        // The second atom of the pair receives the opposite force.
+                        localData[tbx+tj].fx += delta.x;
+                        localData[tbx+tj].fy += delta.y;
+                        localData[tbx+tj].fz += delta.z;
+                        localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
+#ifdef USE_CUTOFF
+                    }
+#endif
+                }
+                tj = (tj + 1) & (TILE_SIZE - 1);
+                SYNC_WARPS;
+            }
+        }
+        // Write results.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        unsigned int offset = x*TILE_SIZE + tgx;
+        atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
+        atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+        atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+        atom_add(&global_bornForce[offset], (long) (force.w*0x100000000));
+        if (x != y) {
+            offset = y*TILE_SIZE + tgx;
+            atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0x100000000));
+            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
+            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
+            atom_add(&global_bornForce[offset], (long) (localData[get_local_id(0)].fw*0x100000000));
+        }
+#else
+        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
+        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
+        forceBuffers[offset1].xyz += force.xyz;
+        global_bornForce[offset1] += force.w;
+        if (x != y) {
+            forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
+            global_bornForce[offset2] += localData[get_local_id(0)].fw;
+        }
+#endif
+    }
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+#ifdef USE_CUTOFF
+    // If the neighbor list overflowed (numTiles > maxTiles), fall back to enumerating every tile.
+    unsigned int numTiles = interactionCount[0];
+    int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+    int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+#else
+    int pos = warp*numTiles/totalWarps;
+    int end = (warp+1)*numTiles/totalWarps;
+#endif
+    int skipBase = 0;
+    int currentSkipIndex = tbx;
+    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
+    __local int skipTiles[FORCE_WORK_GROUP_SIZE];
+    skipTiles[get_local_id(0)] = -1;
+    while (pos < end) {
+        real4 force = 0;
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            // Neighbor-list entry: .x is the block index, .y is used as the single-periodic-copy flag.
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;
+        }
+        else
+#endif
+        {
+            // Convert the linear tile index into (x, y) block coordinates with x >= y.
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+            // Skip over tiles that have exclusions, since they were already processed.
+            SYNC_WARPS;
+            // Refill this warp's window of linearized exclusion-tile indices until its last
+            // entry is at or beyond pos; slots past the end of the list are filled with `end`.
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+                SYNC_WARPS;
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                }
+                else
+                    skipTiles[get_local_id(0)] = end;
+                skipBase += TILE_SIZE;
+                currentSkipIndex = tbx;
+                SYNC_WARPS;
+            }
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos);
+        }
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            real4 posq1 = posq[atom1];
+            real bornRadius1 = global_bornRadii[atom1];
+#ifdef USE_CUTOFF
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
+#endif
+            atomIndices[get_local_id(0)] = j;
+            if (j < PADDED_NUM_ATOMS) {
+                real4 tempPosq = posq[j];
+                localData[get_local_id(0)].x = tempPosq.x;
+                localData[get_local_id(0)].y = tempPosq.y;
+                localData[get_local_id(0)].z = tempPosq.z;
+                localData[get_local_id(0)].q = tempPosq.w;
+                localData[get_local_id(0)].bornRadius = global_bornRadii[j];
+                localData[get_local_id(0)].fx = 0.0f;
+                localData[get_local_id(0)].fy = 0.0f;
+                localData[get_local_id(0)].fz = 0.0f;
+                localData[get_local_id(0)].fw = 0.0f;
+            }
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                localData[get_local_id(0)].x -= floor((localData[get_local_id(0)].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                localData[get_local_id(0)].y -= floor((localData[get_local_id(0)].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                localData[get_local_id(0)].z -= floor((localData[get_local_id(0)].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                SYNC_WARPS;
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = atomIndices[tbx+tj];
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                        real4 posq2 = (real4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
+                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                        if (r2 < CUTOFF_SQUARED) {
+                            real invR = RSQRT(r2);
+                            real r = RECIP(invR);
+                            real bornRadius2 = localData[tbx+tj].bornRadius;
+                            real alpha2_ij = bornRadius1*bornRadius2;
+                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
+                            real expTerm = EXP(-D_ij);
+                            real denominator2 = r2 + alpha2_ij*expTerm;
+                            real denominator = SQRT(denominator2);
+                            real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
+                            real Gpol = tempEnergy*RECIP(denominator2);
+                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
+                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
+                            force.w += dGpol_dalpha2_ij*bornRadius2;
+                            energy += tempEnergy;
+                            delta.xyz *= dEdR;
+                            force.xyz -= delta.xyz;
+                            localData[tbx+tj].fx += delta.x;
+                            localData[tbx+tj].fy += delta.y;
+                            localData[tbx+tj].fz += delta.z;
+                            localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
+                        }
+                    }
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                    SYNC_WARPS;
+                }
+            }
+            else
+#endif
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.
+                // As in the single-periodic-copy path, synchronize first: the loop reads the
+                // atomIndices and localData entries staged by the other work-items of this warp.
+                SYNC_WARPS;
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = atomIndices[tbx+tj];
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                        real4 posq2 = (real4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
+                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                        delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                        if (r2 < CUTOFF_SQUARED) {
+#endif
+                            real invR = RSQRT(r2);
+                            real r = RECIP(invR);
+                            real bornRadius2 = localData[tbx+tj].bornRadius;
+                            real alpha2_ij = bornRadius1*bornRadius2;
+                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
+                            real expTerm = EXP(-D_ij);
+                            real denominator2 = r2 + alpha2_ij*expTerm;
+                            real denominator = SQRT(denominator2);
+                            real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
+                            real Gpol = tempEnergy*RECIP(denominator2);
+                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
+                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
+                            force.w += dGpol_dalpha2_ij*bornRadius2;
+                            energy += tempEnergy;
+                            delta.xyz *= dEdR;
+                            force.xyz -= delta.xyz;
+                            localData[tbx+tj].fx += delta.x;
+                            localData[tbx+tj].fy += delta.y;
+                            localData[tbx+tj].fz += delta.z;
+                            localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
+#ifdef USE_CUTOFF
+                        }
+#endif
+                    }
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                    SYNC_WARPS;
+                }
+            }
+            // Write results.
+#ifdef USE_CUTOFF
+            unsigned int atom2 = atomIndices[get_local_id(0)];
+#else
+            unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+#ifdef SUPPORTS_64_BIT_ATOMICS
+            atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
+            atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+            atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+            atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
+            // Slots whose staged index is out of range were never loaded, so nothing is written for them.
+            if (atom2 < PADDED_NUM_ATOMS) {
+                atom_add(&forceBuffers[atom2], (long) (localData[get_local_id(0)].fx*0x100000000));
+                atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
+                atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
+                atom_add(&global_bornForce[atom2], (long) (localData[get_local_id(0)].fw*0x100000000));
+            }
+#else
+            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
+            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
+            forceBuffers[offset1].xyz += force.xyz;
+            global_bornForce[offset1] += force.w;
+            if (atom2 < PADDED_NUM_ATOMS) {
+                forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
+                global_bornForce[offset2] += localData[get_local_id(0)].fw;
+            }
+#endif
+        }
+        pos++;
+    }
+    energyBuffer[get_global_id(0)] += energy;
+}
--- a/platforms/opencl/src/kernels/gbsaObc_cpu.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_cpu.cl
-#define TILE_SIZE 32
+#ifdef SUPPORTS_64_BIT_ATOMICS
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#endif
 typedef struct {
    real x, y, z;
@@ -10,58 +12,42 @@ typedef struct {
 /**
 * Compute the Born sum.
 */
+__kernel void computeBornSum(
-__kernel void computeBornSum(__global real* restrict global_bornSum, __global const real4* restrict posq, __global const float2* restrict global_params,
+#ifdef SUPPORTS_64_BIT_ATOMICS
-#ifdef USE_CUTOFF
+        __global long* restrict global_bornSum,
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
 #else
-        unsigned int numTiles) {
+        __global real* restrict global_bornSum,
 #endif
+        __global const real4* restrict posq, __global const float2* restrict global_params,
 #ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms,
-    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
-    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
 #else
-    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
+        unsigned int numTiles,
-    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
 #endif
-    unsigned int lasty = 0xFFFFFFFF;
+        __global const ushort2* exclusionTiles) {
    __local AtomData1 localData[TILE_SIZE];
-    while (pos < end) {
+    // First loop: process tiles that contain exclusions.
-        // Extract the coordinates of this tile
-        unsigned int x, y;
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
-#ifdef USE_CUTOFF
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
-        if (numTiles <= maxTiles) {
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-            ushort2 tileIndices = tiles[pos];
+        const ushort2 tileIndices = exclusionTiles[pos];
-            x = tileIndices.x;
+        const unsigned int x = tileIndices.x;
-            y = tileIndices.y;
+        const unsigned int y = tileIndices.y;
-        }
-        else
+        // Load the data for this tile.
-#endif
-        {
+        for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            unsigned int j = y*TILE_SIZE + localAtomIndex;
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            real4 tempPosq = posq[j];
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+            localData[localAtomIndex].x = tempPosq.x;
-                y += (x < y ? -1 : 1);
+            localData[localAtomIndex].y = tempPosq.y;
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            localData[localAtomIndex].z = tempPosq.z;
-            }
+            localData[localAtomIndex].q = tempPosq.w;
-        }
+            float2 tempParams = global_params[j];
+            localData[localAtomIndex].radius = tempParams.x;
-        // Load the data for this tile if we don't already have it cached.
+            localData[localAtomIndex].scaledRadius = tempParams.y;
-        if (lasty != y) {
-            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
-                unsigned int j = y*TILE_SIZE + localAtomIndex;
-                real4 tempPosq = posq[j];
-                localData[localAtomIndex].x = tempPosq.x;
-                localData[localAtomIndex].y = tempPosq.y;
-                localData[localAtomIndex].z = tempPosq.z;
-                localData[localAtomIndex].q = tempPosq.w;
-                float2 tempParams = global_params[j];
-                localData[localAtomIndex].radius = tempParams.x;
-                localData[localAtomIndex].scaledRadius = tempParams.y;
-            }
        }
        if (x == y) {
            // This tile is on the diagonal.
@@ -93,31 +79,31 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
-                            bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
+                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                             (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
+                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
-                            if (params1.x < params2.y-r)
+                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
-                                bornSum += 2.0f*(RECIP(params1.x)-l_ij);
                        }
                    }
                }
                // Write results.
-                unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
+#else
+                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                global_bornSum[offset] += bornSum;
+#endif
            }
        }
        else {
            // This is an off-diagonal tile.
            for (int tgx = 0; tgx < TILE_SIZE; tgx++)
-                localData[tgx].bornSum = 0.0f;
+                localData[tgx].bornSum = 0;
-            // Compute the full set of interactions in this tile.
            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
-                real bornSum = 0.0f;
+                real bornSum = 0;
                real4 posq1 = posq[atom1];
                float2 params1 = global_params[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
@@ -126,7 +112,7 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
 #ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
 #endif
-                    real r2 = dot(delta.xyz, delta.xyz);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
 #else
@@ -134,8 +120,6 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
 #endif
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
                        real rScaledRadiusJ = r+params2.y;
                        if (params1.x < rScaledRadiusJ) {
@@ -144,10 +128,9 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
-                            bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
+                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                             (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
+                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
-                            if (params1.x < params2.y-r)
+                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
-                                bornSum += 2.0f*(RECIP(params1.x)-l_ij);
                        }
                        real rScaledRadiusI = r+params1.y;
                        if (params2.x < rScaledRadiusI) {
@@ -156,10 +139,9 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
-                            real term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
+                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
-                                             (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
+                                             (params1.y*params1.y*invR)*(l_ij2-u_ij2));
-                            if (params2.x < params1.y-r)
+                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
-                                term += 2.0f*(RECIP(params2.x)-l_ij);
                            localData[j].bornSum += term;
                        }
                    }
@@ -167,91 +149,299 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
               // Write results for atom1.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
+#else
                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                global_bornSum[offset] += bornSum;
+#endif
            }
-            // Write results
+            // Write results.
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                unsigned int offset = y*TILE_SIZE + tgx;
+                atom_add(&global_bornSum[offset], (long) (localData[tgx].bornSum*0x100000000));
+#else
                unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                global_bornSum[offset] += localData[tgx].bornSum;
+#endif
            }
        }
-        lasty = y;
-        pos++;
    }
-}
-typedef struct {
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
-    real x, y, z;
+    // of them (no cutoff).
-    real q;
-    real fx, fy, fz, fw;
-    real bornRadius;
-} AtomData2;
-/**
- * First part of computing the GBSA interaction.
- */
-__kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global real* restrict global_bornForce,
-        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
-#ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
-#else
-        unsigned int numTiles) {
-#endif
 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
-    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
+    int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
-    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
+    int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
 #else
-    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
+    int pos = get_group_id(0)*numTiles/get_num_groups(0);
-    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
+    int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
 #endif
-    real energy = 0.0f;
+    int nextToSkip = -1;
-    unsigned int lasty = 0xFFFFFFFF;
+    int currentSkipIndex = 0;
-    __local AtomData2 localData[TILE_SIZE];
+    __local int atomIndices[TILE_SIZE];
    while (pos < end) {
-        // Extract the coordinates of this tile
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
        unsigned int x, y;
+        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
-            y = tileIndices.y;
+            singlePeriodicCopy = tileIndices.y;
        }
        else
 #endif
        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }
-        }
-        // Load the data for this tile if we don't already have it cached.
+            // Skip over tiles that have exclusions, since they were already processed.
+            while (nextToSkip < pos) {
+                if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[currentSkipIndex++];
+                    nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                }
+                else
+                    nextToSkip = end;
+            }
+            includeTile = (nextToSkip != pos);
+        }
+        if (includeTile) {
+            // Load the data for this tile.
-        if (lasty != y) {
            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
-                unsigned int j = y*TILE_SIZE + localAtomIndex;
+#ifdef USE_CUTOFF
-                real4 tempPosq = posq[j];
+                unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+localAtomIndex] : y*TILE_SIZE+localAtomIndex);
-                localData[localAtomIndex].x = tempPosq.x;
+#else
-                localData[localAtomIndex].y = tempPosq.y;
+                unsigned int j = y*TILE_SIZE+localAtomIndex;
-                localData[localAtomIndex].z = tempPosq.z;
+#endif
-                localData[localAtomIndex].q = tempPosq.w;
+                atomIndices[localAtomIndex] = j;
-                localData[localAtomIndex].bornRadius = global_bornRadii[j];
+                if (j < PADDED_NUM_ATOMS) {
+                    real4 tempPosq = posq[j];
+                    localData[localAtomIndex].x = tempPosq.x;
+                    localData[localAtomIndex].y = tempPosq.y;
+                    localData[localAtomIndex].z = tempPosq.z;
+                    localData[localAtomIndex].q = tempPosq.w;
+                    float2 tempParams = global_params[j];
+                    localData[localAtomIndex].radius = tempParams.x;
+                    localData[localAtomIndex].scaledRadius = tempParams.y;
+                    localData[localAtomIndex].bornSum = 0.0f;
+                }
+            }
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                    localData[tgx].x -= floor((localData[tgx].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    localData[tgx].y -= floor((localData[tgx].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    localData[tgx].z -= floor((localData[tgx].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                }
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                    unsigned int atom1 = x*TILE_SIZE+tgx;
+                    real bornSum = 0;
+                    real4 posq1 = posq[atom1];
+                    float2 params1 = global_params[atom1];
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                        int atom2 = atomIndices[j];
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+                            real invR = RSQRT(r2);
+                            real r = RECIP(invR);
+                            float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
+                            real rScaledRadiusJ = r+params2.y;
+                            if (params1.x < rScaledRadiusJ) {
+                                real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
+                                real u_ij = RECIP(rScaledRadiusJ);
+                                real l_ij2 = l_ij*l_ij;
+                                real u_ij2 = u_ij*u_ij;
+                                real ratio = LOG(u_ij * RECIP(l_ij));
+                                bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
+                                                 (params2.y*params2.y*invR)*(l_ij2-u_ij2));
+                                bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
+                            }
+                            real rScaledRadiusI = r+params1.y;
+                            if (params2.x < rScaledRadiusI) {
+                                real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
+                                real u_ij = RECIP(rScaledRadiusI);
+                                real l_ij2 = l_ij*l_ij;
+                                real u_ij2 = u_ij*u_ij;
+                                real ratio = LOG(u_ij * RECIP(l_ij));
+                                real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
+                                                 (params1.y*params1.y*invR)*(l_ij2-u_ij2));
+                                term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
+                                localData[j].bornSum += term;
+                            }
+                        }
+                    }
+                    // Write results for atom1.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                    atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
+#else
+                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    global_bornSum[offset] += bornSum;
+#endif
+                }
+            }
+            else
+#endif
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                    unsigned int atom1 = x*TILE_SIZE+tgx;
+                    real bornSum = 0;
+                    real4 posq1 = posq[atom1];
+                    float2 params1 = global_params[atom1];
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                        delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                        int atom2 = atomIndices[j];
+#ifdef USE_CUTOFF
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+#else
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+#endif
+                            real invR = RSQRT(r2);
+                            real r = RECIP(invR);
+                            float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
+                            real rScaledRadiusJ = r+params2.y;
+                            if (params1.x < rScaledRadiusJ) {
+                                real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
+                                real u_ij = RECIP(rScaledRadiusJ);
+                                real l_ij2 = l_ij*l_ij;
+                                real u_ij2 = u_ij*u_ij;
+                                real ratio = LOG(u_ij * RECIP(l_ij));
+                                bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
+                                                 (params2.y*params2.y*invR)*(l_ij2-u_ij2));
+                                bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
+                            }
+                            real rScaledRadiusI = r+params1.y;
+                            if (params2.x < rScaledRadiusI) {
+                                real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
+                                real u_ij = RECIP(rScaledRadiusI);
+                                real l_ij2 = l_ij*l_ij;
+                                real u_ij2 = u_ij*u_ij;
+                                real ratio = LOG(u_ij * RECIP(l_ij));
+                                real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
+                                                 (params1.y*params1.y*invR)*(l_ij2-u_ij2));
+                                term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
+                                localData[j].bornSum += term;
+                            }
+                        }
+                    }
+                    // Write results for atom1.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                    atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
+#else
+                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    global_bornSum[offset] += bornSum;
+#endif
+                }
+            }
+            // Write results.
+            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
+#ifdef USE_CUTOFF
+                unsigned int atom2 = atomIndices[tgx];
+#else
+                unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+                if (atom2 < PADDED_NUM_ATOMS) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                    atom_add(&global_bornSum[atom2], (long) (localData[tgx].bornSum*0x100000000));
+#else
+                    unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    global_bornSum[offset] += localData[tgx].bornSum;
+#endif
+                }
            }
        }
+        pos++;
+    }
+}
+typedef struct {  // Per-atom tile record; computeGBSAForce1 keeps a __local array of TILE_SIZE of these (localData).
+    real x, y, z;  // Atom position, copied from the xyz components of posq[j].
+    real q;  // Copied from the w component of posq[j]; multiplied into PREFACTOR*posq1.w*posq2.w for the pair energy.
+    real fx, fy, fz, fw;  // Per-tile accumulators: fx/fy/fz are added into forceBuffers, fw into global_bornForce.
+    real bornRadius;  // Copied from global_bornRadii[j].
+} AtomData2;
+/**
+ * First part of computing the GBSA interaction.
+ */
+__kernel void computeGBSAForce1(
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* restrict forceBuffers, __global long* restrict global_bornForce,
+#else
+        __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
+#endif
+        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
+#ifdef USE_CUTOFF
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms,
+#else
+        unsigned int numTiles,
+#endif
+        __global const ushort2* exclusionTiles) {
+    real energy = 0.0f;
+    __local AtomData2 localData[TILE_SIZE];
+    // First loop: process tiles that contain exclusions.
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
+        // Load the data for this tile.
+        for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
+            unsigned int j = y*TILE_SIZE + localAtomIndex;
+            real4 tempPosq = posq[j];
+            localData[localAtomIndex].x = tempPosq.x;
+            localData[localAtomIndex].y = tempPosq.y;
+            localData[localAtomIndex].z = tempPosq.z;
+            localData[localAtomIndex].q = tempPosq.w;
+            localData[localAtomIndex].bornRadius = global_bornRadii[j];
+        }
        if (x == y) {
            // This tile is on the diagonal.
            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
-                real4 force = 0.0f;
+                real4 force = 0;
                real4 posq1 = posq[atom1];
                real bornRadius1 = global_bornRadii[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
@@ -260,7 +450,7 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
 #ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
 #endif
-                    real r2 = dot(delta.xyz, delta.xyz);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
 #else
@@ -277,35 +467,40 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
                        real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
-                        force.w += dGpol_dalpha2_ij*bornRadius2;
                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
+                        force.w += dGpol_dalpha2_ij*bornRadius2;
                        energy += 0.5f*tempEnergy;
-                        force.xyz -= delta.xyz*dEdR;
+                        delta.xyz *= dEdR;
+                        force.xyz -= delta.xyz;
                    }
                }
                // Write results.
-                unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
+                atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+                atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+                atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
+#else
+                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
                global_bornForce[offset] += force.w;
+#endif
            }
        }
        else {
            // This is an off-diagonal tile.
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
-                localData[tgx].fx = 0.0f;
+                localData[tgx].fx = 0;
-                localData[tgx].fy = 0.0f;
+                localData[tgx].fy = 0;
-                localData[tgx].fz = 0.0f;
+                localData[tgx].fz = 0;
-                localData[tgx].fw = 0.0f;
+                localData[tgx].fw = 0;
            }
-            // Compute the full set of interactions in this tile.
            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
-                real4 force = 0.0f;
+                real4 force = 0;
                real4 posq1 = posq[atom1];
                real bornRadius1 = global_bornRadii[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
@@ -314,7 +509,7 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
 #ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
 #endif
-                    real r2 = dot(delta.xyz, delta.xyz);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
 #else
@@ -331,8 +526,8 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
                        real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
-                        force.w += dGpol_dalpha2_ij*bornRadius2;
                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
+                        force.w += dGpol_dalpha2_ij*bornRadius2;
                        energy += tempEnergy;
                        delta.xyz *= dEdR;
                        force.xyz -= delta.xyz;
@@ -343,16 +538,30 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
                    }
                }
-                // Write results for atom1.
+               // Write results for atom1.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
+                atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+                atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+                atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
+#else
                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
                global_bornForce[offset] += force.w;
+#endif
            }
-            // Write results
+            // Write results.
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                unsigned int offset = y*TILE_SIZE + tgx;
+                atom_add(&forceBuffers[offset], (long) (localData[tgx].fx*0x100000000));
+                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000));
+                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000));
+                atom_add(&global_bornForce[offset], (long) (localData[tgx].fw*0x100000000));
+#else
                unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                real4 f = forceBuffers[offset];
                f.x += localData[tgx].fx;
@@ -360,9 +569,231 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
                f.z += localData[tgx].fz;
                forceBuffers[offset] = f;
                global_bornForce[offset] += localData[tgx].fw;
+#endif
+            }
+        }
+    }
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
+    int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
+#else
+    int pos = get_group_id(0)*numTiles/get_num_groups(0);
+    int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
+#endif
+    int nextToSkip = -1;
+    int currentSkipIndex = 0;
+    __local int atomIndices[TILE_SIZE];
+    while (pos < end) {
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+            // Skip over tiles that have exclusions, since they were already processed.
+            while (nextToSkip < pos) {
+                if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[currentSkipIndex++];
+                    nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                }
+                else
+                    nextToSkip = end;
+            }
+            includeTile = (nextToSkip != pos);
+        }
+        if (includeTile) {
+            // Load the data for this tile.
+            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
+#ifdef USE_CUTOFF
+                unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+localAtomIndex] : y*TILE_SIZE+localAtomIndex);
+#else
+                unsigned int j = y*TILE_SIZE+localAtomIndex;
+#endif
+                atomIndices[localAtomIndex] = j;
+                if (j < PADDED_NUM_ATOMS) {
+                    real4 tempPosq = posq[j];
+                    localData[localAtomIndex].x = tempPosq.x;
+                    localData[localAtomIndex].y = tempPosq.y;
+                    localData[localAtomIndex].z = tempPosq.z;
+                    localData[localAtomIndex].q = tempPosq.w;
+                    localData[localAtomIndex].bornRadius = global_bornRadii[j];
+                    localData[localAtomIndex].fx = 0.0f;
+                    localData[localAtomIndex].fy = 0.0f;
+                    localData[localAtomIndex].fz = 0.0f;
+                    localData[localAtomIndex].fw = 0.0f;
+                }
+            }
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                    localData[tgx].x -= floor((localData[tgx].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    localData[tgx].y -= floor((localData[tgx].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    localData[tgx].z -= floor((localData[tgx].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                }
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                    unsigned int atom1 = x*TILE_SIZE+tgx;
+                    real4 force = 0;
+                    real4 posq1 = posq[atom1];
+                    posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                    float bornRadius1 = global_bornRadii[atom1];
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                        int atom2 = atomIndices[j];
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+                            real invR = RSQRT(r2);
+                            real r = RECIP(invR);
+                            real bornRadius2 = localData[j].bornRadius;
+                            real alpha2_ij = bornRadius1*bornRadius2;
+                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
+                            real expTerm = EXP(-D_ij);
+                            real denominator2 = r2 + alpha2_ij*expTerm;
+                            real denominator = SQRT(denominator2);
+                            real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
+                            real Gpol = tempEnergy*RECIP(denominator2);
+                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
+                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
+                            force.w += dGpol_dalpha2_ij*bornRadius2;
+                            energy += tempEnergy;
+                            delta.xyz *= dEdR;
+                            force.xyz -= delta.xyz;
+                            localData[j].fx += delta.x;
+                            localData[j].fy += delta.y;
+                            localData[j].fz += delta.z;
+                            localData[j].fw += dGpol_dalpha2_ij*bornRadius1;
+                        }
+                    }
+                    // Write results for atom1.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                    atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
+                    atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+                    atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+                    atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
+#else
+                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
+                    global_bornForce[offset] += force.w;
+#endif
+                }
+            }
+            else
+#endif
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                    unsigned int atom1 = x*TILE_SIZE+tgx;
+                    real4 force = 0;
+                    real4 posq1 = posq[atom1];
+                    float bornRadius1 = global_bornRadii[atom1];
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                        delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                        int atom2 = atomIndices[j];
+#ifdef USE_CUTOFF
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+#else
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+#endif
+                            real invR = RSQRT(r2);
+                            real r = RECIP(invR);
+                            real bornRadius2 = localData[j].bornRadius;
+                            real alpha2_ij = bornRadius1*bornRadius2;
+                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
+                            real expTerm = EXP(-D_ij);
+                            real denominator2 = r2 + alpha2_ij*expTerm;
+                            real denominator = SQRT(denominator2);
+                            real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
+                            real Gpol = tempEnergy*RECIP(denominator2);
+                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
+                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
+                            force.w += dGpol_dalpha2_ij*bornRadius2;
+                            energy += tempEnergy;
+                            delta.xyz *= dEdR;
+                            force.xyz -= delta.xyz;
+                            localData[j].fx += delta.x;
+                            localData[j].fy += delta.y;
+                            localData[j].fz += delta.z;
+                            localData[j].fw += dGpol_dalpha2_ij*bornRadius1;
+                        }
+                    }
+                    // Write results for atom1.
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                    atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
+                    atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
+                    atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+                    atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
+#else
+                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
+                    global_bornForce[offset] += force.w;
+#endif
+                }
+            }
+            // Write results.
+            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
+#ifdef USE_CUTOFF
+                unsigned int atom2 = atomIndices[tgx];
+#else
+                unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+                if (atom2 < PADDED_NUM_ATOMS) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
+                    atom_add(&forceBuffers[atom2], (long) (localData[tgx].fx*0x100000000));
+                    atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000));
+                    atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000));
+                    atom_add(&global_bornForce[atom2], (long) (localData[tgx].fw*0x100000000));
+#else
+                    unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    real4 f = forceBuffers[offset];
+                    f.x += localData[tgx].fx;
+                    f.y += localData[tgx].fy;
+                    f.z += localData[tgx].fz;
+                    forceBuffers[offset] = f;
+                    global_bornForce[offset] += localData[tgx].fw;
+#endif
+                }
            }
        }
-        lasty = y;
        pos++;
    }
    energyBuffer[get_global_id(0)] += energy;

--- a/platforms/opencl/src/kernels/gbsaObc_default.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_default.cl
-#ifdef SUPPORTS_64_BIT_ATOMICS /* In this configuration the kernels below call atom_add on __global long buffers. */
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#endif
-#define TILE_SIZE 32 /* Atoms per block: atom index = block*TILE_SIZE + lane. A tile pairs block x with block y. */
-// Per-atom record that computeBornSum keeps in local memory for the atoms of one block.
-typedef struct {
-    real x;              // Position components, copied from posq[atom].xyz.
-    real y;
-    real z;
-    float radius;        // Copied from global_params[atom].x.
-    float scaledRadius;  // Copied from global_params[atom].y.
-} AtomData1;
-/**
- * Compute the Born sum.
- *
- * Each work-group walks the tiles in [pos, end). A tile is identified by block
- * indices (x, y); block x supplies atom1 for every work-item and block y is
- * cached in localData. For each included pair the pair term is added to the
- * running sum of one or both atoms, and the per-atom totals are then added to
- * global_bornSum.
- *
- * global_bornSum: output accumulator. With SUPPORTS_64_BIT_ATOMICS it is a long
- *     buffer updated through atom_add with values scaled by 0x100000000;
- *     otherwise it is a real buffer with one PADDED_NUM_ATOMS section per
- *     work-group (per block when USE_OUTPUT_BUFFER_PER_BLOCK is defined).
- * posq: per-atom real4; only xyz (the position) is used here.
- * global_params: per-atom float2; x is cached as "radius", y as "scaledRadius".
- * tiles, interactionCount, maxTiles (USE_CUTOFF only): list of (x, y) tiles, its
- *     length in interactionCount[0], and the limit above which the list is
- *     ignored and every tile is enumerated instead.
- * periodicBoxSize, invPeriodicBoxSize (USE_CUTOFF only): used for the
- *     minimum-image wrap when USE_PERIODIC is defined.
- * numTiles (without USE_CUTOFF): number of tiles to divide among work-groups.
- *
- * NOTE(review): the indexing (baseLocalAtom, localBornSum[id+TILE_SIZE],
- * localTemp) looks like it expects FORCE_WORK_GROUP_SIZE == 2*TILE_SIZE - confirm.
- */
-__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
-void computeBornSum(
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict global_bornSum,
-#else
-        __global real* restrict global_bornSum,
-#endif
-        __global const real4* restrict posq, __global const float2* restrict global_params,
-#ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles) {
-#else
-        unsigned int numTiles) {
-#endif
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0]; // Number of tiles recorded in the tile list.
-    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0); // If the count exceeds maxTiles, split all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles instead.
-    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
-#else
-    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0); // This work-group handles tiles [pos, end).
-    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
-#endif
-    unsigned int lasty = 0xFFFFFFFF; // Block y of the previous tile; 0xFFFFFFFF means no tile processed yet.
-    __local AtomData1 localData[TILE_SIZE]; // Cached atoms of block y.
-    __local real localBornSum[FORCE_WORK_GROUP_SIZE]; // Sums for block-y atoms, one TILE_SIZE section per group of TILE_SIZE work-items.
-    __local real localTemp[TILE_SIZE]; // bornSum handed over by work-items with local id >= TILE_SIZE.
-    while (pos < end) {
-        // Extract the coordinates of this tile
-        unsigned int x, y;
-#ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) { // The tile list is usable: read the block indices from it.
-            ushort2 tileIndices = tiles[pos];
-            x = tileIndices.x;
-            y = tileIndices.y;
-        }
-        else
-#endif
-        { // Otherwise derive (x, y) with y <= x < NUM_BLOCKS from the linear tile index pos.
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
-        }
-        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); // Work-items with local id >= TILE_SIZE start at the second half of block y.
-        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // Lane within the group of TILE_SIZE work-items.
-        unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1); // Local id rounded down to a multiple of TILE_SIZE.
-        unsigned int atom1 = x*TILE_SIZE + tgx;
-        real bornSum = 0.0f; // This work-item's partial sum for atom1.
-        real4 posq1 = posq[atom1];
-        float2 params1 = global_params[atom1];
-        if (x == y) {
-            // This tile is on the diagonal.
-            if (get_local_id(0) < TILE_SIZE) { // Block x and block y coincide, so atom1's data is what gets cached.
-                localData[get_local_id(0)].x = posq1.x;
-                localData[get_local_id(0)].y = posq1.y;
-                localData[get_local_id(0)].z = posq1.z;
-                localData[get_local_id(0)].radius = params1.x;
-                localData[get_local_id(0)].scaledRadius = params1.y;
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-            for (unsigned int j = 0; j < TILE_SIZE/2; j++) { // Each work-item visits TILE_SIZE/2 atoms starting at baseLocalAtom.
-                real4 delta = (real4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0);
-#ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; // Minimum-image wrap of the displacement.
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                real invR = RSQRT(r2);
-                real r = RECIP(invR);
-                float2 params2 = (float2) (localData[baseLocalAtom+j].radius, localData[baseLocalAtom+j].scaledRadius);
-                real rScaledRadiusJ = r+params2.y;
-#ifdef USE_CUTOFF
-                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS && r2 < CUTOFF_SQUARED && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ)); // Excludes padding atoms, pairs beyond the cutoff, the self pair, and pairs with params1.x >= r+params2.y.
-#else
-                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ)); // Same test without the cutoff.
-#endif
-                real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
-                real u_ij = RECIP(rScaledRadiusJ);
-                real l_ij2 = l_ij*l_ij;
-                real u_ij2 = u_ij*u_ij;
-                real ratio = LOG(u_ij * RECIP(l_ij));
-                bornSum += (includeInteraction ? l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
-                                 (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2) : (real) 0);
-                bornSum += (includeInteraction && params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : (real) 0); // Extra term when params1.x < params2.y - r.
-            }
-            // Sum the forces and write results.
-            if (get_local_id(0) >= TILE_SIZE) // Hand the partial sum to the work-item with the same lane and local id < TILE_SIZE.
-                localTemp[tgx] = bornSum;
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (get_local_id(0) < TILE_SIZE) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
-                const unsigned int offset = x*TILE_SIZE + tgx;
-                atom_add(&global_bornSum[offset], (long) ((bornSum + localTemp[tgx])*0x100000000)); // Accumulated as an integer: the value is scaled by 0x100000000 and cast to long.
-#else
-#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-                const unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; // Output section selected by block index.
-#else
-                const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; // Output section selected by work-group.
-#endif
-                global_bornSum[offset] += bornSum + localTemp[tgx];
-#endif
-            }
-            // barrier not required here as localTemp is not accessed before encountering another barrier.
-        }
-        else {
-            // This is an off-diagonal tile.
-            if (lasty != y && get_local_id(0) < TILE_SIZE) { // Load block y unless it is still cached from the previous tile.
-                unsigned int j = y*TILE_SIZE + tgx;
-                real4 tempPosq = posq[j];
-                localData[get_local_id(0)].x = tempPosq.x;
-                localData[get_local_id(0)].y = tempPosq.y;
-                localData[get_local_id(0)].z = tempPosq.z;
-                float2 tempParams = global_params[j];
-                localData[get_local_id(0)].radius = tempParams.x;
-                localData[get_local_id(0)].scaledRadius = tempParams.y;
-            }
-            localBornSum[get_local_id(0)] = 0.0f;
-            barrier(CLK_LOCAL_MEM_FENCE);
-            // Compute the full set of interactions in this tile.
-            unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1); // Starting atom in block y; advances by one (mod TILE_SIZE) per iteration, so lanes in a group never share a tj.
-            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
-                real4 delta = (real4) (localData[tj].x-posq1.x, localData[tj].y-posq1.y, localData[tj].z-posq1.z, 0);
-#ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; // Minimum-image wrap of the displacement.
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED); // Excludes padding atoms and pairs beyond the cutoff.
-#else
-                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS); // Excludes padding atoms.
-#endif
-                real invR = RSQRT(r2);
-                real r = RECIP(invR);
-                float2 params2 = (float2) (localData[tj].radius, localData[tj].scaledRadius);
-                real rScaledRadiusJ = r+params2.y;
-                { // Term added to atom1's sum, using atom1's radius and atom tj's scaled radius.
-                    real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
-                    real u_ij = RECIP(rScaledRadiusJ);
-                    real l_ij2 = l_ij*l_ij;
-                    real u_ij2 = u_ij*u_ij;
-                    real ratio = LOG(u_ij * RECIP(l_ij));
-                    unsigned int includeTerm = (includeInteraction && params1.x < rScaledRadiusJ);
-                    bornSum += (includeTerm ? l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
-                                     (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2) : (real) 0);
-                    bornSum += (includeTerm && params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : (real) 0);
-                }
-                real rScaledRadiusI = r+params1.y;
-                { // Term added to atom tj's sum: the same expression with the two atoms' roles swapped.
-                    real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
-                    real u_ij = RECIP(rScaledRadiusI);
-                    real l_ij2 = l_ij*l_ij;
-                    real u_ij2 = u_ij*u_ij;
-                    real ratio = LOG(u_ij * RECIP(l_ij));
-                    real term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
-                                     (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
-                    term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : (real) 0);
-                    localBornSum[tj+localForceOffset] += (includeInteraction && params2.x < rScaledRadiusI ? term : (real) 0);
-                }
-                barrier(CLK_LOCAL_MEM_FENCE); // All localBornSum updates for this step complete before tj advances.
-                tj = (tj+1) & (TILE_SIZE-1);
-            }
-            // Sum the forces and write results.
-            if (get_local_id(0) >= TILE_SIZE) // Hand the partial sum to the work-item with the same lane and local id < TILE_SIZE.
-                localTemp[tgx] = bornSum;
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (get_local_id(0) < TILE_SIZE) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
-                const unsigned int offset1 = x*TILE_SIZE + tgx; // Atom in block x.
-                const unsigned int offset2 = y*TILE_SIZE + tgx; // Atom in block y.
-                atom_add(&global_bornSum[offset1], (long) ((bornSum + localTemp[tgx])*0x100000000));
-                atom_add(&global_bornSum[offset2], (long) ((localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE])*0x100000000)); // Combines the two sections of localBornSum for this block-y atom.
-#else
-#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-                const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
-                const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
-#else
-                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-#endif
-                // Do both loads before both stores to minimize store-load waits.
-                real sum1 = global_bornSum[offset1];
-                real sum2 = global_bornSum[offset2];
-                sum1 += bornSum + localTemp[tgx];
-                sum2 += localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE];
-                global_bornSum[offset1] = sum1;
-                global_bornSum[offset2] = sum2;
-#endif
-            }
-            barrier(CLK_LOCAL_MEM_FENCE); // Reads of localBornSum and localTemp finish before the next tile reuses them.
-        }
-        lasty = y;
-        pos++;
-    }
-}
-// Four-component accumulator followed by one extra element; computeGBSAForce1 uses it for localForce.
-typedef struct {
-    real x;
-    real y;
-    real z;
-    real w;
-    // Never read or written by the kernels in this file.
-    // NOTE(review): presumably present only to change the element stride in local memory - confirm.
-    real padding;
-} PaddedUnalignedFloat4;
-// Per-atom record that computeGBSAForce1 keeps in local memory for the atoms of one block.
-typedef struct {
-    real x;           // Position components, copied from posq[atom].xyz.
-    real y;
-    real z;
-    real q;           // Copied from posq[atom].w.
-    real bornRadius;  // Copied from global_bornRadii[atom].
-    real temp_x;      // temp_* receive the force accumulated by work-items with local id >= TILE_SIZE.
-    real temp_y;
-    real temp_z;
-    real temp_w;
-} AtomData2;
-/**
- * First part of computing the GBSA interaction.
- *
- * Each work-group walks the tiles in [pos, end). A tile is identified by block
- * indices (x, y); block x supplies atom1 for every work-item and block y is
- * cached in localData. For each included pair the kernel accumulates the pair
- * energy, the force on both atoms, and a fourth component (force.w) holding
- * dGpol_dalpha2_ij multiplied by the partner's Born radius. Per-atom totals are
- * then added to forceBuffers (xyz) and global_bornForce (w).
- *
- * forceBuffers, global_bornForce: output accumulators. With
- *     SUPPORTS_64_BIT_ATOMICS they are long buffers updated through atom_add
- *     with values scaled by 0x100000000, the three force components stored
- *     PADDED_NUM_ATOMS apart; otherwise they are real4/real buffers with one
- *     PADDED_NUM_ATOMS section per work-group (per block when
- *     USE_OUTPUT_BUFFER_PER_BLOCK is defined).
- * energyBuffer: indexed by global id; this work-item's energy is added to it.
- * posq: per-atom real4; xyz is the position and w is cached in the q field
- *     (presumably the charge - confirm).
- * global_bornRadii: per-atom Born radius.
- * tiles, interactionCount, maxTiles (USE_CUTOFF only): list of (x, y) tiles, its
- *     length in interactionCount[0], and the limit above which the list is
- *     ignored and every tile is enumerated instead.
- * periodicBoxSize, invPeriodicBoxSize (USE_CUTOFF only): used for the
- *     minimum-image wrap when USE_PERIODIC is defined.
- * numTiles (without USE_CUTOFF): number of tiles to divide among work-groups.
- *
- * NOTE(review): the indexing (baseLocalAtom, localForce[id+TILE_SIZE], temp_*)
- * looks like it expects FORCE_WORK_GROUP_SIZE == 2*TILE_SIZE - confirm.
- */
-__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
-void computeGBSAForce1(
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict forceBuffers, __global long* restrict global_bornForce,
-#else
-        __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
-#endif
-        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
-#ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles) {
-#else
-        unsigned int numTiles) {
-#endif
-#ifdef USE_CUTOFF
-    // If the recorded tile count exceeds maxTiles, split all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles instead.
-    unsigned int numTiles = interactionCount[0];
-    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
-    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
-#else
-    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
-    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
-#endif
-    real energy = 0.0f;
-    unsigned int lasty = 0xFFFFFFFF; // Block y of the previous tile; 0xFFFFFFFF means no tile processed yet.
-    __local AtomData2 localData[TILE_SIZE]; // Cached atoms of block y, plus temp_* hand-over slots.
-    __local PaddedUnalignedFloat4 localForce[FORCE_WORK_GROUP_SIZE]; // Forces on block-y atoms, one TILE_SIZE section per group of TILE_SIZE work-items.
-    while (pos < end) {
-        // Extract the coordinates of this tile
-        unsigned int x, y;
-#ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
-            ushort2 tileIndices = tiles[pos];
-            x = tileIndices.x;
-            y = tileIndices.y;
-        }
-        else
-#endif
-        {
-            // Derive (x, y) with y <= x < NUM_BLOCKS from the linear tile index pos.
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
-        }
-        // Work-items with local id >= TILE_SIZE start at the second half of block y.
-        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
-        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
-        unsigned int atom1 = x*TILE_SIZE + tgx;
-        real4 force = 0.0f;
-        real4 posq1 = posq[atom1];
-        real bornRadius1 = global_bornRadii[atom1];
-        if (x == y) {
-            // This tile is on the diagonal.
-            if (get_local_id(0) < TILE_SIZE) {
-                localData[get_local_id(0)].x = posq1.x;
-                localData[get_local_id(0)].y = posq1.y;
-                localData[get_local_id(0)].z = posq1.z;
-                localData[get_local_id(0)].q = posq1.w;
-                localData[get_local_id(0)].bornRadius = bornRadius1;
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
-                // The self pair (baseLocalAtom+j == tgx) is not excluded here; its delta is zero.
-                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS);
-                real4 posq2 = (real4) (localData[baseLocalAtom+j].x, localData[baseLocalAtom+j].y, localData[baseLocalAtom+j].z, localData[baseLocalAtom+j].q);
-                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                real invR = RSQRT(r2);
-                real r = RECIP(invR);
-                real bornRadius2 = localData[baseLocalAtom+j].bornRadius;
-                real alpha2_ij = bornRadius1*bornRadius2;
-                real D_ij = r2*RECIP(4.0f*alpha2_ij);
-                real expTerm = EXP(-D_ij);
-                real denominator2 = r2 + alpha2_ij*expTerm;
-                real denominator = SQRT(denominator2);
-                real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
-                real Gpol = tempEnergy*RECIP(denominator2);
-                real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
-                real dEdR = Gpol*(1.0f - 0.25f*expTerm);
-#ifdef USE_CUTOFF
-                // Zero every contribution of pairs beyond the cutoff.
-                dEdR = (r2 > CUTOFF_SQUARED ? (real) 0 : dEdR);
-                tempEnergy = (r2 > CUTOFF_SQUARED ? (real) 0 : tempEnergy);
-                dGpol_dalpha2_ij = (r2 > CUTOFF_SQUARED ? (real) 0 : dGpol_dalpha2_ij);
-#endif
-                force.w += (includeInteraction ? dGpol_dalpha2_ij*bornRadius2 : (real) 0);
-                // Every pair of a diagonal tile is visited from both atoms, so each visit adds half the energy.
-                energy += (includeInteraction ? 0.5f*tempEnergy : (real) 0);
-                delta.xyz *= (includeInteraction ? dEdR : (real) 0);
-                force.xyz -= delta.xyz;
-            }
-            // Sum the forces and write results.
-            if (get_local_id(0) >= TILE_SIZE) {
-                localData[tgx].temp_x = force.x;
-                localData[tgx].temp_y = force.y;
-                localData[tgx].temp_z = force.z;
-                localData[tgx].temp_w = force.w;
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (get_local_id(0) < TILE_SIZE) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
-                // Accumulated as integers: each value is scaled by 0x100000000 and cast to long.
-                const unsigned int offset = x*TILE_SIZE + tgx;
-                atom_add(&forceBuffers[offset], (long) ((force.x + localData[tgx].temp_x)*0x100000000));
-                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].temp_y)*0x100000000));
-                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].temp_z)*0x100000000));
-                atom_add(&global_bornForce[offset], (long) ((force.w + localData[tgx].temp_w)*0x100000000));
-#else
-#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-                const unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
-#else
-                const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-#endif
-                // Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
-                real4 sum = forceBuffers[offset];
-                real global_sum = global_bornForce[offset];
-                sum.x += force.x + localData[tgx].temp_x;
-                sum.y += force.y + localData[tgx].temp_y;
-                sum.z += force.z + localData[tgx].temp_z;
-                global_sum += force.w + localData[tgx].temp_w;
-                forceBuffers[offset] = sum;
-                global_bornForce[offset] = global_sum;
-#endif
-            }
-            // barrier not required here as localData[*]/temp_* is not accessed before encountering another barrier.
-        }
-        else {
-            // This is an off-diagonal tile.
-            // Load block y unless it is still cached from the previous tile.
-            if (lasty != y && get_local_id(0) < TILE_SIZE) {
-                unsigned int j = y*TILE_SIZE + tgx;
-                real4 tempPosq = posq[j];
-                localData[get_local_id(0)].x = tempPosq.x;
-                localData[get_local_id(0)].y = tempPosq.y;
-                localData[get_local_id(0)].z = tempPosq.z;
-                localData[get_local_id(0)].q = tempPosq.w;
-                localData[get_local_id(0)].bornRadius = global_bornRadii[j];
-            }
-            localForce[get_local_id(0)].x = 0.0f;
-            localForce[get_local_id(0)].y = 0.0f;
-            localForce[get_local_id(0)].z = 0.0f;
-            localForce[get_local_id(0)].w = 0.0f;
-            barrier(CLK_LOCAL_MEM_FENCE);
-            // Compute the full set of interactions in this tile.
-            // tj advances by one (mod TILE_SIZE) per iteration, so lanes in a group never share a tj.
-            unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
-            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
-                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
-                real4 posq2 = (real4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
-                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
-#ifdef USE_PERIODIC
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                real invR = RSQRT(r2);
-                real r = RECIP(invR);
-                real bornRadius2 = localData[tj].bornRadius;
-                real alpha2_ij = bornRadius1*bornRadius2;
-                real D_ij = r2*RECIP(4.0f*alpha2_ij);
-                real expTerm = EXP(-D_ij);
-                real denominator2 = r2 + alpha2_ij*expTerm;
-                real denominator = SQRT(denominator2);
-                real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
-                real Gpol = tempEnergy*RECIP(denominator2);
-                real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
-                real dEdR = Gpol*(1.0f - 0.25f*expTerm);
-#ifdef USE_CUTOFF
-                // Zero every contribution of pairs beyond the cutoff.
-                dEdR = (r2 > CUTOFF_SQUARED ? (real) 0 : dEdR);
-                tempEnergy = (r2 > CUTOFF_SQUARED ? (real) 0 : tempEnergy);
-                dGpol_dalpha2_ij = (r2 > CUTOFF_SQUARED ? (real) 0 : dGpol_dalpha2_ij);
-#endif
-                force.w += (includeInteraction ? dGpol_dalpha2_ij*bornRadius2 : (real) 0);
-                energy += (includeInteraction ? tempEnergy : (real) 0);
-                delta.xyz *= (includeInteraction ? dEdR : (real) 0);
-                force.xyz -= delta.xyz;
-                // The opposite contribution goes to atom tj of block y.
-                localForce[tj+localForceOffset].x += delta.x;
-                localForce[tj+localForceOffset].y += delta.y;
-                localForce[tj+localForceOffset].z += delta.z;
-                localForce[tj+localForceOffset].w += (includeInteraction ? dGpol_dalpha2_ij*bornRadius1 : (real) 0);
-                // All localForce updates for this step complete before tj advances.
-                barrier(CLK_LOCAL_MEM_FENCE);
-                tj = (tj+1) & (TILE_SIZE-1);
-            }
-            // Sum the forces and write results.
-            if (get_local_id(0) >= TILE_SIZE) {
-                localData[tgx].temp_x = force.x;
-                localData[tgx].temp_y = force.y;
-                localData[tgx].temp_z = force.z;
-                localData[tgx].temp_w = force.w;
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (get_local_id(0) < TILE_SIZE) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
-                // No barrier inside this branch: only work-items with local id < TILE_SIZE get here, and
-                // OpenCL requires a barrier to be reached by every work-item of the work-group. The barrier
-                // just above already makes the temp_* writes visible, and the loop's final barrier covers localForce.
-                const unsigned int offset1 = x*TILE_SIZE + tgx;
-                const unsigned int offset2 = y*TILE_SIZE + tgx;
-                atom_add(&forceBuffers[offset1], (long) ((force.x + localData[tgx].temp_x)*0x100000000));
-                atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].temp_y)*0x100000000));
-                atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].temp_z)*0x100000000));
-                atom_add(&global_bornForce[offset1], (long) ((force.w + localData[tgx].temp_w)*0x100000000));
-                atom_add(&forceBuffers[offset2], (long) ((localForce[get_local_id(0)].x + localForce[get_local_id(0)+TILE_SIZE].x)*0x100000000));
-                atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((localForce[get_local_id(0)].y + localForce[get_local_id(0)+TILE_SIZE].y)*0x100000000));
-                atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((localForce[get_local_id(0)].z + localForce[get_local_id(0)+TILE_SIZE].z)*0x100000000));
-                atom_add(&global_bornForce[offset2], (long) ((localForce[get_local_id(0)].w + localForce[get_local_id(0)+TILE_SIZE].w)*0x100000000));
-#else
-#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-                const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
-                const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
-#else
-                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-#endif
-                // Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
-                real4 sum1 = forceBuffers[offset1];
-                real4 sum2 = forceBuffers[offset2];
-                real global_sum1 = global_bornForce[offset1];
-                real global_sum2 = global_bornForce[offset2];
-                sum1.x += force.x + localData[tgx].temp_x;
-                sum1.y += force.y + localData[tgx].temp_y;
-                sum1.z += force.z + localData[tgx].temp_z;
-                global_sum1 += force.w + localData[tgx].temp_w;
-                sum2.x += localForce[get_local_id(0)].x + localForce[get_local_id(0)+TILE_SIZE].x;
-                sum2.y += localForce[get_local_id(0)].y + localForce[get_local_id(0)+TILE_SIZE].y;
-                sum2.z += localForce[get_local_id(0)].z + localForce[get_local_id(0)+TILE_SIZE].z;
-                global_sum2 += localForce[get_local_id(0)].w + localForce[get_local_id(0)+TILE_SIZE].w;
-                forceBuffers[offset1] = sum1;
-                forceBuffers[offset2] = sum2;
-                global_bornForce[offset1] = global_sum1;
-                global_bornForce[offset2] = global_sum2;
-#endif
-            }
-            // Reads of localForce and temp_* finish before the next tile reuses them.
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-        lasty = y;
-        pos++;
-    }
-    energyBuffer[get_global_id(0)] += energy;
-}