Created OpenCL implementation of GBSAOBCForce that works with non-Nvidia processors

95e059dc · Peter Eastman · f0cab877 · 95e059dc · 95e059dc · 95e059dc
Commit 95e059dc authored Oct 28, 2009 by Peter Eastman
7 changed files
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -853,28 +853,15 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
            defines["USE_CUTOFF"] = "1";
        if (nb.getUsePeriodic())
            defines["USE_PERIODIC"] = "1";
-        stringstream xsize, ysize, zsize, cutoffSquared, prefac;
-        xsize.precision(8);
-        ysize.precision(8);
-        zsize.precision(8);
-        cutoffSquared.precision(8);
-        prefac.precision(8);
-        xsize << scientific << nb.getPeriodicBoxSize().x << "f";
-        ysize << scientific << nb.getPeriodicBoxSize().y << "f";
-        zsize << scientific << nb.getPeriodicBoxSize().z << "f";
-        cutoffSquared << scientific << (nb.getCutoffDistance()*nb.getCutoffDistance()) << "f";
-        prefac << scientific << prefactor << "f";
-        defines["PERIODIC_BOX_SIZE_X"] = xsize.str();
-        defines["PERIODIC_BOX_SIZE_Y"] = ysize.str();
-        defines["PERIODIC_BOX_SIZE_Z"] = zsize.str();
-        defines["CUTOFF_SQUARED"] = cutoffSquared.str();
-        defines["PREFACTOR"] = prefac.str();
-        stringstream natom, padded;
-        natom << cl.getNumAtoms();
-        padded << cl.getPaddedNumAtoms();
-        defines["NUM_ATOMS"] = natom.str();
-        defines["PADDED_NUM_ATOMS"] = padded.str();
-        cl::Program program = cl.createProgram(cl.loadSourceFromFile("gbsaObc.cl"), defines);
+        defines["PERIODIC_BOX_SIZE_X"] = doubleToString(nb.getPeriodicBoxSize().x);
+        defines["PERIODIC_BOX_SIZE_Y"] = doubleToString(nb.getPeriodicBoxSize().y);
+        defines["PERIODIC_BOX_SIZE_Z"] = doubleToString(nb.getPeriodicBoxSize().z);
+        defines["CUTOFF_SQUARED"] = doubleToString(nb.getCutoffDistance()*nb.getCutoffDistance());
+        defines["PREFACTOR"] = doubleToString(prefactor);
+        defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
+        defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
+        string filename = (cl.getSIMDWidth() == 32 ? "gbsaObc_nvidia.cl" : "gbsaObc_default.cl");
+        cl::Program program = cl.createProgram(cl.loadSourceFromFile(filename), defines);
        computeBornSumKernel = cl::Kernel(program, "computeBornSum");
        computeBornSumKernel.setArg<cl::Buffer>(0, bornSum->getDeviceBuffer());
        computeBornSumKernel.setArg(1, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
@@ -882,26 +869,16 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
        computeBornSumKernel.setArg(3, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
        computeBornSumKernel.setArg<cl::Buffer>(4, params->getDeviceBuffer());
        computeBornSumKernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float2), NULL);
+        computeBornSumKernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
        if (nb.getUseCutoff()) {
-            computeBornSumKernel.setArg<cl::Buffer>(6, nb.getInteractingTiles().getDeviceBuffer());
-            computeBornSumKernel.setArg<cl::Buffer>(7, nb.getInteractionFlags().getDeviceBuffer());
-            computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractionCount().getDeviceBuffer());
-            computeBornSumKernel.setArg(9, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
+            computeBornSumKernel.setArg<cl::Buffer>(7, nb.getInteractingTiles().getDeviceBuffer());
+            computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractionFlags().getDeviceBuffer());
+            computeBornSumKernel.setArg<cl::Buffer>(9, nb.getInteractionCount().getDeviceBuffer());
        }
        else {
-            computeBornSumKernel.setArg<cl::Buffer>(6, nb.getTiles().getDeviceBuffer());
-            computeBornSumKernel.setArg<cl_uint>(7, nb.getTiles().getSize());
+            computeBornSumKernel.setArg<cl::Buffer>(7, nb.getTiles().getDeviceBuffer());
+            computeBornSumKernel.setArg<cl_uint>(8, nb.getTiles().getSize());
        }
-        reduceBornSumKernel = cl::Kernel(program, "reduceBornSum");
-        reduceBornSumKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
-        reduceBornSumKernel.setArg<cl_int>(1, cl.getNumForceBuffers());
-        reduceBornSumKernel.setArg<cl_float>(2, 1.0f);
-        reduceBornSumKernel.setArg<cl_float>(3, 0.8f);
-        reduceBornSumKernel.setArg<cl_float>(4, 4.85f);
-        reduceBornSumKernel.setArg<cl::Buffer>(5, bornSum->getDeviceBuffer());
-        reduceBornSumKernel.setArg<cl::Buffer>(6, params->getDeviceBuffer());
-        reduceBornSumKernel.setArg<cl::Buffer>(7, bornRadii->getDeviceBuffer());
-        reduceBornSumKernel.setArg<cl::Buffer>(8, obcChain->getDeviceBuffer());
        force1Kernel = cl::Kernel(program, "computeGBSAForce1");
        force1Kernel.setArg<cl::Buffer>(0, cl.getForceBuffers().getDeviceBuffer());
        force1Kernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
@@ -912,16 +889,27 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
        force1Kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
        force1Kernel.setArg<cl::Buffer>(7, bornForce->getDeviceBuffer());
        force1Kernel.setArg(8, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
+        force1Kernel.setArg(9, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
        if (nb.getUseCutoff()) {
-            force1Kernel.setArg<cl::Buffer>(9, nb.getInteractingTiles().getDeviceBuffer());
-            force1Kernel.setArg<cl::Buffer>(10, nb.getInteractionFlags().getDeviceBuffer());
-            force1Kernel.setArg<cl::Buffer>(11, nb.getInteractionCount().getDeviceBuffer());
-            force1Kernel.setArg(12, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
+            force1Kernel.setArg<cl::Buffer>(10, nb.getInteractingTiles().getDeviceBuffer());
+            force1Kernel.setArg<cl::Buffer>(11, nb.getInteractionFlags().getDeviceBuffer());
+            force1Kernel.setArg<cl::Buffer>(12, nb.getInteractionCount().getDeviceBuffer());
        }
        else {
-            force1Kernel.setArg<cl::Buffer>(9, nb.getTiles().getDeviceBuffer());
-            force1Kernel.setArg<cl_uint>(10, nb.getTiles().getSize());
+            force1Kernel.setArg<cl::Buffer>(10, nb.getTiles().getDeviceBuffer());
+            force1Kernel.setArg<cl_uint>(11, nb.getTiles().getSize());
        }
+        program = cl.createProgram(cl.loadSourceFromFile("gbsaObcReductions.cl"), defines);
+        reduceBornSumKernel = cl::Kernel(program, "reduceBornSum");
+        reduceBornSumKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
+        reduceBornSumKernel.setArg<cl_int>(1, cl.getNumForceBuffers());
+        reduceBornSumKernel.setArg<cl_float>(2, 1.0f);
+        reduceBornSumKernel.setArg<cl_float>(3, 0.8f);
+        reduceBornSumKernel.setArg<cl_float>(4, 4.85f);
+        reduceBornSumKernel.setArg<cl::Buffer>(5, bornSum->getDeviceBuffer());
+        reduceBornSumKernel.setArg<cl::Buffer>(6, params->getDeviceBuffer());
+        reduceBornSumKernel.setArg<cl::Buffer>(7, bornRadii->getDeviceBuffer());
+        reduceBornSumKernel.setArg<cl::Buffer>(8, obcChain->getDeviceBuffer());
        reduceBornForceKernel = cl::Kernel(program, "reduceBornForce");
        reduceBornForceKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
        reduceBornForceKernel.setArg<cl_int>(1, cl.getNumForceBuffers());

--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
@@ -300,7 +300,7 @@ private:
 class OpenCLCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, System& system) : CalcNonbondedForceKernel(name, platform), cl(cl),
-            exceptionParams(NULL), exceptionIndices(NULL) {
+            sigmaEpsilon(NULL), exceptionParams(NULL), exceptionIndices(NULL), cosSinSums(NULL) {
    }
    ~OpenCLCalcNonbondedForceKernel();
    /**

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -107,6 +107,14 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
    if (cutoff == -1.0)
        return; // There are no nonbonded interactions in the System.
    
+    if (atomExclusions.size() == 0) {
+        // No exclusions were specifically requested, so just mark every atom as not interacting with itself.
+        
+        atomExclusions.resize(context.getNumAtoms());
+        for (int i = 0; i < atomExclusions.size(); i++)
+            atomExclusions[i].push_back(i);
+    }
+
    // Create the list of tiles.

    int numAtomBlocks = context.getNumAtomBlocks();

--- a/platforms/opencl/src/kernels/gbsaObcReductions.cl
+++ b/platforms/opencl/src/kernels/gbsaObcReductions.cl
+const float dielectricOffset = 0.009f;
+const float probeRadius = 0.14f;
+const float surfaceAreaFactor = -6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f;
+
+/**
+ * Reduce the Born sums to compute the Born radii.
+ */
+
+__kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma,
+            __global float* bornSum, __global float2* params, __global float* bornRadii, __global float* obcChain) {
+    unsigned int index = get_global_id(0);
+    while (index < NUM_ATOMS) {
+        // Get summed Born data
+
+        int totalSize = bufferSize*numBuffers;
+        float sum = bornSum[index];
+        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
+            sum += bornSum[i];
+
+        // Now calculate Born radius and OBC term.
+
+        float offsetRadius = params[index].x;
+        sum *= 0.5f*offsetRadius;
+        float sum2 = sum*sum;
+        float sum3 = sum*sum2;
+        float tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3);
+        float nonOffsetRadius = offsetRadius + dielectricOffset;
+        float radius = 1.0f/(1.0f/offsetRadius - tanhSum/nonOffsetRadius);
+        float chain = offsetRadius*(alpha - 2.0f*beta*sum + 3.0f*gamma*sum2);
+        chain = (1.0f-tanhSum*tanhSum)*chain / nonOffsetRadius;
+        bornRadii[index] = radius;
+        obcChain[index] = chain;
+        index += get_global_size(0);
+    }
+}
+
+/**
+ * Reduce the Born force.
+ */
+
+__kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce, __global float* energyBuffer,
+            __global float2* params, __global float* bornRadii, __global float* obcChain) {
+    float energy = 0.0f;
+    unsigned int index = get_global_id(0);
+    while (index < NUM_ATOMS) {
+        // Sum the Born force
+
+        int totalSize = bufferSize*numBuffers;
+        float force = bornForce[index];
+        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
+            force += bornForce[i];
+
+        // Now calculate the actual force
+
+        float offsetRadius = params[index].x;
+        float bornRadius = bornRadii[index];
+        float r = offsetRadius+dielectricOffset+probeRadius;
+        float ratio6 = pow((offsetRadius+dielectricOffset)/bornRadius, 6.0f);
+        float saTerm = surfaceAreaFactor*r*r*ratio6;
+        force += saTerm/bornRadius;
+        energy += saTerm;
+        force *= bornRadius*bornRadius*obcChain[index];
+        bornForce[index] = force;
+        index += get_global_size(0);
+    }
+    energyBuffer[get_global_id(0)] += energy/-6.0f;
+}
\ No newline at end of file
--- a/platforms/opencl/src/kernels/gbsaObc_default.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_default.cl
+#define TILE_SIZE 32
+
+/**
+ * Compute the Born sum.
+ */
+
+__kernel void computeBornSum(__global float* global_bornSum, __local float* local_bornSum, __global float4* posq,
+        __local float4* local_posq, __global float2* global_params, __local float2* local_params, __local float* tempBuffer, __global unsigned int* tiles,
+#ifdef USE_CUTOFF
+        __global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
+#else
+        unsigned int numTiles) {
+#endif
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+#endif
+    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
+    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
+    float energy = 0.0f;
+    unsigned int lasty = 0xFFFFFFFF;
+
+    while (pos < end) {
+        // Extract the coordinates of this tile
+        unsigned int x = tiles[pos];
+        unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
+        x = (x>>17)*TILE_SIZE;
+        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
+        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
+        unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
+        unsigned int atom1 = x + tgx;
+        float bornSum = 0.0f;
+        float4 posq1 = posq[atom1];
+        float2 params1 = global_params[atom1];
+        if (x == y) {
+            // This tile is on the diagonal.
+
+            local_posq[get_local_id(0)] = posq1;
+            local_params[get_local_id(0)] = params1;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            unsigned int xi = x/TILE_SIZE;
+            unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
+            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
+                float4 delta = (float4) (local_posq[baseLocalAtom+j].xyz - posq1.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
+                delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
+                delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
+#endif
+                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                if (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+#else
+                if (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS) {
+#endif
+                    float r = sqrt(r2);
+                    float invR = 1.0f/r;
+                    float2 params2 = local_params[baseLocalAtom+j];
+                    float rScaledRadiusJ = r+params2.y;
+                    if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
+                        float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
+                        float u_ij = 1.0f/rScaledRadiusJ;
+                        float l_ij2 = l_ij*l_ij;
+                        float u_ij2 = u_ij*u_ij;
+                        float ratio = log(u_ij / l_ij);
+                        bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
+                                         (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
+                        if (params1.x < params2.x-r)
+                            bornSum += 2.0f*(1.0f/params1.x-l_ij);
+                    }
+                }
+            }
+
+            // Sum the forces and write results.
+
+            if (get_local_id(0) >= TILE_SIZE)
+                tempBuffer[get_local_id(0)] = bornSum;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if (get_local_id(0) < TILE_SIZE) {
+#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
+                unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
+#else
+                unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+#endif
+                global_bornSum[offset] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE];
+            }
+        }
+        else {
+            // This is an off-diagonal tile.
+
+            if (lasty != y && get_local_id(0) < TILE_SIZE) {
+                unsigned int j = y + tgx;
+                local_posq[get_local_id(0)] = posq[j];
+                local_params[get_local_id(0)] = global_params[j];
+            }
+            local_bornSum[get_local_id(0)] = 0.0f;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            // Compute the full set of interactions in this tile.
+
+            unsigned int xi = x/TILE_SIZE;
+            unsigned int yi = y/TILE_SIZE;
+            unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
+            unsigned int tj = tgx%(TILE_SIZE/2);
+            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
+                float4 delta = (float4) (local_posq[baseLocalAtom+tj].xyz - posq1.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
+                delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
+                delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
+#endif
+                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                if (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+#else
+                if (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS) {
+#endif
+                    float r = sqrt(r2);
+                    float invR = 1.0f/r;
+                    float2 params2 = local_params[baseLocalAtom+tj];
+                    float rScaledRadiusJ = r+params2.y;
+                    if (params1.x < rScaledRadiusJ) {
+                        float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
+                        float u_ij = 1.0f/rScaledRadiusJ;
+                        float l_ij2 = l_ij*l_ij;
+                        float u_ij2 = u_ij*u_ij;
+                        float ratio = log(u_ij / l_ij);
+                        bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
+                                         (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
+                        if (params1.x < params2.x-r)
+                            bornSum += 2.0f*(1.0f/params1.x-l_ij);
+                    }
+                    float rScaledRadiusI = r+params1.y;
+                    if (params2.x < rScaledRadiusJ) {
+                        float l_ij = 1.0f/max(params2.x, fabs(r-params1.y));
+                        float u_ij = 1.0f/rScaledRadiusJ;
+                        float l_ij2 = l_ij*l_ij;
+                        float u_ij2 = u_ij*u_ij;
+                        float ratio = log(u_ij / l_ij);
+                        float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
+                                         (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
+                        if (params2.x < params1.x-r)
+                            term += 2.0f*(1.0f/params2.x-l_ij);
+                        local_bornSum[baseLocalAtom+tj+forceBufferOffset] += term;
+                    }
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+                tj = (tj+1)%(TILE_SIZE/2);
+            }
+
+            // Sum the forces and write results.
+
+            if (get_local_id(0) >= TILE_SIZE)
+                tempBuffer[get_local_id(0)] = bornSum;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if (get_local_id(0) < TILE_SIZE) {
+#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
+                unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
+                unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
+#else
+                unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+#endif
+                global_bornSum[offset1] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE];
+                global_bornSum[offset2] += local_bornSum[get_local_id(0)]+local_bornSum[get_local_id(0)+TILE_SIZE];
+            }
+            lasty = y;
+        }
+        pos++;
+    }
+}
+
+/**
+ * First part of computing the GBSA interaction.
+ */
+
+__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
+        __global float4* posq, __local float4* local_posq, __local float4* local_force, __global float* global_bornRadii, __local float* local_bornRadii,
+        __global float* global_bornForce, __local float* local_bornForce, __local float4* tempBuffer, __global unsigned int* tiles,
+#ifdef USE_CUTOFF
+        __global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
+#else
+        unsigned int numTiles) {
+#endif
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+#endif
+    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
+    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
+    float energy = 0.0f;
+    unsigned int lasty = 0xFFFFFFFF;
+
+    while (pos < end) {
+        // Extract the coordinates of this tile
+        unsigned int x = tiles[pos];
+        unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
+        x = (x>>17)*TILE_SIZE;
+        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
+        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
+        unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
+        unsigned int atom1 = x + tgx;
+        float4 force = 0.0f;
+        float4 posq1 = posq[atom1];
+        float bornRadius1 = global_bornRadii[atom1];
+        if (x == y) {
+            // This tile is on the diagonal.
+
+            local_posq[get_local_id(0)] = posq1;
+            local_bornRadii[get_local_id(0)] = bornRadius1;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            unsigned int xi = x/TILE_SIZE;
+            unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
+            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
+                if (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS) {
+                    float4 posq2 = local_posq[baseLocalAtom+j];
+                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                    delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
+                    delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
+                    delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
+#endif
+                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    float r = sqrt(r2);
+                    float invR = 1.0f/r;
+                    float bornRadius2 = local_bornRadii[baseLocalAtom+j];
+                    float alpha2_ij = bornRadius1*bornRadius2;
+                    float D_ij = r2/(4.0f*alpha2_ij);
+                    float expTerm = exp(-D_ij);
+                    float denominator2 = r2 + alpha2_ij*expTerm;
+                    float denominator = sqrt(denominator2);
+                    float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
+                    float Gpol = tempEnergy/denominator2;
+                    float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
+                    force.w += dGpol_dalpha2_ij*bornRadius2;
+                    float dEdR = Gpol*(1.0f - 0.25f*expTerm);
+#ifdef USE_CUTOFF
+                    if (r2 > CUTOFF_SQUARED) {
+                        dEdR = 0.0f;
+                        tempEnergy  = 0.0f;
+                    }
+#endif
+                    energy += 0.5f*tempEnergy;
+                    delta.xyz *= dEdR;
+                    force.xyz -= delta.xyz;
+                }
+            }
+
+            // Sum the forces and write results.
+
+            if (get_local_id(0) >= TILE_SIZE)
+                tempBuffer[get_local_id(0)] = force;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if (get_local_id(0) < TILE_SIZE) {
+#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
+                unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
+#else
+                unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+#endif
+                forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
+                global_bornForce[offset] += force.w+tempBuffer[get_local_id(0)+TILE_SIZE].w;
+            }
+        }
+        else {
+            // This is an off-diagonal tile.
+
+            if (lasty != y && get_local_id(0) < TILE_SIZE) {
+                unsigned int j = y + tgx;
+                local_posq[get_local_id(0)] = posq[j];
+                local_bornRadii[get_local_id(0)] = global_bornRadii[j];
+            }
+            local_force[get_local_id(0)] = 0.0f;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            // Compute the full set of interactions in this tile.
+
+            unsigned int xi = x/TILE_SIZE;
+            unsigned int yi = y/TILE_SIZE;
+            unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
+            unsigned int tj = tgx%(TILE_SIZE/2);
+            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
+                if (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS) {
+                    float4 posq2 = local_posq[baseLocalAtom+tj];
+                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                    delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
+                    delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
+                    delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
+#endif
+                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    float r = sqrt(r2);
+                    float invR = 1.0f/r;
+                    float bornRadius2 = local_bornRadii[baseLocalAtom+tj];
+                    float alpha2_ij = bornRadius1*bornRadius2;
+                    float D_ij = r2/(4.0f*alpha2_ij);
+                    float expTerm = exp(-D_ij);
+                    float denominator2 = r2 + alpha2_ij*expTerm;
+                    float denominator = sqrt(denominator2);
+                    float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
+                    float Gpol = tempEnergy/denominator2;
+                    float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
+                    force.w += dGpol_dalpha2_ij*bornRadius2;
+                    float dEdR = Gpol*(1.0f - 0.25f*expTerm);
+#ifdef USE_CUTOFF
+                    if (r2 > CUTOFF_SQUARED) {
+                        dEdR = 0.0f;
+                        tempEnergy  = 0.0f;
+                    }
+#endif
+                    energy += tempEnergy;
+                    delta.xyz *= dEdR;
+                    force.xyz -= delta.xyz;
+                    local_force[baseLocalAtom+tj+forceBufferOffset] += (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1);
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+                tj = (tj+1)%(TILE_SIZE/2);
+            }
+
+            // Sum the forces and write results.
+
+            if (get_local_id(0) >= TILE_SIZE)
+                tempBuffer[get_local_id(0)] = force;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if (get_local_id(0) < TILE_SIZE) {
+#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
+                unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
+                unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
+#else
+                unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+#endif
+                forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
+                forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz;
+                global_bornForce[offset1] += force.w+tempBuffer[get_local_id(0)+TILE_SIZE].w;
+                global_bornForce[offset2] += local_force[get_local_id(0)].w+local_force[get_local_id(0)+TILE_SIZE].w;
+            }
+            lasty = y;
+        }
+        pos++;
+    }
+    energyBuffer[get_global_id(0)] += energy;
+}
--- a/platforms/opencl/src/kernels/gbsaObc.cl
+++ b/platforms/opencl/src/kernels/gbsaObc.cl
-const unsigned int TileSize = 32;
-const float dielectricOffset = 0.009f;
-const float probeRadius = 0.14f;
-const float surfaceAreaFactor = -6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f;
+#define TILE_SIZE 32

 /**
 * Compute the Born sum.
 */

 __kernel void computeBornSum(__global float* global_bornSum, __local float* local_bornSum, __global float4* posq,
-        __local float4* local_posq, __global float2* global_params, __local float2* local_params, __global unsigned int* tiles,
+        __local float4* local_posq, __global float2* global_params, __local float2* local_params, __local float* tempBuffer, __global unsigned int* tiles,
 #ifdef USE_CUTOFF
-        __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float* tempBuffer) {
+        __global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
 #else
        unsigned int numTiles) {
 #endif
 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
 #endif
-    unsigned int totalWarps = get_global_size(0)/TileSize;
-    unsigned int warp = get_global_id(0)/TileSize;
+    unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
+    unsigned int warp = get_global_id(0)/TILE_SIZE;
    unsigned int pos = warp*numTiles/totalWarps;
    unsigned int end = (warp+1)*numTiles/totalWarps;
    float energy = 0.0f;
@@ -27,11 +24,10 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
    while (pos < end) {
        // Extract the coordinates of this tile
        unsigned int x = tiles[pos];
-        unsigned int y = ((x >> 2) & 0x7fff)*TileSize;
-        x = (x>>17)*TileSize;
-        unsigned int tgx = get_local_id(0) & (TileSize-1);
+        unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
+        x = (x>>17)*TILE_SIZE;
+        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
        unsigned int tbx = get_local_id(0) - tgx;
-        unsigned int tj = tgx;
        unsigned int atom1 = x + tgx;
        float bornSum = 0.0f;
        float4 posq1 = posq[atom1];
@@ -41,9 +37,9 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca

            local_posq[get_local_id(0)] = posq1;
            local_params[get_local_id(0)] = params1;
-            unsigned int xi = x/TileSize;
-            unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
-            for (unsigned int j = 0; j < TileSize; j++) {
+            unsigned int xi = x/TILE_SIZE;
+            unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
                delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
@@ -76,7 +72,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca

            // Write results
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-            unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
+            unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
 #else
            unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
 #endif
@@ -100,7 +96,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
                else {
                    // Compute only a subset of the interactions in this tile.

-                    for (unsigned int j = 0; j < TileSize; j++) {
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
                        if ((flags&(1<<j)) != 0) {
                            float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
@@ -166,10 +162,11 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
            {
                // Compute the full set of interactions in this tile.

-                unsigned int xi = x/TileSize;
-                unsigned int yi = y/TileSize;
-                unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
-                for (unsigned int j = 0; j < TileSize; j++) {
+                unsigned int xi = x/TILE_SIZE;
+                unsigned int yi = y/TILE_SIZE;
+                unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
+                unsigned int tj = tgx;
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
                    float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
                    delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
@@ -211,15 +208,15 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
                            local_bornSum[tbx+tj] += term;
                        }
                    }
-                    tj = (tj + 1) & (TileSize - 1);
+                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }

            // Write results
            float4 of;
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-            unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
-            unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
+            unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
+            unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
 #else
            unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
            unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
@@ -232,55 +229,23 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
    }
 }

-/**
- * Reduce the Born sums to compute the Born radii.
- */
-
-__kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma,
-            __global float* bornSum, __global float2* params, __global float* bornRadii, __global float* obcChain) {
-    unsigned int index = get_global_id(0);
-    while (index < NUM_ATOMS) {
-        // Get summed Born data
-
-        int totalSize = bufferSize*numBuffers;
-        float sum = bornSum[index];
-        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
-            sum += bornSum[i];
-
-        // Now calculate Born radius and OBC term.
-
-        float offsetRadius = params[index].x;
-        sum *= 0.5f*offsetRadius;
-        float sum2 = sum*sum;
-        float sum3 = sum*sum2;
-        float tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3);
-        float nonOffsetRadius = offsetRadius + dielectricOffset;
-        float radius = 1.0f/(1.0f/offsetRadius - tanhSum/nonOffsetRadius);
-        float chain = offsetRadius*(alpha - 2.0f*beta*sum + 3.0f*gamma*sum2);
-        chain = (1.0f-tanhSum*tanhSum)*chain / nonOffsetRadius;
-        bornRadii[index] = radius;
-        obcChain[index] = chain;
-        index += get_global_size(0);
-    }
-}
-
 /**
 * First part of computing the GBSA interaction.
 */

 __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
        __global float4* posq, __local float4* local_posq, __local float4* local_force, __global float* global_bornRadii, __local float* local_bornRadii,
-        __global float* global_bornForce, __local float* local_bornForce, __global unsigned int* tiles,
+        __global float* global_bornForce, __local float* local_bornForce, __local float4* tempBuffer, __global unsigned int* tiles,
 #ifdef USE_CUTOFF
-        __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float4* tempBuffer) {
+        __global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
 #else
        unsigned int numTiles) {
 #endif
 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
 #endif
-    unsigned int totalWarps = get_global_size(0)/TileSize;
-    unsigned int warp = get_global_id(0)/TileSize;
+    unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
+    unsigned int warp = get_global_id(0)/TILE_SIZE;
    unsigned int pos = warp*numTiles/totalWarps;
    unsigned int end = (warp+1)*numTiles/totalWarps;
    float energy = 0.0f;
@@ -289,11 +254,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
    while (pos < end) {
        // Extract the coordinates of this tile
        unsigned int x = tiles[pos];
-        unsigned int y = ((x >> 2) & 0x7fff)*TileSize;
-        x = (x>>17)*TileSize;
-        unsigned int tgx = get_local_id(0) & (TileSize-1);
+        unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
+        x = (x>>17)*TILE_SIZE;
+        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
        unsigned int tbx = get_local_id(0) - tgx;
-        unsigned int tj = tgx;
        unsigned int atom1 = x + tgx;
        float4 force = 0.0f;
        float4 posq1 = posq[atom1];
@@ -303,11 +267,12 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e

            local_posq[get_local_id(0)] = posq1;
            local_bornRadii[get_local_id(0)] = bornRadius1;
-            unsigned int xi = x/TileSize;
-            unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
-            for (unsigned int j = 0; j < TileSize; j++) {
+            unsigned int xi = x/TILE_SIZE;
+            unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
-                    float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
+                    float4 posq2 = local_posq[tbx+j];
+                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
                    delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
                    delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
@@ -316,7 +281,6 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    float r = sqrt(r2);
                    float invR = 1.0f/r;
-                    float4 posq2 = local_posq[tbx+j];
                    float bornRadius2 = local_bornRadii[tbx+j];
                    float alpha2_ij = bornRadius1*bornRadius2;
                    float D_ij = r2/(4.0f*alpha2_ij);
@@ -342,7 +306,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e

            // Write results
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-            unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
+            unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
 #else
            unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
 #endif
@@ -367,9 +331,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
                else {
                    // Compute only a subset of the interactions in this tile.

-                    for (unsigned int j = 0; j < TileSize; j++) {
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
                        if ((flags&(1<<j)) != 0) {
-                            float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
+                            float4 posq2 = local_posq[tbx+j];
+                            float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
                            delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
                            delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
@@ -377,8 +342,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
 #endif
                            float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                            float r = sqrt(r2);
-                            float invR = 1.0f / r;
-                            float4 posq2 = local_posq[tbx+j];
+                            float invR = 1.0f/r;
                            float bornRadius2 = local_bornRadii[tbx+j];
                            float alpha2_ij = bornRadius1*bornRadius2;
                            float D_ij = r2/(4.0f*alpha2_ij);
@@ -424,12 +388,14 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
            {
                // Compute the full set of interactions in this tile.

-                unsigned int xi = x/TileSize;
-                unsigned int yi = y/TileSize;
-                unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
-                for (unsigned int j = 0; j < TileSize; j++) {
+                unsigned int xi = x/TILE_SIZE;
+                unsigned int yi = y/TILE_SIZE;
+                unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
+                unsigned int tj = tgx;
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
                    if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS) {
-                        float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f);
+                        float4 posq2 = local_posq[tbx+tj];
+                        float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
                        delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
                        delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
@@ -437,8 +403,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
 #endif
                        float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        float r = sqrt(r2);
-                        float invR = 1.0f / r;
-                        float4 posq2 = local_posq[tbx+tj];
+                        float invR = 1.0f/r;
                        float bornRadius2 = local_bornRadii[tbx+tj];
                        float alpha2_ij = bornRadius1*bornRadius2;
                        float D_ij = r2/(4.0f*alpha2_ij);
@@ -461,14 +426,14 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
                        force.xyz -= delta.xyz;
                        local_force[tbx+tj] += (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1);
                    }
-                    tj = (tj + 1) & (TileSize - 1);
+                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }

            // Write results
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-            unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
-            unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
+            unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
+            unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
 #else
            unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
            unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
@@ -483,35 +448,3 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
    }
    energyBuffer[get_global_id(0)] += energy;
 }
-
-/**
- * Reduce the Born force.
- */
-
-__kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce, __global float* energyBuffer,
-            __global float2* params, __global float* bornRadii, __global float* obcChain) {
-    float energy = 0.0f;
-    unsigned int index = get_global_id(0);
-    while (index < NUM_ATOMS) {
-        // Sum the Born force
-
-        int totalSize = bufferSize*numBuffers;
-        float force = bornForce[index];
-        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
-            force += bornForce[i];
-
-        // Now calculate the actual force
-
-        float offsetRadius = params[index].x;
-        float bornRadius = bornRadii[index];
-        float r = offsetRadius+dielectricOffset+probeRadius;
-        float ratio6 = pow((offsetRadius+dielectricOffset)/bornRadius, 6.0f);
-        float saTerm = surfaceAreaFactor*r*r*ratio6;
-        force += saTerm/bornRadius;
-        energy += saTerm;
-        force *= bornRadius*bornRadius*obcChain[index];
-        bornForce[index] = force;
-        index += get_global_size(0);
-    }
-    energyBuffer[get_global_id(0)] += energy/-6.0f;
-}
--- a/platforms/opencl/src/kernels/nonbonded_nvidia.cl
+++ b/platforms/opencl/src/kernels/nonbonded_nvidia.cl
-const unsigned int TileSize = 32;
+#define TILE_SIZE 32

 /**
 * Compute nonbonded interactions.
@@ -15,8 +15,8 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
 #endif
-    unsigned int totalWarps = get_global_size(0)/TileSize;
-    unsigned int warp = get_global_id(0)/TileSize;
+    unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
+    unsigned int warp = get_global_id(0)/TILE_SIZE;
    unsigned int pos = warp*numTiles/totalWarps;
    unsigned int end = (warp+1)*numTiles/totalWarps;
    float energy = 0.0f;
@@ -25,10 +25,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
    while (pos < end) {
        // Extract the coordinates of this tile
        unsigned int x = tiles[pos];
-        unsigned int y = ((x >> 2) & 0x7fff)*TileSize;
+        unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
        bool hasExclusions = (x & 0x1);
-        x = (x>>17)*TileSize;
-        unsigned int tgx = get_local_id(0) & (TileSize-1);
+        x = (x>>17)*TILE_SIZE;
+        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
        unsigned int tbx = get_local_id(0) - tgx;
        unsigned int atom1 = x + tgx;
        float4 force = 0.0f;
@@ -39,12 +39,12 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en

            local_posq[get_local_id(0)] = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
-            unsigned int xi = x/TileSize;
-            unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
+            unsigned int xi = x/TILE_SIZE;
+            unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
 #ifdef USE_EXCLUSIONS
            unsigned int excl = exclusions[exclusionIndices[tile]+tgx];
 #endif
-            for (unsigned int j = 0; j < TileSize; j++) {
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
 #ifdef USE_EXCLUSIONS
                bool isExcluded = !(excl & 0x1);
 #endif
@@ -72,7 +72,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en

            // Write results
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-            unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
+            unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
 #else
            unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
 #endif
@@ -96,7 +96,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                else {
                    // Compute only a subset of the interactions in this tile.

-                    for (unsigned int j = 0; j < TileSize; j++) {
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
                        if ((flags&(1<<j)) != 0) {
                            bool isExcluded = false;
                            int atom2 = tbx+j;
@@ -141,15 +141,15 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
            {
                // Compute the full set of interactions in this tile.

-                unsigned int xi = x/TileSize;
-                unsigned int yi = y/TileSize;
-                unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
+                unsigned int xi = x/TILE_SIZE;
+                unsigned int yi = y/TILE_SIZE;
+                unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
 #ifdef USE_EXCLUSIONS
                unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
-                excl = (excl >> tgx) | (excl << (TileSize - tgx));
+                excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
 #endif
                unsigned int tj = tgx;
-                for (unsigned int j = 0; j < TileSize; j++) {
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
 #ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
 #endif
@@ -174,14 +174,14 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                    force.xyz -= delta.xyz;
                    local_force[tbx+tj].xyz += delta.xyz;
                    excl >>= 1;
-                    tj = (tj + 1) & (TileSize - 1);
+                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }

            // Write results
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
-            unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
-            unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
+            unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
+            unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
 #else
            unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
            unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;