Created CustomGBForce kernels optimized for CPU. Fixed bugs in CPU GBSA kernels.

562cfb39 · Peter Eastman · cbfff447 · 562cfb39 · 562cfb39 · 562cfb39
Commit 562cfb39 authored Nov 17, 2010 by Peter Eastman
8 changed files
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1715,7 +1715,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        computeBornSumKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
        computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer());
        computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*13*sizeof(cl_float), NULL);
-        computeBornSumKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
+        computeBornSumKernel.setArg(index++, (deviceIsCpu ? 1 : OpenCLContext::ThreadBlockSize)*sizeof(cl_float), NULL);
        if (nb.getUseCutoff()) {
            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
@@ -1734,7 +1734,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        force1Kernel.setArg<cl::Buffer>(index++, bornRadii->getDeviceBuffer());
        force1Kernel.setArg<cl::Buffer>(index++, bornForce->getDeviceBuffer());
        force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*13*sizeof(cl_float), NULL);
-        force1Kernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
+        force1Kernel.setArg(index++, (deviceIsCpu ? 1 : OpenCLContext::ThreadBlockSize)*sizeof(mm_float4), NULL);
        if (nb.getUseCutoff()) {
            force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
@@ -1954,6 +1954,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo

    bool useCutoff = (force.getNonbondedMethod() != CustomGBForce::NoCutoff);
    bool usePeriodic = (force.getNonbondedMethod() != CustomGBForce::NoCutoff && force.getNonbondedMethod() != CustomGBForce::CutoffNonPeriodic);
+    bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
    {
        // Create the N2 value kernel.

@@ -1987,8 +1988,8 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
            const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
            string paramName = "params"+intToString(i+1);
            extraArgs << ", __global " << buffer.getType() << "* global_" << paramName << ", __local " << buffer.getType() << "* local_" << paramName;
-            loadLocal1 << "local_" << paramName << "[get_local_id(0)] = " << paramName << "1;\n";
-            loadLocal2 << "local_" << paramName << "[get_local_id(0)] = global_" << paramName << "[j];\n";
+            loadLocal1 << "local_" << paramName << "[localAtomIndex] = " << paramName << "1;\n";
+            loadLocal2 << "local_" << paramName << "[localAtomIndex] = global_" << paramName << "[j];\n";
            load1 << buffer.getType() << " " << paramName << "1 = global_" << paramName << "[atom1];\n";
            load2 << buffer.getType() << " " << paramName << "2 = local_" << paramName << "[atom2];\n";
        }
@@ -2010,7 +2011,13 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
        defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
        defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
        defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
-        string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::customGBValueN2_nvidia : OpenCLKernelSources::customGBValueN2_default);
+        string file;
+        if (deviceIsCpu)
+            file = OpenCLKernelSources::customGBValueN2_cpu;
+        else if (cl.getSIMDWidth() == 32)
+            file = OpenCLKernelSources::customGBValueN2_nvidia;
+        else
+            OpenCLKernelSources::customGBValueN2_default;
        cl::Program program = cl.createProgram(cl.replaceStrings(file, replacements), defines);
        pairValueKernel = cl::Kernel(program, "computeN2Value");
    }
@@ -2106,8 +2113,8 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
            const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
            string paramName = "params"+intToString(i+1);
            extraArgs << ", __global " << buffer.getType() << "* global_" << paramName << ", __local " << buffer.getType() << "* local_" << paramName;
-            loadLocal1 << "local_" << paramName << "[get_local_id(0)] = " << paramName << "1;\n";
-            loadLocal2 << "local_" << paramName << "[get_local_id(0)] = global_" << paramName << "[j];\n";
+            loadLocal1 << "local_" << paramName << "[localAtomIndex] = " << paramName << "1;\n";
+            loadLocal2 << "local_" << paramName << "[localAtomIndex] = global_" << paramName << "[j];\n";
            load1 << buffer.getType() << " " << paramName << "1 = global_" << paramName << "[atom1];\n";
            load2 << buffer.getType() << " " << paramName << "2 = local_" << paramName << "[atom2];\n";
        }
@@ -2115,8 +2122,8 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
            const OpenCLNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
            string valueName = "values"+intToString(i+1);
            extraArgs << ", __global " << buffer.getType() << "* global_" << valueName << ", __local " << buffer.getType() << "* local_" << valueName;
-            loadLocal1 << "local_" << valueName << "[get_local_id(0)] = " << valueName << "1;\n";
-            loadLocal2 << "local_" << valueName << "[get_local_id(0)] = global_" << valueName << "[j];\n";
+            loadLocal1 << "local_" << valueName << "[localAtomIndex] = " << valueName << "1;\n";
+            loadLocal2 << "local_" << valueName << "[localAtomIndex] = global_" << valueName << "[j];\n";
            load1 << buffer.getType() << " " << valueName << "1 = global_" << valueName << "[atom1];\n";
            load2 << buffer.getType() << " " << valueName << "2 = local_" << valueName << "[atom2];\n";
        }
@@ -2124,7 +2131,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
            const OpenCLNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
            string index = intToString(i+1);
            extraArgs << ", __global " << buffer.getType() << "* derivBuffers" << index << ", __local " << buffer.getType() << "* local_deriv" << index;
-            clearLocal << "local_deriv" << index << "[get_local_id(0)] = 0.0f;\n";
+            clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
            load1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
            load2 << buffer.getType() << " deriv" << index << "_2 = 0.0f;\n";
            recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
@@ -2157,7 +2164,13 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
        defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
        defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
        defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
-        string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::customGBEnergyN2_nvidia : OpenCLKernelSources::customGBEnergyN2_default);
+        string file;
+        if (deviceIsCpu)
+            file = OpenCLKernelSources::customGBEnergyN2_cpu;
+        else if (cl.getSIMDWidth() == 32)
+            file = OpenCLKernelSources::customGBEnergyN2_nvidia;
+        else
+            file = OpenCLKernelSources::customGBEnergyN2_default;
        cl::Program program = cl.createProgram(cl.replaceStrings(file, replacements), defines);
        pairEnergyKernel = cl::Kernel(program, "computeN2Energy");
    }
@@ -2408,6 +2421,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
 }

 double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
    OpenCLNonbondedUtilities& nb = cl.getNonbondedUtilities();
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;
@@ -2422,14 +2436,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
        pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
        pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
        pairValueKernel.setArg<cl::Buffer>(index++, valueBuffers->getDeviceBuffer());
-        pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
-        pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
+        pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*sizeof(cl_float), NULL);
+        pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*sizeof(cl_float), NULL);
        if (nb.getUseCutoff()) {
            pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
            index += 2; // Periodic box size arguments are set when the kernel is executed.
            pairValueKernel.setArg<cl_uint>(index++, maxTiles);
-            if (cl.getSIMDWidth() == 32)
+            if (cl.getSIMDWidth() == 32 || deviceIsCpu)
                pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
        }
        else
@@ -2465,19 +2479,19 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
        index = 0;
        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getForceBuffers().getDeviceBuffer());
        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
-        pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
+        pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*sizeof(cl_float4), NULL);
        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
-        pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
+        pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*sizeof(cl_float4), NULL);
        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
        pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
-        pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
+        pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*sizeof(cl_float4), NULL);
        if (nb.getUseCutoff()) {
            pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
            index += 2; // Periodic box size arguments are set when the kernel is executed.
            pairEnergyKernel.setArg<cl_uint>(index++, maxTiles);
-            if (cl.getSIMDWidth() == 32)
+            if (cl.getSIMDWidth() == 32 || deviceIsCpu)
                pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
        }
        else
@@ -2487,17 +2501,17 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
            const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
            pairEnergyKernel.setArg<cl::Memory>(index++, buffer.getMemory());
-            pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*buffer.getSize(), NULL);
+            pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*buffer.getSize(), NULL);
        }
        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
            const OpenCLNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
            pairEnergyKernel.setArg<cl::Memory>(index++, buffer.getMemory());
-            pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*buffer.getSize(), NULL);
+            pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*buffer.getSize(), NULL);
        }
        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
            const OpenCLNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
            pairEnergyKernel.setArg<cl::Memory>(index++, buffer.getMemory());
-            pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*buffer.getSize(), NULL);
+            pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*buffer.getSize(), NULL);
        }
        if (tabulatedFunctionParams != NULL) {
            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
@@ -2560,9 +2574,9 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
        }
    }
    int numTiles = cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2;
-    cl.executeKernel(pairValueKernel, numTiles*OpenCLContext::TileSize);
+    cl.executeKernel(pairValueKernel, numTiles*OpenCLContext::TileSize, deviceIsCpu ? 1 : -1);
    cl.executeKernel(perParticleValueKernel, cl.getPaddedNumAtoms());
-    cl.executeKernel(pairEnergyKernel, numTiles*OpenCLContext::TileSize);
+    cl.executeKernel(pairEnergyKernel, numTiles*OpenCLContext::TileSize, deviceIsCpu ? 1 : -1);
    cl.executeKernel(perParticleEnergyKernel, cl.getPaddedNumAtoms());
    if (needParameterGradient)
        cl.executeKernel(gradientChainRuleKernel, cl.getPaddedNumAtoms());

--- a/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+#define TILE_SIZE 32
+#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1;
+#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[tgx];
+
+/**
+ * Compute a force based on pair interactions.
+ */
+
+__kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
+	__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
+        __global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
+#ifdef USE_CUTOFF
+        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
+#else
+        unsigned int numTiles
+#endif
+        PARAMETER_ARGUMENTS) {
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
+    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
+#else
+    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
+    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
+#endif
+    float energy = 0.0f;
+    unsigned int lasty = 0xFFFFFFFF;
+
+    while (pos < end) {
+        // Extract the coordinates of this tile
+        unsigned int x, y;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            y = tileIndices.y;
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y++;
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+        }
+
+        // Locate the exclusion data for this tile.
+
+#ifdef USE_EXCLUSIONS
+        unsigned int exclusionStart = exclusionRowIndices[x];
+        unsigned int exclusionEnd = exclusionRowIndices[x+1];
+        int exclusionIndex = -1;
+        for (int i = exclusionStart; i < exclusionEnd; i++)
+            if (exclusionIndices[i] == y) {
+                exclusionIndex = i*TILE_SIZE;
+                break;
+            }
+        bool hasExclusions = (exclusionIndex > -1);
+#else
+        bool hasExclusions = false;
+#endif
+
+        // Load the data for this tile if we don't already have it cached.
+
+        if (lasty != y) {
+            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
+                unsigned int j = y*TILE_SIZE + localAtomIndex;
+                local_posq[localAtomIndex] = posq[j];
+                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+            }
+        }
+        if (x == y) {
+            // This tile is on the diagonal.
+
+            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+#ifdef USE_EXCLUSIONS
+                unsigned int excl = exclusions[exclusionIndex+tgx];
+#endif
+                unsigned int atom1 = x*TILE_SIZE+tgx;
+                float4 force = 0.0f;
+                float4 posq1 = posq[atom1];
+                LOAD_ATOM1_PARAMETERS
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(excl & 0x1);
+#endif
+                    float4 posq2 = local_posq[j];
+                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                    float r2 = dot(delta.xyz, delta.xyz);
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                    float r = SQRT(r2);
+                    unsigned int atom2 = j;
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+j;
+                    float dEdR = 0.0f;
+                    float tempEnergy = 0.0f;
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+                        COMPUTE_INTERACTION
+                        dEdR /= -r;
+                    }
+                    energy += 0.5f*tempEnergy;
+                    delta.xyz *= dEdR;
+                    force.xyz -= delta.xyz;
+#ifdef USE_CUTOFF
+                    }
+#endif
+#ifdef USE_EXCLUSIONS
+                    excl >>= 1;
+#endif
+                }
+
+                // Write results
+
+                unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                forceBuffers[offset1].xyz += force.xyz;
+                STORE_DERIVATIVES_1
+            }
+        }
+        else {
+            // This is an off-diagonal tile.
+
+            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
+                local_force[localAtomIndex] = 0.0f;
+                CLEAR_LOCAL_DERIVATIVES
+            }
+#if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS)
+            unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
+            unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF);
+            if (!hasExclusions && (flags1 != 0xFFFFFFFF || flags2 != 0xFFFFFFFF)) {
+                // Compute only a subset of the interactions in this tile.
+
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                    if ((flags2&(1<<tgx)) != 0) {
+                        unsigned int atom1 = x*TILE_SIZE+tgx;
+                        float value = 0.0f;
+                        float4 posq1 = posq[atom1];
+                        LOAD_ATOM1_PARAMETERS
+                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                            if ((flags&(1<<j)) != 0) {
+                                float4 posq2 = local_posq[j];
+                                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                                float r2 = dot(delta.xyz, delta.xyz);
+                                if (r2 < CUTOFF_SQUARED) {
+                                    float r = SQRT(r2);
+                                    unsigned int atom2 = j;
+                                    LOAD_ATOM2_PARAMETERS
+                                    atom2 = y*TILE_SIZE+j;
+                                    float dEdR = 0.0f;
+                                    float tempEnergy = 0.0f;
+                                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                                        COMPUTE_INTERACTION
+                                        dEdR /= -r;
+                                    }
+                                    energy += tempEnergy;
+                                    delta.xyz *= dEdR;
+                                    force.xyz -= delta.xyz;
+                                    atom2 = j;
+                                    local_force[atom2].xyz += delta.xyz;
+                                    RECORD_DERIVATIVE_2
+                                }
+                            }
+                        }
+
+                        // Write results for atom1.
+
+                        unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                        global_value[offset] += value;
+                    }
+                }
+            }
+            else
+#endif
+            {
+                // Compute the full set of interactions in this tile.
+
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                    unsigned int atom1 = x*TILE_SIZE+tgx;
+                    float4 force = 0.0f;
+                    float4 posq1 = posq[atom1];
+                    LOAD_ATOM1_PARAMETERS
+#ifdef USE_EXCLUSIONS
+                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
+#endif
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = !(excl & 0x1);
+#endif
+                        float4 posq2 = local_posq[j];
+                        float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                        delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                        float r2 = dot(delta.xyz, delta.xyz);
+#ifdef USE_CUTOFF
+                        if (r2 < CUTOFF_SQUARED) {
+#endif
+                        float r = SQRT(r2);
+                        unsigned int atom2 = j;
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = y*TILE_SIZE+j;
+                        float dEdR = 0.0f;
+                        float tempEnergy = 0.0f;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_INTERACTION
+                            dEdR /= -r;
+                        }
+                        energy += tempEnergy;
+                        delta.xyz *= dEdR;
+                        force.xyz -= delta.xyz;
+                        atom2 = j;
+                        local_force[atom2].xyz += delta.xyz;
+                        RECORD_DERIVATIVE_2
+#ifdef USE_CUTOFF
+                        }
+#endif
+#ifdef USE_EXCLUSIONS
+                        excl >>= 1;
+#endif
+                    }
+
+                    // Write results for atom1.
+
+                    unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    forceBuffers[offset1].xyz += force.xyz;
+                    STORE_DERIVATIVES_1
+                }
+            }
+
+            // Write results
+
+            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                unsigned int offset2 = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                forceBuffers[offset2].xyz += local_force[tgx].xyz;
+                STORE_DERIVATIVES_2
+            }
+        }
+        lasty = y;
+        pos++;
+    }
+    energyBuffer[get_global_id(0)] += energy;
+}
--- a/platforms/opencl/src/kernels/customGBEnergyN2_default.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_default.cl
@@ -74,7 +74,8 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
        if (x == y) {
            // This tile is on the diagonal.

-            local_posq[get_local_id(0)] = posq1;
+            const unsigned int localAtomIndex = get_local_id(0);
+            local_posq[localAtomIndex] = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
            barrier(CLK_LOCAL_MEM_FENCE);
 #ifdef USE_EXCLUSIONS
@@ -136,12 +137,13 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
        else {
            // This is an off-diagonal tile.

+            const unsigned int localAtomIndex = get_local_id(0);
            if (lasty != y && get_local_id(0) < TILE_SIZE) {
                unsigned int j = y*TILE_SIZE + tgx;
-                local_posq[get_local_id(0)] = posq[j];
+                local_posq[localAtomIndex] = posq[j];
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            }
-            local_force[get_local_id(0)] = 0.0f;
+            local_force[localAtomIndex] = 0.0f;
            CLEAR_LOCAL_DERIVATIVES
            barrier(CLK_LOCAL_MEM_FENCE);


--- a/platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
@@ -75,7 +75,8 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
        if (x == y) {
            // This tile is on the diagonal.

-            local_posq[get_local_id(0)] = posq1;
+            const unsigned int localAtomIndex = get_local_id(0);
+            local_posq[localAtomIndex] = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
 #ifdef USE_EXCLUSIONS
            unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
@@ -128,12 +129,13 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
        else {
            // This is an off-diagonal tile.

+            const unsigned int localAtomIndex = get_local_id(0);
            if (lasty != y) {
                unsigned int j = y*TILE_SIZE + tgx;
-                local_posq[get_local_id(0)] = posq[j];
+                local_posq[localAtomIndex] = posq[j];
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            }
-            local_force[get_local_id(0)] = 0.0f;
+            local_force[localAtomIndex] = 0.0f;
            CLEAR_LOCAL_DERIVATIVES
 #ifdef USE_CUTOFF
            unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);

--- a/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+#define TILE_SIZE 32
+
+/**
+ * Compute a value based on pair interactions.
+ */
+
+__kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
+        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
+        __local float* tempBuffer,
+#ifdef USE_CUTOFF
+        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
+#else
+        unsigned int numTiles
+#endif
+        PARAMETER_ARGUMENTS) {
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
+    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
+#else
+    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
+    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
+#endif
+    unsigned int lasty = 0xFFFFFFFF;
+
+    while (pos < end) {
+        // Extract the coordinates of this tile
+        unsigned int x, y;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            y = tileIndices.y;
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y++;
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+        }
+
+        // Locate the exclusion data for this tile.
+
+#ifdef USE_EXCLUSIONS
+        unsigned int exclusionStart = exclusionRowIndices[x];
+        unsigned int exclusionEnd = exclusionRowIndices[x+1];
+        int exclusionIndex = -1;
+        for (int i = exclusionStart; i < exclusionEnd; i++)
+            if (exclusionIndices[i] == y) {
+                exclusionIndex = i*TILE_SIZE;
+                break;
+            }
+        bool hasExclusions = (exclusionIndex > -1);
+#else
+        bool hasExclusions = false;
+#endif
+
+        // Load the data for this tile if we don't already have it cached.
+
+        if (lasty != y) {
+            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
+                unsigned int j = y*TILE_SIZE + localAtomIndex;
+                local_posq[localAtomIndex] = posq[j];
+                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+            }
+        }
+        if (x == y) {
+            // This tile is on the diagonal.
+
+            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+#ifdef USE_EXCLUSIONS
+                unsigned int excl = exclusions[exclusionIndex+tgx];
+#endif
+                unsigned int atom1 = x*TILE_SIZE+tgx;
+                float value = 0.0f;
+                float4 posq1 = posq[atom1];
+                LOAD_ATOM1_PARAMETERS
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(excl & 0x1);
+#endif
+                    float4 posq2 = local_posq[j];
+                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                    float r2 = dot(delta.xyz, delta.xyz);
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                    float r = SQRT(r2);
+                    unsigned int atom2 = j;
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+j;
+                    float tempValue1 = 0.0f;
+                    float tempValue2 = 0.0f;
+#ifdef USE_EXCLUSIONS
+                    if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+#else
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+#endif
+                        COMPUTE_VALUE
+                    }
+                    value += tempValue1;
+#ifdef USE_CUTOFF
+                    }
+#endif
+#ifdef USE_EXCLUSIONS
+                    excl >>= 1;
+#endif
+                }
+
+                // Write results
+
+                unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                global_value[offset] += value;
+            }
+        }
+        else {
+            // This is an off-diagonal tile.
+
+            for (int tgx = 0; tgx < TILE_SIZE; tgx++)
+                local_value[tgx] = 0.0f;
+#if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS)
+            unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
+            unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF);
+            if (!hasExclusions && (flags1 != 0xFFFFFFFF || flags2 != 0xFFFFFFFF)) {
+                // Compute only a subset of the interactions in this tile.
+
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                    if ((flags2&(1<<tgx)) != 0) {
+                        unsigned int atom1 = x*TILE_SIZE+tgx;
+                        float value = 0.0f;
+                        float4 posq1 = posq[atom1];
+                        LOAD_ATOM1_PARAMETERS
+                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                            if ((flags&(1<<j)) != 0) {
+                                float4 posq2 = local_posq[j];
+                                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                                float r2 = dot(delta.xyz, delta.xyz);
+                                float tempValue1 = 0.0f;
+                                float tempValue2 = 0.0f;
+                                if (r2 < CUTOFF_SQUARED) {
+                                    float r = SQRT(r2);
+                                    unsigned int atom2 = j;
+                                    LOAD_ATOM2_PARAMETERS
+                                    atom2 = y*TILE_SIZE+j;
+                                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                                        COMPUTE_VALUE
+                                    }
+                                    value += tempValue1;
+                                    local_value[j] += tempValue2;
+                                }
+                            }
+                        }
+
+                        // Write results for atom1.
+
+                        unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                        global_value[offset] += value;
+                    }
+                }
+            }
+            else
+#endif
+            {
+                // Compute the full set of interactions in this tile.
+
+                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                    unsigned int atom1 = x*TILE_SIZE+tgx;
+                    float value = 0.0f;
+                    float4 posq1 = posq[atom1];
+                    LOAD_ATOM1_PARAMETERS
+#ifdef USE_EXCLUSIONS
+                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
+#endif
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = !(excl & 0x1);
+#endif
+                        float4 posq2 = local_posq[j];
+                        float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                        delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+#endif
+                        float r2 = dot(delta.xyz, delta.xyz);
+#ifdef USE_CUTOFF
+                        if (r2 < CUTOFF_SQUARED) {
+#endif
+                        float r = SQRT(r2);
+                        unsigned int atom2 = j;
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = y*TILE_SIZE+j;
+                        float tempValue1 = 0.0f;
+                        float tempValue2 = 0.0f;
+#ifdef USE_EXCLUSIONS
+                        if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+#else
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+#endif
+                            COMPUTE_VALUE
+                        }
+                        value += tempValue1;
+                        local_value[j] += tempValue2;
+#ifdef USE_CUTOFF
+                        }
+#endif
+#ifdef USE_EXCLUSIONS
+                        excl >>= 1;
+#endif
+                    }
+
+                    // Write results for atom1.
+
+                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    global_value[offset] += value;
+                }
+            }
+
+            // Write results
+
+            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
+                unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                global_value[offset] += local_value[tgx];
+            }
+        }
+        lasty = y;
+        pos++;
+    }
+}
--- a/platforms/opencl/src/kernels/customGBValueN2_default.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_default.cl
@@ -71,7 +71,8 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
        if (x == y) {
            // This tile is on the diagonal.

-            local_posq[get_local_id(0)] = posq1;
+            const unsigned int localAtomIndex = get_local_id(0);
+            local_posq[localAtomIndex] = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
            barrier(CLK_LOCAL_MEM_FENCE);
 #ifdef USE_EXCLUSIONS
@@ -134,6 +135,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
            if (lasty != y && get_local_id(0) < TILE_SIZE) {
                unsigned int j = y*TILE_SIZE + tgx;
                local_posq[get_local_id(0)] = posq[j];
+                const unsigned int localAtomIndex = get_local_id(0);
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            }
            local_value[get_local_id(0)] = 0.0f;

--- a/platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
@@ -73,7 +73,8 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
        if (x == y) {
            // This tile is on the diagonal.

-            local_posq[get_local_id(0)] = posq1;
+            const unsigned int localAtomIndex = get_local_id(0);
+            local_posq[localAtomIndex] = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
 #ifdef USE_EXCLUSIONS
            unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
@@ -129,6 +130,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
            if (lasty != y) {
                unsigned int j = y*TILE_SIZE + tgx;
                local_posq[get_local_id(0)] = posq[j];
+                const unsigned int localAtomIndex = get_local_id(0);
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            }
            local_value[get_local_id(0)] = 0.0f;

--- a/platforms/opencl/src/kernels/gbsaObc_cpu.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_cpu.cl
@@ -14,8 +14,7 @@ typedef struct {
 * Compute the Born sum.
 */

-__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
-void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer,
+__kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer,
 #ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
 #else
@@ -171,8 +170,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
               // Write results for atom1.

                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                global_bornSum[offset] += localData[tgx].bornSum;
-            }
+                global_bornSum[offset] += bornSum;
            }

            // Write results
@@ -181,6 +179,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
                unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                global_bornSum[offset] += localData[tgx].bornSum;
            }
+        }
        lasty = y;
        pos++;
    }
@@ -190,8 +189,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
 * First part of computing the GBSA interaction.
 */

-__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
-void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
+__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
        __global float4* posq, __global float* global_bornRadii,
        __global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer,
 #ifdef USE_CUTOFF
@@ -344,7 +342,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff

                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
-            }
+                global_bornForce[offset] += force.w;
            }

            // Write results
@@ -358,6 +356,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
                forceBuffers[offset] = f;
                global_bornForce[offset] += localData[tgx].fw;
            }
+        }
        lasty = y;
        pos++;
    }