Tony's optimizations to reduce local memory use

fed50628 · Peter Eastman · 2dd09317 · fed50628 · fed50628 · fed50628
Commit fed50628 authored Jan 27, 2012 by Peter Eastman
8 changed files
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1563,7 +1563,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        computeBornSumKernel.setArg<cl::Buffer>(index++, (useLong ? longBornSum->getDeviceBuffer() : bornSum->getDeviceBuffer()));
        computeBornSumKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
        computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer());
-        computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*7*sizeof(cl_float), NULL);
        if (nb.getUseCutoff()) {
            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
@@ -1585,7 +1584,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        force1Kernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
        force1Kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
        force1Kernel.setArg<cl::Buffer>(index++, bornRadii->getDeviceBuffer());
-        force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*9*sizeof(cl_float), NULL);
        if (nb.getUseCutoff()) {
            force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
@@ -1624,19 +1622,19 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        reduceBornForceKernel.setArg<cl::Buffer>(index++, obcChain->getDeviceBuffer());
    }
    if (nb.getUseCutoff()) {
-        computeBornSumKernel.setArg<mm_float4>(6, cl.getPeriodicBoxSize());
-        computeBornSumKernel.setArg<mm_float4>(7, cl.getInvPeriodicBoxSize());
-        force1Kernel.setArg<mm_float4>(8, cl.getPeriodicBoxSize());
-        force1Kernel.setArg<mm_float4>(9, cl.getInvPeriodicBoxSize());
+        computeBornSumKernel.setArg<mm_float4>(5, cl.getPeriodicBoxSize());
+        computeBornSumKernel.setArg<mm_float4>(6, cl.getInvPeriodicBoxSize());
+        force1Kernel.setArg<mm_float4>(7, cl.getPeriodicBoxSize());
+        force1Kernel.setArg<mm_float4>(8, cl.getInvPeriodicBoxSize());
        if (maxTiles < nb.getInteractingTiles().getSize()) {
            maxTiles = nb.getInteractingTiles().getSize();
-            computeBornSumKernel.setArg<cl::Buffer>(5, nb.getInteractingTiles().getDeviceBuffer());
-            computeBornSumKernel.setArg<cl_uint>(8, maxTiles);
-            force1Kernel.setArg<cl::Buffer>(6, nb.getInteractingTiles().getDeviceBuffer());
-            force1Kernel.setArg<cl_uint>(10, maxTiles);
+            computeBornSumKernel.setArg<cl::Buffer>(4, nb.getInteractingTiles().getDeviceBuffer());
+            computeBornSumKernel.setArg<cl_uint>(7, maxTiles);
+            force1Kernel.setArg<cl::Buffer>(5, nb.getInteractingTiles().getDeviceBuffer());
+            force1Kernel.setArg<cl_uint>(9, maxTiles);
            if (cl.getSIMDWidth() == 32 || deviceIsCpu) {
-                computeBornSumKernel.setArg<cl::Buffer>(9, nb.getInteractionFlags().getDeviceBuffer());
-                force1Kernel.setArg<cl::Buffer>(11, nb.getInteractionFlags().getDeviceBuffer());
+                computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractionFlags().getDeviceBuffer());
+                force1Kernel.setArg<cl::Buffer>(10, nb.getInteractionFlags().getDeviceBuffer());
            }
        }
    }

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -326,8 +326,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
 void OpenCLNonbondedUtilities::computeInteractions() {
    if (cutoff != -1.0) {
        if (useCutoff) {
-            forceKernel.setArg<mm_float4>(11, context.getPeriodicBoxSize());
-            forceKernel.setArg<mm_float4>(12, context.getInvPeriodicBoxSize());
+            forceKernel.setArg<mm_float4>(10, context.getPeriodicBoxSize());
+            forceKernel.setArg<mm_float4>(11, context.getInvPeriodicBoxSize());
        }
        context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
@@ -349,14 +349,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
        newSize = numTiles;
    delete interactingTiles;
    interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles");
-    forceKernel.setArg<cl::Buffer>(9, interactingTiles->getDeviceBuffer());
-    forceKernel.setArg<cl_uint>(13, newSize);
+    forceKernel.setArg<cl::Buffer>(8, interactingTiles->getDeviceBuffer());
+    forceKernel.setArg<cl_uint>(12, newSize);
    findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
    findInteractingBlocksKernel.setArg<cl_uint>(9, newSize);
    if (context.getSIMDWidth() == 32 || deviceIsCpu) {
        delete interactionFlags;
        interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags");
-        forceKernel.setArg<cl::Buffer>(14, interactionFlags->getDeviceBuffer());
+        forceKernel.setArg<cl::Buffer>(13, interactionFlags->getDeviceBuffer());
        findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
        findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
        findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
@@ -369,22 +369,22 @@ void OpenCLNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
    this->numTiles = numTiles;
    if (cutoff == -1.0)
        return; // There are no nonbonded interactions in the System.
-    forceKernel.setArg<cl_uint>(7, startTileIndex);
-    forceKernel.setArg<cl_uint>(8, startTileIndex+numTiles);
+    forceKernel.setArg<cl_uint>(6, startTileIndex);
+    forceKernel.setArg<cl_uint>(7, startTileIndex+numTiles);
    if (useCutoff) {
        findInteractingBlocksKernel.setArg<cl_uint>(10, startTileIndex);
        findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles);
    }
    else
-        forceKernel.setArg<cl_uint>(9, numTiles);
+        forceKernel.setArg<cl_uint>(8, numTiles);
 }

 cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const {
    map<string, string> replacements;
    replacements["COMPUTE_INTERACTION"] = source;
-    int localDataSize = 7*sizeof(cl_float);
    const string suffixes[] = {"x", "y", "z", "w"};
    stringstream localData;
+    int localDataSize = 0;
    for (int i = 0; i < (int) params.size(); i++) {
        if (params[i].getNumComponents() == 1)
            localData<<params[i].getType()<<" "<<params[i].getName()<<";\n";
@@ -394,10 +394,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
        }
        localDataSize += params[i].getSize();
    }
-    if ((localDataSize/4)%2 == 0) {
-        localData << "float padding;\n";
-        localDataSize += 4;
-    }
    replacements["ATOM_PARAMETER_DATA"] = localData.str();
    stringstream args;
    for (int i = 0; i < (int) params.size(); i++) {
@@ -487,6 +483,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());
    defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
+    if ((localDataSize/4)%2 == 0)
+        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
    string file;
    if (deviceIsCpu)
        file = OpenCLKernelSources::nonbonded_cpu;
@@ -509,7 +507,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    kernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer());
    kernel.setArg<cl::Buffer>(index++, exclusionIndices->getDeviceBuffer());
    kernel.setArg<cl::Buffer>(index++, exclusionRowIndices->getDeviceBuffer());
-    kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize*localDataSize : forceThreadBlockSize*localDataSize), NULL);
    kernel.setArg<cl_uint>(index++, startTileIndex);
    kernel.setArg<cl_uint>(index++, startTileIndex+numTiles);
    if (useCutoff) {

--- a/platforms/opencl/src/kernels/gbsaObc_cpu.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_cpu.cl
@@ -12,7 +12,6 @@ typedef struct {
 */

 __kernel void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
-        __local AtomData1* restrict localData,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
 #else
@@ -27,6 +26,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
 #endif
    unsigned int lasty = 0xFFFFFFFF;
+    __local AtomData1 localData[TILE_SIZE];

    while (pos < end) {
        // Extract the coordinates of this tile
@@ -196,7 +196,6 @@ typedef struct {

 __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
        __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
-        __local AtomData2* restrict localData,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
 #else
@@ -212,6 +211,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
 #endif
    float energy = 0.0f;
    unsigned int lasty = 0xFFFFFFFF;
+    __local AtomData2 localData[TILE_SIZE];

    while (pos < end) {
        // Extract the coordinates of this tile

--- a/platforms/opencl/src/kernels/gbsaObc_default.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_default.cl
@@ -2,9 +2,7 @@

 typedef struct {
    float x, y, z;
-    float q;
    float radius, scaledRadius;
-    float bornSum;
 } AtomData1;

 /**
@@ -13,7 +11,6 @@ typedef struct {

 __kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
 void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
-        __local AtomData1* restrict localData,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
 #else
@@ -28,7 +25,9 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
 #endif
    unsigned int lasty = 0xFFFFFFFF;
-    __local float tempBuffer[FORCE_WORK_GROUP_SIZE/2];
+    __local AtomData1 localData[TILE_SIZE];
+    __local float localBornSum[FORCE_WORK_GROUP_SIZE];
+    __local float localTemp[TILE_SIZE];

    while (pos < end) {
        // Extract the coordinates of this tile
@@ -51,7 +50,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
        }
        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
+        unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
        unsigned int atom1 = x*TILE_SIZE + tgx;
        float bornSum = 0.0f;
        float4 posq1 = posq[atom1];
@@ -59,12 +58,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
        if (x == y) {
            // This tile is on the diagonal.

-            localData[get_local_id(0)].x = posq1.x;
-            localData[get_local_id(0)].y = posq1.y;
-            localData[get_local_id(0)].z = posq1.z;
-            localData[get_local_id(0)].q = posq1.w;
-            localData[get_local_id(0)].radius = params1.x;
-            localData[get_local_id(0)].scaledRadius = params1.y;
+            if (get_local_id(0) < TILE_SIZE) {
+                localData[get_local_id(0)].x = posq1.x;
+                localData[get_local_id(0)].y = posq1.y;
+                localData[get_local_id(0)].z = posq1.z;
+                localData[get_local_id(0)].radius = params1.x;
+                localData[get_local_id(0)].scaledRadius = params1.y;
+            }
            barrier(CLK_LOCAL_MEM_FENCE);
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
                float4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0.0f);
@@ -96,7 +96,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
            // Sum the forces and write results.

            if (get_local_id(0) >= TILE_SIZE)
-                tempBuffer[tgx] = bornSum;
+                localTemp[tgx] = bornSum;
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
@@ -104,8 +104,9 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
 #else
                unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
 #endif
-                global_bornSum[offset] += bornSum+tempBuffer[tgx];
+                global_bornSum[offset] += bornSum+localTemp[tgx];
            }
+            // barrier not required here as localTemp is not accessed before encountering another barrier.
        }
        else {
            // This is an off-diagonal tile.
@@ -116,19 +117,18 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
                localData[get_local_id(0)].x = tempPosq.x;
                localData[get_local_id(0)].y = tempPosq.y;
                localData[get_local_id(0)].z = tempPosq.z;
-                localData[get_local_id(0)].q = tempPosq.w;
                float2 tempParams = global_params[j];
                localData[get_local_id(0)].radius = tempParams.x;
                localData[get_local_id(0)].scaledRadius = tempParams.y;
            }
-            localData[get_local_id(0)].bornSum = 0.0f;
+            localBornSum[get_local_id(0)] = 0.0f;
            barrier(CLK_LOCAL_MEM_FENCE);

            // Compute the full set of interactions in this tile.

-            unsigned int tj = tgx%(TILE_SIZE/2);
+            unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
-                float4 delta = (float4) (localData[baseLocalAtom+tj].x-posq1.x, localData[baseLocalAtom+tj].y-posq1.y, localData[baseLocalAtom+tj].z-posq1.z, 0.0f);
+                float4 delta = (float4) (localData[tj].x-posq1.x, localData[tj].y-posq1.y, localData[tj].z-posq1.z, 0.0f);
 #ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
@@ -136,13 +136,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
 #endif
                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef USE_CUTOFF
-                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
+                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
 #else
-                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS);
+                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
 #endif
                float invR = RSQRT(r2);
                float r = RECIP(invR);
-                float2 params2 = (float2) (localData[baseLocalAtom+tj].radius, localData[baseLocalAtom+tj].scaledRadius);
+                float2 params2 = (float2) (localData[tj].radius, localData[tj].scaledRadius);
                float rScaledRadiusJ = r+params2.y;
                {
                    float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
@@ -165,16 +165,16 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
                    float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
                                     (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
                    term += select(0.0f, 2.0f*(RECIP(params2.x)-l_ij), params2.x < params1.y-r);
-                    localData[baseLocalAtom+tj+forceBufferOffset].bornSum += select(0.0f, term, includeInteraction && params2.x < rScaledRadiusI);
+                    localBornSum[tj+localForceOffset] += select(0.0f, term, includeInteraction && params2.x < rScaledRadiusI);
                }
                barrier(CLK_LOCAL_MEM_FENCE);
-                tj = (tj+1)%(TILE_SIZE/2);
+                tj = (tj+1) & (TILE_SIZE-1);
            }

            // Sum the forces and write results.

            if (get_local_id(0) >= TILE_SIZE)
-                tempBuffer[tgx] = bornSum;
+                localTemp[tgx] = bornSum;
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
@@ -187,22 +187,28 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
                // Do both loads before both stores to minimize store-load waits.
                float sum1 = global_bornSum[offset1];
                float sum2 = global_bornSum[offset2];
-                sum1 += bornSum + tempBuffer[tgx];
-                sum2 += localData[get_local_id(0)].bornSum + localData[get_local_id(0)+TILE_SIZE].bornSum;
+                sum1 += bornSum + localTemp[tgx];
+                sum2 += localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE];
                global_bornSum[offset1] = sum1;
                global_bornSum[offset2] = sum2;
            }
+            barrier(CLK_LOCAL_MEM_FENCE);
        }
        lasty = y;
        pos++;
    }
 }

+typedef struct {
+    float x, y, z, w;
+    float padding;
+} PaddedUnalignedFloat4;
+
 typedef struct {
    float x, y, z;
    float q;
-    float fx, fy, fz, fw;
    float bornRadius;
+    float temp_x, temp_y, temp_z, temp_w;
 } AtomData2;

 /**
@@ -212,7 +218,6 @@ typedef struct {
 __kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
 void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
        __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
-        __local AtomData2* restrict localData,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
 #else
@@ -228,7 +233,8 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
 #endif
    float energy = 0.0f;
    unsigned int lasty = 0xFFFFFFFF;
-    __local float4 tempBuffer[FORCE_WORK_GROUP_SIZE/2];
+    __local AtomData2 localData[TILE_SIZE];
+    __local PaddedUnalignedFloat4 localForce[FORCE_WORK_GROUP_SIZE];

    while (pos < end) {
        // Extract the coordinates of this tile
@@ -251,7 +257,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
        }
        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
+        unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
        unsigned int atom1 = x*TILE_SIZE + tgx;
        float4 force = 0.0f;
        float4 posq1 = posq[atom1];
@@ -259,11 +265,13 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
        if (x == y) {
            // This tile is on the diagonal.

-            localData[get_local_id(0)].x = posq1.x;
-            localData[get_local_id(0)].y = posq1.y;
-            localData[get_local_id(0)].z = posq1.z;
-            localData[get_local_id(0)].q = posq1.w;
-            localData[get_local_id(0)].bornRadius = bornRadius1;
+            if (get_local_id(0) < TILE_SIZE) {
+                localData[get_local_id(0)].x = posq1.x;
+                localData[get_local_id(0)].y = posq1.y;
+                localData[get_local_id(0)].z = posq1.z;
+                localData[get_local_id(0)].q = posq1.w;
+                localData[get_local_id(0)].bornRadius = bornRadius1;
+            }
            barrier(CLK_LOCAL_MEM_FENCE);
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS);
@@ -300,8 +308,12 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r

            // Sum the forces and write results.

-            if (get_local_id(0) >= TILE_SIZE)
-                tempBuffer[tgx] = force;
+            if (get_local_id(0) >= TILE_SIZE) {
+                localData[tgx].temp_x = force.x;
+                localData[tgx].temp_y = force.y;
+                localData[tgx].temp_z = force.z;
+                localData[tgx].temp_w = force.w;
+            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
@@ -312,11 +324,14 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
                // Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
                float4 sum = forceBuffers[offset];
                float global_sum = global_bornForce[offset];
-                sum.xyz += force.xyz + tempBuffer[tgx].xyz;
-                global_sum += force.w + tempBuffer[tgx].w;
+                sum.x += force.x + localData[tgx].temp_x;
+                sum.y += force.y + localData[tgx].temp_y;
+                sum.z += force.z + localData[tgx].temp_z;
+                global_sum += force.w + localData[tgx].temp_w;
                forceBuffers[offset] = sum;
                global_bornForce[offset] = global_sum;
            }
+            // barrier not required here as localData[*]/temp_* is not accessed before encountering another barrier.
        }
        else {
            // This is an off-diagonal tile.
@@ -330,18 +345,18 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
                localData[get_local_id(0)].q = tempPosq.w;
                localData[get_local_id(0)].bornRadius = global_bornRadii[j];
            }
-            localData[get_local_id(0)].fx = 0.0f;
-            localData[get_local_id(0)].fy = 0.0f;
-            localData[get_local_id(0)].fz = 0.0f;
-            localData[get_local_id(0)].fw = 0.0f;
+            localForce[get_local_id(0)].x = 0.0f;
+            localForce[get_local_id(0)].y = 0.0f;
+            localForce[get_local_id(0)].z = 0.0f;
+            localForce[get_local_id(0)].w = 0.0f;
            barrier(CLK_LOCAL_MEM_FENCE);

            // Compute the full set of interactions in this tile.

-            unsigned int tj = tgx%(TILE_SIZE/2);
+            unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
-                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS);
-                float4 posq2 = (float4) (localData[baseLocalAtom+tj].x, localData[baseLocalAtom+tj].y, localData[baseLocalAtom+tj].z, localData[baseLocalAtom+tj].q);
+                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
+                float4 posq2 = (float4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
@@ -351,7 +366,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                float invR = RSQRT(r2);
                float r = RECIP(invR);
-                float bornRadius2 = localData[baseLocalAtom+tj].bornRadius;
+                float bornRadius2 = localData[tj].bornRadius;
                float alpha2_ij = bornRadius1*bornRadius2;
                float D_ij = r2*RECIP(4.0f*alpha2_ij);
                float expTerm = EXP(-D_ij);
@@ -370,18 +385,22 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
                energy += select(0.0f, tempEnergy, includeInteraction);
                delta.xyz *= select(0.0f, dEdR, includeInteraction);
                force.xyz -= delta.xyz;
-                localData[baseLocalAtom+tj+forceBufferOffset].fx += delta.x;
-                localData[baseLocalAtom+tj+forceBufferOffset].fy += delta.y;
-                localData[baseLocalAtom+tj+forceBufferOffset].fz += delta.z;
-                localData[baseLocalAtom+tj+forceBufferOffset].fw += select(0.0f, dGpol_dalpha2_ij*bornRadius1, includeInteraction);
+                localForce[tj+localForceOffset].x += delta.x;
+                localForce[tj+localForceOffset].y += delta.y;
+                localForce[tj+localForceOffset].z += delta.z;
+                localForce[tj+localForceOffset].w += select(0.0f, dGpol_dalpha2_ij*bornRadius1, includeInteraction);
                barrier(CLK_LOCAL_MEM_FENCE);
-                tj = (tj+1)%(TILE_SIZE/2);
+                tj = (tj+1) & (TILE_SIZE-1);
            }

            // Sum the forces and write results.

-            if (get_local_id(0) >= TILE_SIZE)
-                tempBuffer[tgx] = force;
+            if (get_local_id(0) >= TILE_SIZE) {
+                localData[tgx].temp_x = force.x;
+                localData[tgx].temp_y = force.y;
+                localData[tgx].temp_z = force.z;
+                localData[tgx].temp_w = force.w;
+            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
@@ -396,17 +415,20 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
                float4 sum2 = forceBuffers[offset2];
                float global_sum1 = global_bornForce[offset1];
                float global_sum2 = global_bornForce[offset2];
-                sum1.xyz += force.xyz + tempBuffer[tgx].xyz;
-                global_sum1 += force.w + tempBuffer[tgx].w;
-                sum2.x += localData[get_local_id(0)].fx + localData[get_local_id(0)+TILE_SIZE].fx;
-                sum2.y += localData[get_local_id(0)].fy + localData[get_local_id(0)+TILE_SIZE].fy;
-                sum2.z += localData[get_local_id(0)].fz + localData[get_local_id(0)+TILE_SIZE].fz;
-                global_sum2 += localData[get_local_id(0)].fw + localData[get_local_id(0)+TILE_SIZE].fw;
+                sum1.x += force.x + localData[tgx].temp_x;
+                sum1.y += force.y + localData[tgx].temp_y;
+                sum1.z += force.z + localData[tgx].temp_z;
+                global_sum1 += force.w + localData[tgx].temp_w;
+                sum2.x += localForce[get_local_id(0)].x + localForce[get_local_id(0)+TILE_SIZE].x;
+                sum2.y += localForce[get_local_id(0)].y + localForce[get_local_id(0)+TILE_SIZE].y;
+                sum2.z += localForce[get_local_id(0)].z + localForce[get_local_id(0)+TILE_SIZE].z;
+                global_sum2 += localForce[get_local_id(0)].w + localForce[get_local_id(0)+TILE_SIZE].w;
                forceBuffers[offset1] = sum1;
                forceBuffers[offset2] = sum2;
                global_bornForce[offset1] = global_sum1;
                global_bornForce[offset2] = global_sum2;
            }
+            barrier(CLK_LOCAL_MEM_FENCE);
        }
        lasty = y;
        pos++;

--- a/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
@@ -22,7 +22,6 @@ __kernel void computeBornSum(
        __global float* restrict global_bornSum,
 #endif
        __global const float4* restrict posq, __global const float2* restrict global_params,
-        __local AtomData1* restrict localData,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
 #else
@@ -40,6 +39,7 @@ __kernel void computeBornSum(
    unsigned int end = (warp+1)*numTiles/totalWarps;
 #endif
    unsigned int lasty = 0xFFFFFFFF;
+    __local AtomData1 localData[FORCE_WORK_GROUP_SIZE];
    __local float tempBuffer[FORCE_WORK_GROUP_SIZE];
    __local int2 reservedBlocks[WARPS_PER_GROUP];
    __local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
@@ -344,7 +344,6 @@ __kernel void computeGBSAForce1(
        __global float4* restrict forceBuffers, __global float* restrict global_bornForce,
 #endif
        __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
-        __local AtomData2* restrict localData,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
 #else
@@ -363,6 +362,7 @@ __kernel void computeGBSAForce1(
 #endif
    float energy = 0.0f;
    unsigned int lasty = 0xFFFFFFFF;
+    __local AtomData2 localData[FORCE_WORK_GROUP_SIZE];
    __local float4 tempBuffer[FORCE_WORK_GROUP_SIZE];
    __local int2 reservedBlocks[WARPS_PER_GROUP];
    __local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;

--- a/platforms/opencl/src/kernels/nonbonded_cpu.cl
+++ b/platforms/opencl/src/kernels/nonbonded_cpu.cl
@@ -12,7 +12,7 @@ typedef struct {
 */

 __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
-        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData,
+        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
@@ -30,6 +30,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
 #endif
    float energy = 0.0f;
    unsigned int lasty = 0xFFFFFFFF;
+    __local AtomData localData[TILE_SIZE];

    while (pos < end) {
        // Extract the coordinates of this tile

--- a/platforms/opencl/src/kernels/nonbonded_default.cl
+++ b/platforms/opencl/src/kernels/nonbonded_default.cl
 #define TILE_SIZE 32

+// Cannot use float3 as OpenCL defines it to be 4 DWORD aligned. This would
+// cause every element of array to have DWORD of padding to make it 4 DWORD
+// aligned which wastes space and causes LDS bank conflicts as stride is no
+// longer odd DWORDS.
+typedef struct {
+    float x, y, z;
+} UnalignedFloat3;
+
 typedef struct {
    float x, y, z;
    float q;
    float fx, fy, fz;
    ATOM_PARAMETER_DATA
+#ifndef PARAMETER_SIZE_IS_EVEN
+    float padding;
+#endif
 } AtomData;

 /**
@@ -13,7 +24,7 @@ typedef struct {

 __kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
 void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
-        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData,
+        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
@@ -31,9 +42,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
 #endif
    float energy = 0.0f;
    unsigned int lasty = 0xFFFFFFFF;
-    __local float tempBuffer[3*(FORCE_WORK_GROUP_SIZE/2)];
+    __local AtomData localData[TILE_SIZE];
+    __local UnalignedFloat3 localForce[FORCE_WORK_GROUP_SIZE];
+#ifdef USE_EXCLUSIONS
    __local unsigned int exclusionRange[2];
    __local int exclusionIndex[1];
+#endif

    while (pos < end) {
        // Extract the coordinates of this tile
@@ -56,7 +70,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
        }
        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-        unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
+        unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
        unsigned int atom1 = x*TILE_SIZE + tgx;
        float4 force = 0.0f;
        float4 posq1 = posq[atom1];
@@ -79,12 +93,14 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
        if (x == y) {
            // This tile is on the diagonal.

-            const unsigned int localAtomIndex = get_local_id(0);
-            localData[localAtomIndex].x = posq1.x;
-            localData[localAtomIndex].y = posq1.y;
-            localData[localAtomIndex].z = posq1.z;
-            localData[localAtomIndex].q = posq1.w;
-            LOAD_LOCAL_PARAMETERS_FROM_1
+            if (get_local_id(0) < TILE_SIZE) {
+                const unsigned int localAtomIndex = tgx;
+                localData[localAtomIndex].x = posq1.x;
+                localData[localAtomIndex].y = posq1.y;
+                localData[localAtomIndex].z = posq1.z;
+                localData[localAtomIndex].q = posq1.w;
+                LOAD_LOCAL_PARAMETERS_FROM_1
+            }
            barrier(CLK_LOCAL_MEM_FENCE);
 #ifdef USE_EXCLUSIONS
            unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
@@ -93,7 +109,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
 #ifdef USE_EXCLUSIONS
                bool isExcluded = !(excl & 0x1);
 #endif
-                int atom2 = baseLocalAtom+j;
+                unsigned int atom2 = baseLocalAtom+j;
                float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
@@ -125,14 +141,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re

            // Sum the forces and write results.

-            int bufferIndex = 3*tgx;
            if (get_local_id(0) >= TILE_SIZE) {
-                tempBuffer[bufferIndex] = force.x;
-                tempBuffer[bufferIndex+1] = force.y;
-                tempBuffer[bufferIndex+2] = force.z;
+                localData[tgx].fx = force.x;
+                localData[tgx].fy = force.y;
+                localData[tgx].fz = force.z;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
+                force.x += localData[tgx].fx;
+                force.y += localData[tgx].fy;
+                force.z += localData[tgx].fz;
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
                unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
 #else
@@ -140,15 +158,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
 #endif
                // Cheaper to load/store float4 than float3.
                float4 sum = forceBuffers[offset];
-                sum += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);
+                sum.xyz += force.xyz;
                forceBuffers[offset] = sum;
            }
+            // barrier not required here as localData[*].temp is not accessed before encountering another barrier.
        }
        else {
            // This is an off-diagonal tile.

-            const unsigned int localAtomIndex = get_local_id(0);
-            if (lasty != y && localAtomIndex < TILE_SIZE) {
+            if (lasty != y && get_local_id(0) < TILE_SIZE) {
+                const unsigned int localAtomIndex = tgx;
                unsigned int j = y*TILE_SIZE + tgx;
                float4 tempPosq = posq[j];
                localData[localAtomIndex].x = tempPosq.x;
@@ -157,26 +176,23 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
                localData[localAtomIndex].q = tempPosq.w;
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            }
-            localData[localAtomIndex].fx = 0.0f;
-            localData[localAtomIndex].fy = 0.0f;
-            localData[localAtomIndex].fz = 0.0f;
+            localForce[get_local_id(0)].x = 0.0f;
+            localForce[get_local_id(0)].y = 0.0f;
+            localForce[get_local_id(0)].z = 0.0f;
            barrier(CLK_LOCAL_MEM_FENCE);

            // Compute the full set of interactions in this tile.

+            unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
 #ifdef USE_EXCLUSIONS
            unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
-            excl = (excl >> baseLocalAtom) & 0xFFFF;
-            excl += excl << 16;
-            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+            excl = (excl >> tj) | (excl << (TILE_SIZE - tj));
 #endif
-            unsigned int tj = tgx%(TILE_SIZE/2);
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
 #ifdef USE_EXCLUSIONS
                bool isExcluded = !(excl & 0x1);
 #endif
-                int atom2 = baseLocalAtom+tj;
-                float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                float4 posq2 = (float4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
@@ -186,8 +202,9 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                float invR = RSQRT(r2);
                float r = RECIP(invR);
+                int atom2 = tj;
                LOAD_ATOM2_PARAMETERS
-                atom2 = y*TILE_SIZE+baseLocalAtom+tj;
+                atom2 = y*TILE_SIZE+tj;
 #ifdef USE_SYMMETRIC
                float dEdR = 0.0f;
 #else
@@ -200,29 +217,28 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
 #ifdef USE_SYMMETRIC
                delta.xyz *= dEdR;
                force.xyz -= delta.xyz;
-                localData[baseLocalAtom+tj+forceBufferOffset].fx += delta.x;
-                localData[baseLocalAtom+tj+forceBufferOffset].fy += delta.y;
-                localData[baseLocalAtom+tj+forceBufferOffset].fz += delta.z;
+                localForce[tj+localForceOffset].x += delta.x;
+                localForce[tj+localForceOffset].y += delta.y;
+                localForce[tj+localForceOffset].z += delta.z;
 #else
                force.xyz -= dEdR1.xyz;
-                localData[baseLocalAtom+tj+forceBufferOffset].fx += dEdR2.x;
-                localData[baseLocalAtom+tj+forceBufferOffset].fy += dEdR2.y;
-                localData[baseLocalAtom+tj+forceBufferOffset].fz += dEdR2.z;
+                localForce[tj+localForceOffset].x += dEdR2.x;
+                localForce[tj+localForceOffset].y += dEdR2.y;
+                localForce[tj+localForceOffset].z += dEdR2.z;
 #endif
                barrier(CLK_LOCAL_MEM_FENCE);
 #ifdef USE_EXCLUSIONS
                excl >>= 1;
 #endif
-                tj = (tj+1)%(TILE_SIZE/2);
+                tj = (tj+1) & (TILE_SIZE-1);
            }

            // Sum the forces and write results.

-            int bufferIndex = 3*tgx;
            if (get_local_id(0) >= TILE_SIZE) {
-                tempBuffer[bufferIndex] = force.x;
-                tempBuffer[bufferIndex+1] = force.y;
-                tempBuffer[bufferIndex+2] = force.z;
+                localData[tgx].fx = force.x;
+                localData[tgx].fy = force.y;
+                localData[tgx].fz = force.z;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
@@ -236,11 +252,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
                // Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
                float4 sum1 = forceBuffers[offset1];
                float4 sum2 = forceBuffers[offset2];
-                sum1 += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);
-                sum2 += (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx, localData[get_local_id(0)].fy+localData[get_local_id(0)+TILE_SIZE].fy, localData[get_local_id(0)].fz+localData[get_local_id(0)+TILE_SIZE].fz, 0.0f);
+                sum1.x += localData[tgx].fx + force.x;
+                sum1.y += localData[tgx].fy + force.y;
+                sum1.z += localData[tgx].fz + force.z;
+                sum2.x += localForce[tgx].x + localForce[tgx+TILE_SIZE].x;
+                sum2.y += localForce[tgx].y + localForce[tgx+TILE_SIZE].y;
+                sum2.z += localForce[tgx].z + localForce[tgx+TILE_SIZE].z;
                forceBuffers[offset1] = sum1;
                forceBuffers[offset2] = sum2;
            }
+            barrier(CLK_LOCAL_MEM_FENCE);
        }
        lasty = y;
        pos++;

--- a/platforms/opencl/src/kernels/nonbonded_nvidia.cl
+++ b/platforms/opencl/src/kernels/nonbonded_nvidia.cl
@@ -10,6 +10,9 @@ typedef struct {
    float q;
    float fx, fy, fz;
    ATOM_PARAMETER_DATA
+#ifndef PARAMETER_SIZE_IS_EVEN
+    float padding;
+#endif
 } AtomData;

 /**
@@ -22,7 +25,7 @@ __kernel void computeNonbonded(
        __global float4* restrict forceBuffers,
 #endif
        __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
-        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData,
+        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
@@ -41,6 +44,7 @@ __kernel void computeNonbonded(
    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
 #endif
    float energy = 0.0f;
+    __local AtomData localData[FORCE_WORK_GROUP_SIZE];
    __local float tempBuffer[3*FORCE_WORK_GROUP_SIZE];
    __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __local int exclusionIndex[WARPS_PER_GROUP];