More changes from Tony to reduce local memory use

ce74c6ae · Peter Eastman · 3f64d970 · ce74c6ae · ce74c6ae · ce74c6ae
Commit ce74c6ae authored Oct 25, 2011 by Peter Eastman
7 changed files
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1517,8 +1517,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
        defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
        defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
-        if (cl.getSIMDWidth() == 32)
-            defines["WARPS_PER_GROUP"] = OpenCLExpressionUtilities::intToString(cl.getNonbondedUtilities().getForceThreadBlockSize()/OpenCLContext::TileSize);
+        defines["FORCE_WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(nb.getForceThreadBlockSize());
        string file;
        if (deviceIsCpu)
            file = OpenCLKernelSources::gbsaObc_cpu;
@@ -1534,7 +1533,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        computeBornSumKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
        computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer());
        computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*7*sizeof(cl_float), NULL);
-        computeBornSumKernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
        if (nb.getUseCutoff()) {
            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
@@ -1557,7 +1555,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        force1Kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
        force1Kernel.setArg<cl::Buffer>(index++, bornRadii->getDeviceBuffer());
        force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*9*sizeof(cl_float), NULL);
-        force1Kernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(mm_float4), NULL);
        if (nb.getUseCutoff()) {
            force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
            force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
@@ -1596,19 +1593,19 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        reduceBornForceKernel.setArg<cl::Buffer>(index++, obcChain->getDeviceBuffer());
    }
    if (nb.getUseCutoff()) {
-        computeBornSumKernel.setArg<mm_float4>(7, cl.getPeriodicBoxSize());
-        computeBornSumKernel.setArg<mm_float4>(8, cl.getInvPeriodicBoxSize());
-        force1Kernel.setArg<mm_float4>(9, cl.getPeriodicBoxSize());
-        force1Kernel.setArg<mm_float4>(10, cl.getInvPeriodicBoxSize());
+        computeBornSumKernel.setArg<mm_float4>(6, cl.getPeriodicBoxSize());
+        computeBornSumKernel.setArg<mm_float4>(7, cl.getInvPeriodicBoxSize());
+        force1Kernel.setArg<mm_float4>(8, cl.getPeriodicBoxSize());
+        force1Kernel.setArg<mm_float4>(9, cl.getInvPeriodicBoxSize());
        if (maxTiles < nb.getInteractingTiles().getSize()) {
            maxTiles = nb.getInteractingTiles().getSize();
            computeBornSumKernel.setArg<cl::Buffer>(5, nb.getInteractingTiles().getDeviceBuffer());
-            computeBornSumKernel.setArg<cl_uint>(9, maxTiles);
-            force1Kernel.setArg<cl::Buffer>(7, nb.getInteractingTiles().getDeviceBuffer());
-            force1Kernel.setArg<cl_uint>(11, maxTiles);
+            computeBornSumKernel.setArg<cl_uint>(8, maxTiles);
+            force1Kernel.setArg<cl::Buffer>(6, nb.getInteractingTiles().getDeviceBuffer());
+            force1Kernel.setArg<cl_uint>(10, maxTiles);
            if (cl.getSIMDWidth() == 32 || deviceIsCpu) {
-                computeBornSumKernel.setArg<cl::Buffer>(10, nb.getInteractionFlags().getDeviceBuffer());
-                force1Kernel.setArg<cl::Buffer>(12, nb.getInteractionFlags().getDeviceBuffer());
+                computeBornSumKernel.setArg<cl::Buffer>(9, nb.getInteractionFlags().getDeviceBuffer());
+                force1Kernel.setArg<cl::Buffer>(11, nb.getInteractionFlags().getDeviceBuffer());
            }
        }
    }

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -480,7 +480,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
        defines["USE_EXCLUSIONS"] = "1";
    if (isSymmetric)
        defines["USE_SYMMETRIC"] = "1";
-    defines["NONBONDED_WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(forceThreadBlockSize);
+    defines["FORCE_WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(forceThreadBlockSize);
    defines["CUTOFF_SQUARED"] = OpenCLExpressionUtilities::doubleToString(cutoff*cutoff);
    defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());

--- a/platforms/opencl/src/kernels/gbsaObc_cpu.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_cpu.cl
@@ -15,7 +15,7 @@ typedef struct {
 */

 __kernel void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
-        __local AtomData* restrict localData, __local float* restrict tempBuffer,
+        __local AtomData* restrict localData,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
 #else
@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c

 __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict energyBuffer,
        __global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict global_bornForce,
-        __local AtomData* restrict localData, __local float4* restrict tempBuffer,
+        __local AtomData* restrict localData,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
 #else

--- a/platforms/opencl/src/kernels/gbsaObc_default.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_default.cl
@@ -11,9 +11,9 @@ typedef struct {
 * Compute the Born sum.
 */

-__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
+__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
 void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
-        __local AtomData1* restrict localData, __local float* restrict tempBuffer,
+        __local AtomData1* restrict localData,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
 #else
@@ -28,6 +28,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
 #endif
    unsigned int lasty = 0xFFFFFFFF;
+    __local float tempBuffer[FORCE_WORK_GROUP_SIZE/2];

    while (pos < end) {
        // Extract the coordinates of this tile
@@ -94,9 +95,8 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa

            // Sum the forces and write results.

-
            if (get_local_id(0) >= TILE_SIZE)
-                tempBuffer[get_local_id(0)] = bornSum;
+                tempBuffer[tgx] = bornSum;
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
@@ -104,7 +104,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
 #else
                unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
 #endif
-                global_bornSum[offset] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE];
+                global_bornSum[offset] += bornSum+tempBuffer[tgx];
            }
        }
        else {
@@ -174,7 +174,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
            // Sum the forces and write results.

            if (get_local_id(0) >= TILE_SIZE)
-                tempBuffer[get_local_id(0)] = bornSum;
+                tempBuffer[tgx] = bornSum;
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
@@ -184,8 +184,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
                unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
 #endif
-                global_bornSum[offset1] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE];
-                global_bornSum[offset2] += localData[get_local_id(0)].bornSum+localData[get_local_id(0)+TILE_SIZE].bornSum;
+                // Do both loads before both stores to minimize store-load waits.
+                float sum1 = global_bornSum[offset1];
+                float sum2 = global_bornSum[offset2];
+                sum1 += bornSum + tempBuffer[tgx];
+                sum2 += localData[get_local_id(0)].bornSum + localData[get_local_id(0)+TILE_SIZE].bornSum;
+                global_bornSum[offset1] = sum1;
+                global_bornSum[offset2] = sum2;
            }
        }
        lasty = y;
@@ -204,10 +209,10 @@ typedef struct {
 * First part of computing the GBSA interaction.
 */

-__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
+__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
 void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
        __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
-        __local AtomData2* restrict localData, __local float4* restrict tempBuffer,
+        __local AtomData2* restrict localData,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
 #else
@@ -223,6 +228,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
 #endif
    float energy = 0.0f;
    unsigned int lasty = 0xFFFFFFFF;
+    __local float4 tempBuffer[FORCE_WORK_GROUP_SIZE/2];

    while (pos < end) {
        // Extract the coordinates of this tile
@@ -295,7 +301,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
            // Sum the forces and write results.

            if (get_local_id(0) >= TILE_SIZE)
-                tempBuffer[get_local_id(0)] = force;
+                tempBuffer[tgx] = force;
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
@@ -303,8 +309,13 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
 #else
                unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
 #endif
-                forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
-                global_bornForce[offset] += force.w+tempBuffer[get_local_id(0)+TILE_SIZE].w;
+                // Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
+                float4 sum = forceBuffers[offset];
+                float global_sum = global_bornForce[offset];
+                sum.xyz += force.xyz + tempBuffer[tgx].xyz;
+                global_sum += force.w + tempBuffer[tgx].w;
+                forceBuffers[offset] = sum;
+                global_bornForce[offset] = global_sum;
            }
        }
        else {
@@ -370,7 +381,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
            // Sum the forces and write results.

            if (get_local_id(0) >= TILE_SIZE)
-                tempBuffer[get_local_id(0)] = force;
+                tempBuffer[tgx] = force;
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
 #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
@@ -380,14 +391,21 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
                unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
 #endif
-                forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
-                float4 sum = (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx,
-                                       localData[get_local_id(0)].fy+localData[get_local_id(0)+TILE_SIZE].fy,
-                                       localData[get_local_id(0)].fz+localData[get_local_id(0)+TILE_SIZE].fz,
-                                       localData[get_local_id(0)].fw+localData[get_local_id(0)+TILE_SIZE].fw);
-                forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+sum.xyz;
-                global_bornForce[offset1] += force.w+tempBuffer[get_local_id(0)+TILE_SIZE].w;
-                global_bornForce[offset2] += sum.w;
+                // Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
+                float4 sum1 = forceBuffers[offset1];
+                float4 sum2 = forceBuffers[offset2];
+                float global_sum1 = global_bornForce[offset2];
+                float global_sum2 = global_bornForce[offset2];
+                sum1.xyz += force.xyz + tempBuffer[tgx].xyz;
+                global_sum1 += force.w + tempBuffer[tgx].w;
+                sum2.x += localData[get_local_id(0)].fx + localData[get_local_id(0)+TILE_SIZE].fx;
+                sum2.y += localData[get_local_id(0)].fy + localData[get_local_id(0)+TILE_SIZE].fy;
+                sum2.z += localData[get_local_id(0)].fz + localData[get_local_id(0)+TILE_SIZE].fz;
+                global_sum2 += localData[get_local_id(0)].fw + localData[get_local_id(0)+TILE_SIZE].fw;
+                forceBuffers[offset1] = sum1;
+                forceBuffers[offset2] = sum2;
+                global_bornForce[offset1] = global_sum1;
+                global_bornForce[offset2] = global_sum2;
            }
        }
        lasty = y;

--- a/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
@@ -3,6 +3,7 @@
 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
 #endif
 #define TILE_SIZE 32
+#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)

 typedef struct {
    float x, y, z;
@@ -21,7 +22,7 @@ __kernel void computeBornSum(
        __global float* restrict global_bornSum,
 #endif
        __global const float4* restrict posq, __global const float2* restrict global_params,
-        __local AtomData1* restrict localData, __local float* restrict tempBuffer,
+        __local AtomData1* restrict localData,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
 #else
@@ -39,6 +40,7 @@ __kernel void computeBornSum(
    unsigned int end = (warp+1)*numTiles/totalWarps;
 #endif
    unsigned int lasty = 0xFFFFFFFF;
+    __local float tempBuffer[FORCE_WORK_GROUP_SIZE];
    __local int2 reservedBlocks[WARPS_PER_GROUP];
    __local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
    __local int exclusionIndex[WARPS_PER_GROUP];
@@ -342,7 +344,7 @@ __kernel void computeGBSAForce1(
        __global float4* restrict forceBuffers, __global float* restrict global_bornForce,
 #endif
        __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
-        __local AtomData2* restrict localData, __local float4* restrict tempBuffer,
+        __local AtomData2* restrict localData,
 #ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
 #else
@@ -361,6 +363,7 @@ __kernel void computeGBSAForce1(
 #endif
    float energy = 0.0f;
    unsigned int lasty = 0xFFFFFFFF;
+    __local float4 tempBuffer[FORCE_WORK_GROUP_SIZE];
    __local int2 reservedBlocks[WARPS_PER_GROUP];
    __local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
    __local int exclusionIndex[WARPS_PER_GROUP];

--- a/platforms/opencl/src/kernels/nonbonded_default.cl
+++ b/platforms/opencl/src/kernels/nonbonded_default.cl
@@ -11,7 +11,7 @@ typedef struct {
 * Compute nonbonded interactions.
 */

-__kernel __attribute__((reqd_work_group_size(NONBONDED_WORK_GROUP_SIZE, 1, 1)))
+__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
 void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData,
        unsigned int startTileIndex, unsigned int endTileIndex,
@@ -31,7 +31,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
 #endif
    float energy = 0.0f;
    unsigned int lasty = 0xFFFFFFFF;
-    __local float tempBuffer[3*(NONBONDED_WORK_GROUP_SIZE/2)];
+    __local float tempBuffer[3*(FORCE_WORK_GROUP_SIZE/2)];
    __local unsigned int exclusionRange[2];
    __local int exclusionIndex[1];

@@ -138,6 +138,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
 #else
                unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
 #endif
+                // Cheaper to load/store float4 than float3.
                float4 sum = forceBuffers[offset];
                sum += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);
                forceBuffers[offset] = sum;
@@ -232,7 +233,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
                unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
 #endif
-                // Cheaper to load/store float4 than float3. Do both loads before both stores to minimize store-load waits.
+                // Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
                float4 sum1 = forceBuffers[offset1];
                float4 sum2 = forceBuffers[offset2];
                sum1 += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);

--- a/platforms/opencl/src/kernels/nonbonded_nvidia.cl
+++ b/platforms/opencl/src/kernels/nonbonded_nvidia.cl
@@ -3,7 +3,7 @@
 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
 #endif
 #define TILE_SIZE 32
-#define WARPS_PER_GROUP (NONBONDED_WORK_GROUP_SIZE/TILE_SIZE)
+#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)

 typedef struct {
    float x, y, z;
@@ -41,7 +41,7 @@ __kernel void computeNonbonded(
    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
 #endif
    float energy = 0.0f;
-    __local float tempBuffer[3*NONBONDED_WORK_GROUP_SIZE];
+    __local float tempBuffer[3*FORCE_WORK_GROUP_SIZE];
    __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __local int exclusionIndex[WARPS_PER_GROUP];
    __local int2* reservedBlocks = (__local int2*) exclusionRange;