Commit ce74c6ae authored by Peter Eastman's avatar Peter Eastman
Browse files

More changes from Tony to reduce local memory use

parent 3f64d970
...@@ -1517,8 +1517,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1517,8 +1517,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines["NUM_ATOMS"] = intToString(cl.getNumAtoms()); defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks()); defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
if (cl.getSIMDWidth() == 32) defines["FORCE_WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(nb.getForceThreadBlockSize());
defines["WARPS_PER_GROUP"] = OpenCLExpressionUtilities::intToString(cl.getNonbondedUtilities().getForceThreadBlockSize()/OpenCLContext::TileSize);
string file; string file;
if (deviceIsCpu) if (deviceIsCpu)
file = OpenCLKernelSources::gbsaObc_cpu; file = OpenCLKernelSources::gbsaObc_cpu;
...@@ -1534,7 +1533,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1534,7 +1533,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer());
computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*7*sizeof(cl_float), NULL); computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*7*sizeof(cl_float), NULL);
computeBornSumKernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...@@ -1557,7 +1555,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1557,7 +1555,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, bornRadii->getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, bornRadii->getDeviceBuffer());
force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*9*sizeof(cl_float), NULL); force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*9*sizeof(cl_float), NULL);
force1Kernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(mm_float4), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...@@ -1596,19 +1593,19 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1596,19 +1593,19 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel.setArg<cl::Buffer>(index++, obcChain->getDeviceBuffer()); reduceBornForceKernel.setArg<cl::Buffer>(index++, obcChain->getDeviceBuffer());
} }
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<mm_float4>(7, cl.getPeriodicBoxSize()); computeBornSumKernel.setArg<mm_float4>(6, cl.getPeriodicBoxSize());
computeBornSumKernel.setArg<mm_float4>(8, cl.getInvPeriodicBoxSize()); computeBornSumKernel.setArg<mm_float4>(7, cl.getInvPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(9, cl.getPeriodicBoxSize()); force1Kernel.setArg<mm_float4>(8, cl.getPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(10, cl.getInvPeriodicBoxSize()); force1Kernel.setArg<mm_float4>(9, cl.getInvPeriodicBoxSize());
if (maxTiles < nb.getInteractingTiles().getSize()) { if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize(); maxTiles = nb.getInteractingTiles().getSize();
computeBornSumKernel.setArg<cl::Buffer>(5, nb.getInteractingTiles().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(5, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(9, maxTiles); computeBornSumKernel.setArg<cl_uint>(8, maxTiles);
force1Kernel.setArg<cl::Buffer>(7, nb.getInteractingTiles().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(6, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(11, maxTiles); force1Kernel.setArg<cl_uint>(10, maxTiles);
if (cl.getSIMDWidth() == 32 || deviceIsCpu) { if (cl.getSIMDWidth() == 32 || deviceIsCpu) {
computeBornSumKernel.setArg<cl::Buffer>(10, nb.getInteractionFlags().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(9, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(12, nb.getInteractionFlags().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(11, nb.getInteractionFlags().getDeviceBuffer());
} }
} }
} }
......
...@@ -480,7 +480,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -480,7 +480,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["USE_EXCLUSIONS"] = "1"; defines["USE_EXCLUSIONS"] = "1";
if (isSymmetric) if (isSymmetric)
defines["USE_SYMMETRIC"] = "1"; defines["USE_SYMMETRIC"] = "1";
defines["NONBONDED_WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(forceThreadBlockSize); defines["FORCE_WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(forceThreadBlockSize);
defines["CUTOFF_SQUARED"] = OpenCLExpressionUtilities::doubleToString(cutoff*cutoff); defines["CUTOFF_SQUARED"] = OpenCLExpressionUtilities::doubleToString(cutoff*cutoff);
defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms()); defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());
......
...@@ -15,7 +15,7 @@ typedef struct { ...@@ -15,7 +15,7 @@ typedef struct {
*/ */
__kernel void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params, __kernel void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData* restrict localData, __local float* restrict tempBuffer, __local AtomData* restrict localData,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else #else
...@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c ...@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
__kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict energyBuffer,
__global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict global_bornForce, __global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict global_bornForce,
__local AtomData* restrict localData, __local float4* restrict tempBuffer, __local AtomData* restrict localData,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else #else
......
...@@ -11,9 +11,9 @@ typedef struct { ...@@ -11,9 +11,9 @@ typedef struct {
* Compute the Born sum. * Compute the Born sum.
*/ */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params, void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData1* restrict localData, __local float* restrict tempBuffer, __local AtomData1* restrict localData,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else #else
...@@ -28,6 +28,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -28,6 +28,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif #endif
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local float tempBuffer[FORCE_WORK_GROUP_SIZE/2];
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
...@@ -94,9 +95,8 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -94,9 +95,8 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
// Sum the forces and write results. // Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE) if (get_local_id(0) >= TILE_SIZE)
tempBuffer[get_local_id(0)] = bornSum; tempBuffer[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...@@ -104,7 +104,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -104,7 +104,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
#else #else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
global_bornSum[offset] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE]; global_bornSum[offset] += bornSum+tempBuffer[tgx];
} }
} }
else { else {
...@@ -174,7 +174,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -174,7 +174,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
// Sum the forces and write results. // Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE) if (get_local_id(0) >= TILE_SIZE)
tempBuffer[get_local_id(0)] = bornSum; tempBuffer[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...@@ -184,8 +184,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -184,8 +184,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
global_bornSum[offset1] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE]; // Do both loads before both stores to minimize store-load waits.
global_bornSum[offset2] += localData[get_local_id(0)].bornSum+localData[get_local_id(0)+TILE_SIZE].bornSum; float sum1 = global_bornSum[offset1];
float sum2 = global_bornSum[offset2];
sum1 += bornSum + tempBuffer[tgx];
sum2 += localData[get_local_id(0)].bornSum + localData[get_local_id(0)+TILE_SIZE].bornSum;
global_bornSum[offset1] = sum1;
global_bornSum[offset2] = sum2;
} }
} }
lasty = y; lasty = y;
...@@ -204,10 +209,10 @@ typedef struct { ...@@ -204,10 +209,10 @@ typedef struct {
* First part of computing the GBSA interaction. * First part of computing the GBSA interaction.
*/ */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce, void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData, __local float4* restrict tempBuffer, __local AtomData2* restrict localData,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else #else
...@@ -223,6 +228,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -223,6 +228,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
#endif #endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE/2];
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
...@@ -295,7 +301,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -295,7 +301,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
// Sum the forces and write results. // Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE) if (get_local_id(0) >= TILE_SIZE)
tempBuffer[get_local_id(0)] = force; tempBuffer[tgx] = force;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...@@ -303,8 +309,13 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -303,8 +309,13 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
#else #else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz; // Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
global_bornForce[offset] += force.w+tempBuffer[get_local_id(0)+TILE_SIZE].w; float4 sum = forceBuffers[offset];
float global_sum = global_bornForce[offset];
sum.xyz += force.xyz + tempBuffer[tgx].xyz;
global_sum += force.w + tempBuffer[tgx].w;
forceBuffers[offset] = sum;
global_bornForce[offset] = global_sum;
} }
} }
else { else {
...@@ -370,7 +381,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -370,7 +381,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
// Sum the forces and write results. // Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE) if (get_local_id(0) >= TILE_SIZE)
tempBuffer[get_local_id(0)] = force; tempBuffer[tgx] = force;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...@@ -380,14 +391,21 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -380,14 +391,21 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz; // Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum = (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx, float4 sum1 = forceBuffers[offset1];
localData[get_local_id(0)].fy+localData[get_local_id(0)+TILE_SIZE].fy, float4 sum2 = forceBuffers[offset2];
localData[get_local_id(0)].fz+localData[get_local_id(0)+TILE_SIZE].fz, float global_sum1 = global_bornForce[offset2];
localData[get_local_id(0)].fw+localData[get_local_id(0)+TILE_SIZE].fw); float global_sum2 = global_bornForce[offset2];
forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+sum.xyz; sum1.xyz += force.xyz + tempBuffer[tgx].xyz;
global_bornForce[offset1] += force.w+tempBuffer[get_local_id(0)+TILE_SIZE].w; global_sum1 += force.w + tempBuffer[tgx].w;
global_bornForce[offset2] += sum.w; sum2.x += localData[get_local_id(0)].fx + localData[get_local_id(0)+TILE_SIZE].fx;
sum2.y += localData[get_local_id(0)].fy + localData[get_local_id(0)+TILE_SIZE].fy;
sum2.z += localData[get_local_id(0)].fz + localData[get_local_id(0)+TILE_SIZE].fz;
global_sum2 += localData[get_local_id(0)].fw + localData[get_local_id(0)+TILE_SIZE].fw;
forceBuffers[offset1] = sum1;
forceBuffers[offset2] = sum2;
global_bornForce[offset1] = global_sum1;
global_bornForce[offset2] = global_sum2;
} }
} }
lasty = y; lasty = y;
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif #endif
#define TILE_SIZE 32 #define TILE_SIZE 32
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef struct { typedef struct {
float x, y, z; float x, y, z;
...@@ -21,7 +22,7 @@ __kernel void computeBornSum( ...@@ -21,7 +22,7 @@ __kernel void computeBornSum(
__global float* restrict global_bornSum, __global float* restrict global_bornSum,
#endif #endif
__global const float4* restrict posq, __global const float2* restrict global_params, __global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData1* restrict localData, __local float* restrict tempBuffer, __local AtomData1* restrict localData,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags, __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else #else
...@@ -39,6 +40,7 @@ __kernel void computeBornSum( ...@@ -39,6 +40,7 @@ __kernel void computeBornSum(
unsigned int end = (warp+1)*numTiles/totalWarps; unsigned int end = (warp+1)*numTiles/totalWarps;
#endif #endif
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local float tempBuffer[FORCE_WORK_GROUP_SIZE];
__local int2 reservedBlocks[WARPS_PER_GROUP]; __local int2 reservedBlocks[WARPS_PER_GROUP];
__local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks; __local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
__local int exclusionIndex[WARPS_PER_GROUP]; __local int exclusionIndex[WARPS_PER_GROUP];
...@@ -342,7 +344,7 @@ __kernel void computeGBSAForce1( ...@@ -342,7 +344,7 @@ __kernel void computeGBSAForce1(
__global float4* restrict forceBuffers, __global float* restrict global_bornForce, __global float4* restrict forceBuffers, __global float* restrict global_bornForce,
#endif #endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData, __local float4* restrict tempBuffer, __local AtomData2* restrict localData,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags, __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else #else
...@@ -361,6 +363,7 @@ __kernel void computeGBSAForce1( ...@@ -361,6 +363,7 @@ __kernel void computeGBSAForce1(
#endif #endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE];
__local int2 reservedBlocks[WARPS_PER_GROUP]; __local int2 reservedBlocks[WARPS_PER_GROUP];
__local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks; __local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
__local int exclusionIndex[WARPS_PER_GROUP]; __local int exclusionIndex[WARPS_PER_GROUP];
......
...@@ -11,7 +11,7 @@ typedef struct { ...@@ -11,7 +11,7 @@ typedef struct {
* Compute nonbonded interactions. * Compute nonbonded interactions.
*/ */
__kernel __attribute__((reqd_work_group_size(NONBONDED_WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions, void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData,
unsigned int startTileIndex, unsigned int endTileIndex, unsigned int startTileIndex, unsigned int endTileIndex,
...@@ -31,7 +31,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -31,7 +31,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#endif #endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local float tempBuffer[3*(NONBONDED_WORK_GROUP_SIZE/2)]; __local float tempBuffer[3*(FORCE_WORK_GROUP_SIZE/2)];
__local unsigned int exclusionRange[2]; __local unsigned int exclusionRange[2];
__local int exclusionIndex[1]; __local int exclusionIndex[1];
...@@ -138,6 +138,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -138,6 +138,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#else #else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
// Cheaper to load/store float4 than float3.
float4 sum = forceBuffers[offset]; float4 sum = forceBuffers[offset];
sum += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f); sum += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);
forceBuffers[offset] = sum; forceBuffers[offset] = sum;
...@@ -232,7 +233,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -232,7 +233,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
// Cheaper to load/store float4 than float3. Do both loads before both stores to minimize store-load waits. // Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum1 = forceBuffers[offset1]; float4 sum1 = forceBuffers[offset1];
float4 sum2 = forceBuffers[offset2]; float4 sum2 = forceBuffers[offset2];
sum1 += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f); sum1 += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif #endif
#define TILE_SIZE 32 #define TILE_SIZE 32
#define WARPS_PER_GROUP (NONBONDED_WORK_GROUP_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef struct { typedef struct {
float x, y, z; float x, y, z;
...@@ -41,7 +41,7 @@ __kernel void computeNonbonded( ...@@ -41,7 +41,7 @@ __kernel void computeNonbonded(
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps; unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif #endif
float energy = 0.0f; float energy = 0.0f;
__local float tempBuffer[3*NONBONDED_WORK_GROUP_SIZE]; __local float tempBuffer[3*FORCE_WORK_GROUP_SIZE];
__local unsigned int exclusionRange[2*WARPS_PER_GROUP]; __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP]; __local int exclusionIndex[WARPS_PER_GROUP];
__local int2* reservedBlocks = (__local int2*) exclusionRange; __local int2* reservedBlocks = (__local int2*) exclusionRange;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment