Commit fed50628 authored by Peter Eastman's avatar Peter Eastman
Browse files

Tony's optimizations to reduce local memory use

parent 2dd09317
......@@ -1563,7 +1563,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel.setArg<cl::Buffer>(index++, (useLong ? longBornSum->getDeviceBuffer() : bornSum->getDeviceBuffer()));
computeBornSumKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer());
computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*7*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
......@@ -1585,7 +1584,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, bornRadii->getDeviceBuffer());
force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*9*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) {
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
......@@ -1624,19 +1622,19 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel.setArg<cl::Buffer>(index++, obcChain->getDeviceBuffer());
}
if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<mm_float4>(6, cl.getPeriodicBoxSize());
computeBornSumKernel.setArg<mm_float4>(7, cl.getInvPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(8, cl.getPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(9, cl.getInvPeriodicBoxSize());
computeBornSumKernel.setArg<mm_float4>(5, cl.getPeriodicBoxSize());
computeBornSumKernel.setArg<mm_float4>(6, cl.getInvPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(7, cl.getPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(8, cl.getInvPeriodicBoxSize());
if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize();
computeBornSumKernel.setArg<cl::Buffer>(5, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(8, maxTiles);
force1Kernel.setArg<cl::Buffer>(6, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(10, maxTiles);
computeBornSumKernel.setArg<cl::Buffer>(4, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(7, maxTiles);
force1Kernel.setArg<cl::Buffer>(5, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(9, maxTiles);
if (cl.getSIMDWidth() == 32 || deviceIsCpu) {
computeBornSumKernel.setArg<cl::Buffer>(9, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(11, nb.getInteractionFlags().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(10, nb.getInteractionFlags().getDeviceBuffer());
}
}
}
......
......@@ -326,8 +326,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void OpenCLNonbondedUtilities::computeInteractions() {
if (cutoff != -1.0) {
if (useCutoff) {
forceKernel.setArg<mm_float4>(11, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(12, context.getInvPeriodicBoxSize());
forceKernel.setArg<mm_float4>(10, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(11, context.getInvPeriodicBoxSize());
}
context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
......@@ -349,14 +349,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
newSize = numTiles;
delete interactingTiles;
interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles");
forceKernel.setArg<cl::Buffer>(9, interactingTiles->getDeviceBuffer());
forceKernel.setArg<cl_uint>(13, newSize);
forceKernel.setArg<cl::Buffer>(8, interactingTiles->getDeviceBuffer());
forceKernel.setArg<cl_uint>(12, newSize);
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(9, newSize);
if (context.getSIMDWidth() == 32 || deviceIsCpu) {
delete interactionFlags;
interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags");
forceKernel.setArg<cl::Buffer>(14, interactionFlags->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(13, interactionFlags->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
......@@ -369,22 +369,22 @@ void OpenCLNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
this->numTiles = numTiles;
if (cutoff == -1.0)
return; // There are no nonbonded interactions in the System.
forceKernel.setArg<cl_uint>(7, startTileIndex);
forceKernel.setArg<cl_uint>(8, startTileIndex+numTiles);
forceKernel.setArg<cl_uint>(6, startTileIndex);
forceKernel.setArg<cl_uint>(7, startTileIndex+numTiles);
if (useCutoff) {
findInteractingBlocksKernel.setArg<cl_uint>(10, startTileIndex);
findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles);
}
else
forceKernel.setArg<cl_uint>(9, numTiles);
forceKernel.setArg<cl_uint>(8, numTiles);
}
cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const {
map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source;
int localDataSize = 7*sizeof(cl_float);
const string suffixes[] = {"x", "y", "z", "w"};
stringstream localData;
int localDataSize = 0;
for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1)
localData<<params[i].getType()<<" "<<params[i].getName()<<";\n";
......@@ -394,10 +394,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
}
localDataSize += params[i].getSize();
}
if ((localDataSize/4)%2 == 0) {
localData << "float padding;\n";
localDataSize += 4;
}
replacements["ATOM_PARAMETER_DATA"] = localData.str();
stringstream args;
for (int i = 0; i < (int) params.size(); i++) {
......@@ -487,6 +483,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
if ((localDataSize/4)%2 == 0)
defines["PARAMETER_SIZE_IS_EVEN"] = "1";
string file;
if (deviceIsCpu)
file = OpenCLKernelSources::nonbonded_cpu;
......@@ -509,7 +507,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionIndices->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionRowIndices->getDeviceBuffer());
kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize*localDataSize : forceThreadBlockSize*localDataSize), NULL);
kernel.setArg<cl_uint>(index++, startTileIndex);
kernel.setArg<cl_uint>(index++, startTileIndex+numTiles);
if (useCutoff) {
......
......@@ -12,7 +12,6 @@ typedef struct {
*/
__kernel void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData1* restrict localData,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else
......@@ -27,6 +26,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
unsigned int lasty = 0xFFFFFFFF;
__local AtomData1 localData[TILE_SIZE];
while (pos < end) {
// Extract the coordinates of this tile
......@@ -196,7 +196,6 @@ typedef struct {
__kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else
......@@ -212,6 +211,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local AtomData2 localData[TILE_SIZE];
while (pos < end) {
// Extract the coordinates of this tile
......
......@@ -2,9 +2,7 @@
typedef struct {
float x, y, z;
float q;
float radius, scaledRadius;
float bornSum;
} AtomData1;
/**
......@@ -13,7 +11,6 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData1* restrict localData,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
......@@ -28,7 +25,9 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
unsigned int lasty = 0xFFFFFFFF;
__local float tempBuffer[FORCE_WORK_GROUP_SIZE/2];
__local AtomData1 localData[TILE_SIZE];
__local float localBornSum[FORCE_WORK_GROUP_SIZE];
__local float localTemp[TILE_SIZE];
while (pos < end) {
// Extract the coordinates of this tile
......@@ -51,7 +50,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx;
float bornSum = 0.0f;
float4 posq1 = posq[atom1];
......@@ -59,12 +58,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
if (x == y) {
// This tile is on the diagonal.
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
if (get_local_id(0) < TILE_SIZE) {
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
}
barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0.0f);
......@@ -96,7 +96,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[tgx] = bornSum;
localTemp[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
......@@ -104,8 +104,9 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset] += bornSum+tempBuffer[tgx];
global_bornSum[offset] += bornSum+localTemp[tgx];
}
// barrier not required here as localTemp is not accessed before encountering another barrier.
}
else {
// This is an off-diagonal tile.
......@@ -116,19 +117,18 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
}
localData[get_local_id(0)].bornSum = 0.0f;
localBornSum[get_local_id(0)] = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
unsigned int tj = tgx%(TILE_SIZE/2);
unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[baseLocalAtom+tj].x-posq1.x, localData[baseLocalAtom+tj].y-posq1.y, localData[baseLocalAtom+tj].z-posq1.z, 0.0f);
float4 delta = (float4) (localData[tj].x-posq1.x, localData[tj].y-posq1.y, localData[tj].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
......@@ -136,13 +136,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
#else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS);
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
float2 params2 = (float2) (localData[baseLocalAtom+tj].radius, localData[baseLocalAtom+tj].scaledRadius);
float2 params2 = (float2) (localData[tj].radius, localData[tj].scaledRadius);
float rScaledRadiusJ = r+params2.y;
{
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
......@@ -165,16 +165,16 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
term += select(0.0f, 2.0f*(RECIP(params2.x)-l_ij), params2.x < params1.y-r);
localData[baseLocalAtom+tj+forceBufferOffset].bornSum += select(0.0f, term, includeInteraction && params2.x < rScaledRadiusI);
localBornSum[tj+localForceOffset] += select(0.0f, term, includeInteraction && params2.x < rScaledRadiusI);
}
barrier(CLK_LOCAL_MEM_FENCE);
tj = (tj+1)%(TILE_SIZE/2);
tj = (tj+1) & (TILE_SIZE-1);
}
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[tgx] = bornSum;
localTemp[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
......@@ -187,22 +187,28 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
// Do both loads before both stores to minimize store-load waits.
float sum1 = global_bornSum[offset1];
float sum2 = global_bornSum[offset2];
sum1 += bornSum + tempBuffer[tgx];
sum2 += localData[get_local_id(0)].bornSum + localData[get_local_id(0)+TILE_SIZE].bornSum;
sum1 += bornSum + localTemp[tgx];
sum2 += localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE];
global_bornSum[offset1] = sum1;
global_bornSum[offset2] = sum2;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
lasty = y;
pos++;
}
}
typedef struct {
float x, y, z, w;
float padding;
} PaddedUnalignedFloat4;
typedef struct {
float x, y, z;
float q;
float fx, fy, fz, fw;
float bornRadius;
float temp_x, temp_y, temp_z, temp_w;
} AtomData2;
/**
......@@ -212,7 +218,6 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
......@@ -228,7 +233,8 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE/2];
__local AtomData2 localData[TILE_SIZE];
__local PaddedUnalignedFloat4 localForce[FORCE_WORK_GROUP_SIZE];
while (pos < end) {
// Extract the coordinates of this tile
......@@ -251,7 +257,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
......@@ -259,11 +265,13 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
if (x == y) {
// This tile is on the diagonal.
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1;
if (get_local_id(0) < TILE_SIZE) {
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1;
}
barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS);
......@@ -300,8 +308,12 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[tgx] = force;
if (get_local_id(0) >= TILE_SIZE) {
localData[tgx].temp_x = force.x;
localData[tgx].temp_y = force.y;
localData[tgx].temp_z = force.z;
localData[tgx].temp_w = force.w;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
......@@ -312,11 +324,14 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum = forceBuffers[offset];
float global_sum = global_bornForce[offset];
sum.xyz += force.xyz + tempBuffer[tgx].xyz;
global_sum += force.w + tempBuffer[tgx].w;
sum.x += force.x + localData[tgx].temp_x;
sum.y += force.y + localData[tgx].temp_y;
sum.z += force.z + localData[tgx].temp_z;
global_sum += force.w + localData[tgx].temp_w;
forceBuffers[offset] = sum;
global_bornForce[offset] = global_sum;
}
// barrier not required here as localData[*]/temp_* is not accessed before encountering another barrier.
}
else {
// This is an off-diagonal tile.
......@@ -330,18 +345,18 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
localData[get_local_id(0)].q = tempPosq.w;
localData[get_local_id(0)].bornRadius = global_bornRadii[j];
}
localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f;
localForce[get_local_id(0)].x = 0.0f;
localForce[get_local_id(0)].y = 0.0f;
localForce[get_local_id(0)].z = 0.0f;
localForce[get_local_id(0)].w = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
unsigned int tj = tgx%(TILE_SIZE/2);
unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS);
float4 posq2 = (float4) (localData[baseLocalAtom+tj].x, localData[baseLocalAtom+tj].y, localData[baseLocalAtom+tj].z, localData[baseLocalAtom+tj].q);
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
float4 posq2 = (float4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
......@@ -351,7 +366,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
float bornRadius2 = localData[baseLocalAtom+tj].bornRadius;
float bornRadius2 = localData[tj].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
......@@ -370,18 +385,22 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
energy += select(0.0f, tempEnergy, includeInteraction);
delta.xyz *= select(0.0f, dEdR, includeInteraction);
force.xyz -= delta.xyz;
localData[baseLocalAtom+tj+forceBufferOffset].fx += delta.x;
localData[baseLocalAtom+tj+forceBufferOffset].fy += delta.y;
localData[baseLocalAtom+tj+forceBufferOffset].fz += delta.z;
localData[baseLocalAtom+tj+forceBufferOffset].fw += select(0.0f, dGpol_dalpha2_ij*bornRadius1, includeInteraction);
localForce[tj+localForceOffset].x += delta.x;
localForce[tj+localForceOffset].y += delta.y;
localForce[tj+localForceOffset].z += delta.z;
localForce[tj+localForceOffset].w += select(0.0f, dGpol_dalpha2_ij*bornRadius1, includeInteraction);
barrier(CLK_LOCAL_MEM_FENCE);
tj = (tj+1)%(TILE_SIZE/2);
tj = (tj+1) & (TILE_SIZE-1);
}
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[tgx] = force;
if (get_local_id(0) >= TILE_SIZE) {
localData[tgx].temp_x = force.x;
localData[tgx].temp_y = force.y;
localData[tgx].temp_z = force.z;
localData[tgx].temp_w = force.w;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
......@@ -396,17 +415,20 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
float4 sum2 = forceBuffers[offset2];
float global_sum1 = global_bornForce[offset1];
float global_sum2 = global_bornForce[offset2];
sum1.xyz += force.xyz + tempBuffer[tgx].xyz;
global_sum1 += force.w + tempBuffer[tgx].w;
sum2.x += localData[get_local_id(0)].fx + localData[get_local_id(0)+TILE_SIZE].fx;
sum2.y += localData[get_local_id(0)].fy + localData[get_local_id(0)+TILE_SIZE].fy;
sum2.z += localData[get_local_id(0)].fz + localData[get_local_id(0)+TILE_SIZE].fz;
global_sum2 += localData[get_local_id(0)].fw + localData[get_local_id(0)+TILE_SIZE].fw;
sum1.x += force.x + localData[tgx].temp_x;
sum1.y += force.y + localData[tgx].temp_y;
sum1.z += force.z + localData[tgx].temp_z;
global_sum1 += force.w + localData[tgx].temp_w;
sum2.x += localForce[get_local_id(0)].x + localForce[get_local_id(0)+TILE_SIZE].x;
sum2.y += localForce[get_local_id(0)].y + localForce[get_local_id(0)+TILE_SIZE].y;
sum2.z += localForce[get_local_id(0)].z + localForce[get_local_id(0)+TILE_SIZE].z;
global_sum2 += localForce[get_local_id(0)].w + localForce[get_local_id(0)+TILE_SIZE].w;
forceBuffers[offset1] = sum1;
forceBuffers[offset2] = sum2;
global_bornForce[offset1] = global_sum1;
global_bornForce[offset2] = global_sum2;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
lasty = y;
pos++;
......
......@@ -22,7 +22,6 @@ __kernel void computeBornSum(
__global float* restrict global_bornSum,
#endif
__global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData1* restrict localData,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else
......@@ -40,6 +39,7 @@ __kernel void computeBornSum(
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
unsigned int lasty = 0xFFFFFFFF;
__local AtomData1 localData[FORCE_WORK_GROUP_SIZE];
__local float tempBuffer[FORCE_WORK_GROUP_SIZE];
__local int2 reservedBlocks[WARPS_PER_GROUP];
__local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
......@@ -344,7 +344,6 @@ __kernel void computeGBSAForce1(
__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
#endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else
......@@ -363,6 +362,7 @@ __kernel void computeGBSAForce1(
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local AtomData2 localData[FORCE_WORK_GROUP_SIZE];
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE];
__local int2 reservedBlocks[WARPS_PER_GROUP];
__local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
......
......@@ -12,7 +12,7 @@ typedef struct {
*/
__kernel void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
......@@ -30,6 +30,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local AtomData localData[TILE_SIZE];
while (pos < end) {
// Extract the coordinates of this tile
......
#define TILE_SIZE 32
// Cannot use float3 as OpenCL defines it to be 4 DWORD aligned. This would
// cause every element of array to have DWORD of padding to make it 4 DWORD
// aligned which wastes space and causes LDS bank conflicts as stride is no
// longer odd DWORDS.
typedef struct {
float x, y, z;
} UnalignedFloat3;
typedef struct {
float x, y, z;
float q;
float fx, fy, fz;
ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
float padding;
#endif
} AtomData;
/**
......@@ -13,7 +24,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
......@@ -31,9 +42,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local float tempBuffer[3*(FORCE_WORK_GROUP_SIZE/2)];
__local AtomData localData[TILE_SIZE];
__local UnalignedFloat3 localForce[FORCE_WORK_GROUP_SIZE];
#ifdef USE_EXCLUSIONS
__local unsigned int exclusionRange[2];
__local int exclusionIndex[1];
#endif
while (pos < end) {
// Extract the coordinates of this tile
......@@ -56,7 +70,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
......@@ -79,12 +93,14 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
if (get_local_id(0) < TILE_SIZE) {
const unsigned int localAtomIndex = tgx;
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
}
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
......@@ -93,7 +109,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = baseLocalAtom+j;
unsigned int atom2 = baseLocalAtom+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
......@@ -125,14 +141,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
// Sum the forces and write results.
int bufferIndex = 3*tgx;
if (get_local_id(0) >= TILE_SIZE) {
tempBuffer[bufferIndex] = force.x;
tempBuffer[bufferIndex+1] = force.y;
tempBuffer[bufferIndex+2] = force.z;
localData[tgx].fx = force.x;
localData[tgx].fy = force.y;
localData[tgx].fz = force.z;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
force.x += localData[tgx].fx;
force.y += localData[tgx].fy;
force.z += localData[tgx].fz;
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
......@@ -140,15 +158,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#endif
// Cheaper to load/store float4 than float3.
float4 sum = forceBuffers[offset];
sum += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);
sum.xyz += force.xyz;
forceBuffers[offset] = sum;
}
// barrier not required here as localData[*].temp is not accessed before encountering another barrier.
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y && localAtomIndex < TILE_SIZE) {
if (lasty != y && get_local_id(0) < TILE_SIZE) {
const unsigned int localAtomIndex = tgx;
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
......@@ -157,26 +176,23 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
localForce[get_local_id(0)].x = 0.0f;
localForce[get_local_id(0)].y = 0.0f;
localForce[get_local_id(0)].z = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
excl = (excl >> baseLocalAtom) & 0xFFFF;
excl += excl << 16;
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
excl = (excl >> tj) | (excl << (TILE_SIZE - tj));
#endif
unsigned int tj = tgx%(TILE_SIZE/2);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = baseLocalAtom+tj;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 posq2 = (float4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
......@@ -186,8 +202,9 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
int atom2 = tj;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+tj;
atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
#else
......@@ -200,29 +217,28 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#ifdef USE_SYMMETRIC
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
localData[baseLocalAtom+tj+forceBufferOffset].fx += delta.x;
localData[baseLocalAtom+tj+forceBufferOffset].fy += delta.y;
localData[baseLocalAtom+tj+forceBufferOffset].fz += delta.z;
localForce[tj+localForceOffset].x += delta.x;
localForce[tj+localForceOffset].y += delta.y;
localForce[tj+localForceOffset].z += delta.z;
#else
force.xyz -= dEdR1.xyz;
localData[baseLocalAtom+tj+forceBufferOffset].fx += dEdR2.x;
localData[baseLocalAtom+tj+forceBufferOffset].fy += dEdR2.y;
localData[baseLocalAtom+tj+forceBufferOffset].fz += dEdR2.z;
localForce[tj+localForceOffset].x += dEdR2.x;
localForce[tj+localForceOffset].y += dEdR2.y;
localForce[tj+localForceOffset].z += dEdR2.z;
#endif
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj+1)%(TILE_SIZE/2);
tj = (tj+1) & (TILE_SIZE-1);
}
// Sum the forces and write results.
int bufferIndex = 3*tgx;
if (get_local_id(0) >= TILE_SIZE) {
tempBuffer[bufferIndex] = force.x;
tempBuffer[bufferIndex+1] = force.y;
tempBuffer[bufferIndex+2] = force.z;
localData[tgx].fx = force.x;
localData[tgx].fy = force.y;
localData[tgx].fz = force.z;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
......@@ -236,11 +252,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum1 = forceBuffers[offset1];
float4 sum2 = forceBuffers[offset2];
sum1 += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);
sum2 += (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx, localData[get_local_id(0)].fy+localData[get_local_id(0)+TILE_SIZE].fy, localData[get_local_id(0)].fz+localData[get_local_id(0)+TILE_SIZE].fz, 0.0f);
sum1.x += localData[tgx].fx + force.x;
sum1.y += localData[tgx].fy + force.y;
sum1.z += localData[tgx].fz + force.z;
sum2.x += localForce[tgx].x + localForce[tgx+TILE_SIZE].x;
sum2.y += localForce[tgx].y + localForce[tgx+TILE_SIZE].y;
sum2.z += localForce[tgx].z + localForce[tgx+TILE_SIZE].z;
forceBuffers[offset1] = sum1;
forceBuffers[offset2] = sum2;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
lasty = y;
pos++;
......
......@@ -10,6 +10,9 @@ typedef struct {
float q;
float fx, fy, fz;
ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
float padding;
#endif
} AtomData;
/**
......@@ -22,7 +25,7 @@ __kernel void computeNonbonded(
__global float4* restrict forceBuffers,
#endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
......@@ -41,6 +44,7 @@ __kernel void computeNonbonded(
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
float energy = 0.0f;
__local AtomData localData[FORCE_WORK_GROUP_SIZE];
__local float tempBuffer[3*FORCE_WORK_GROUP_SIZE];
__local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment