Commit fed50628 authored by Peter Eastman's avatar Peter Eastman
Browse files

Tony's optimizations to reduce local memory use

parent 2dd09317
...@@ -1563,7 +1563,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1563,7 +1563,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel.setArg<cl::Buffer>(index++, (useLong ? longBornSum->getDeviceBuffer() : bornSum->getDeviceBuffer())); computeBornSumKernel.setArg<cl::Buffer>(index++, (useLong ? longBornSum->getDeviceBuffer() : bornSum->getDeviceBuffer()));
computeBornSumKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer());
computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*7*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...@@ -1585,7 +1584,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1585,7 +1584,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, bornRadii->getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, bornRadii->getDeviceBuffer());
force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*9*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...@@ -1624,19 +1622,19 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1624,19 +1622,19 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel.setArg<cl::Buffer>(index++, obcChain->getDeviceBuffer()); reduceBornForceKernel.setArg<cl::Buffer>(index++, obcChain->getDeviceBuffer());
} }
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<mm_float4>(6, cl.getPeriodicBoxSize()); computeBornSumKernel.setArg<mm_float4>(5, cl.getPeriodicBoxSize());
computeBornSumKernel.setArg<mm_float4>(7, cl.getInvPeriodicBoxSize()); computeBornSumKernel.setArg<mm_float4>(6, cl.getInvPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(8, cl.getPeriodicBoxSize()); force1Kernel.setArg<mm_float4>(7, cl.getPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(9, cl.getInvPeriodicBoxSize()); force1Kernel.setArg<mm_float4>(8, cl.getInvPeriodicBoxSize());
if (maxTiles < nb.getInteractingTiles().getSize()) { if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize(); maxTiles = nb.getInteractingTiles().getSize();
computeBornSumKernel.setArg<cl::Buffer>(5, nb.getInteractingTiles().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(4, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(8, maxTiles); computeBornSumKernel.setArg<cl_uint>(7, maxTiles);
force1Kernel.setArg<cl::Buffer>(6, nb.getInteractingTiles().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(5, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(10, maxTiles); force1Kernel.setArg<cl_uint>(9, maxTiles);
if (cl.getSIMDWidth() == 32 || deviceIsCpu) { if (cl.getSIMDWidth() == 32 || deviceIsCpu) {
computeBornSumKernel.setArg<cl::Buffer>(9, nb.getInteractionFlags().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(11, nb.getInteractionFlags().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(10, nb.getInteractionFlags().getDeviceBuffer());
} }
} }
} }
......
...@@ -326,8 +326,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -326,8 +326,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void OpenCLNonbondedUtilities::computeInteractions() { void OpenCLNonbondedUtilities::computeInteractions() {
if (cutoff != -1.0) { if (cutoff != -1.0) {
if (useCutoff) { if (useCutoff) {
forceKernel.setArg<mm_float4>(11, context.getPeriodicBoxSize()); forceKernel.setArg<mm_float4>(10, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(12, context.getInvPeriodicBoxSize()); forceKernel.setArg<mm_float4>(11, context.getInvPeriodicBoxSize());
} }
context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
} }
...@@ -349,14 +349,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() { ...@@ -349,14 +349,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
newSize = numTiles; newSize = numTiles;
delete interactingTiles; delete interactingTiles;
interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles"); interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles");
forceKernel.setArg<cl::Buffer>(9, interactingTiles->getDeviceBuffer()); forceKernel.setArg<cl::Buffer>(8, interactingTiles->getDeviceBuffer());
forceKernel.setArg<cl_uint>(13, newSize); forceKernel.setArg<cl_uint>(12, newSize);
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(9, newSize); findInteractingBlocksKernel.setArg<cl_uint>(9, newSize);
if (context.getSIMDWidth() == 32 || deviceIsCpu) { if (context.getSIMDWidth() == 32 || deviceIsCpu) {
delete interactionFlags; delete interactionFlags;
interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags"); interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags");
forceKernel.setArg<cl::Buffer>(14, interactionFlags->getDeviceBuffer()); forceKernel.setArg<cl::Buffer>(13, interactionFlags->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
...@@ -369,22 +369,22 @@ void OpenCLNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) { ...@@ -369,22 +369,22 @@ void OpenCLNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
this->numTiles = numTiles; this->numTiles = numTiles;
if (cutoff == -1.0) if (cutoff == -1.0)
return; // There are no nonbonded interactions in the System. return; // There are no nonbonded interactions in the System.
forceKernel.setArg<cl_uint>(7, startTileIndex); forceKernel.setArg<cl_uint>(6, startTileIndex);
forceKernel.setArg<cl_uint>(8, startTileIndex+numTiles); forceKernel.setArg<cl_uint>(7, startTileIndex+numTiles);
if (useCutoff) { if (useCutoff) {
findInteractingBlocksKernel.setArg<cl_uint>(10, startTileIndex); findInteractingBlocksKernel.setArg<cl_uint>(10, startTileIndex);
findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles); findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles);
} }
else else
forceKernel.setArg<cl_uint>(9, numTiles); forceKernel.setArg<cl_uint>(8, numTiles);
} }
cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const { cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const {
map<string, string> replacements; map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source; replacements["COMPUTE_INTERACTION"] = source;
int localDataSize = 7*sizeof(cl_float);
const string suffixes[] = {"x", "y", "z", "w"}; const string suffixes[] = {"x", "y", "z", "w"};
stringstream localData; stringstream localData;
int localDataSize = 0;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) if (params[i].getNumComponents() == 1)
localData<<params[i].getType()<<" "<<params[i].getName()<<";\n"; localData<<params[i].getType()<<" "<<params[i].getName()<<";\n";
...@@ -394,10 +394,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -394,10 +394,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
} }
localDataSize += params[i].getSize(); localDataSize += params[i].getSize();
} }
if ((localDataSize/4)%2 == 0) {
localData << "float padding;\n";
localDataSize += 4;
}
replacements["ATOM_PARAMETER_DATA"] = localData.str(); replacements["ATOM_PARAMETER_DATA"] = localData.str();
stringstream args; stringstream args;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
...@@ -487,6 +483,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -487,6 +483,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms()); defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks()); defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
if ((localDataSize/4)%2 == 0)
defines["PARAMETER_SIZE_IS_EVEN"] = "1";
string file; string file;
if (deviceIsCpu) if (deviceIsCpu)
file = OpenCLKernelSources::nonbonded_cpu; file = OpenCLKernelSources::nonbonded_cpu;
...@@ -509,7 +507,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -509,7 +507,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionIndices->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, exclusionIndices->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionRowIndices->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, exclusionRowIndices->getDeviceBuffer());
kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize*localDataSize : forceThreadBlockSize*localDataSize), NULL);
kernel.setArg<cl_uint>(index++, startTileIndex); kernel.setArg<cl_uint>(index++, startTileIndex);
kernel.setArg<cl_uint>(index++, startTileIndex+numTiles); kernel.setArg<cl_uint>(index++, startTileIndex+numTiles);
if (useCutoff) { if (useCutoff) {
......
...@@ -12,7 +12,6 @@ typedef struct { ...@@ -12,7 +12,6 @@ typedef struct {
*/ */
__kernel void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params, __kernel void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData1* restrict localData,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else #else
...@@ -27,6 +26,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c ...@@ -27,6 +26,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif #endif
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local AtomData1 localData[TILE_SIZE];
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
...@@ -196,7 +196,6 @@ typedef struct { ...@@ -196,7 +196,6 @@ typedef struct {
__kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce, __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else #else
...@@ -212,6 +211,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global ...@@ -212,6 +211,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
#endif #endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local AtomData2 localData[TILE_SIZE];
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
......
...@@ -2,9 +2,7 @@ ...@@ -2,9 +2,7 @@
typedef struct { typedef struct {
float x, y, z; float x, y, z;
float q;
float radius, scaledRadius; float radius, scaledRadius;
float bornSum;
} AtomData1; } AtomData1;
/** /**
...@@ -13,7 +11,6 @@ typedef struct { ...@@ -13,7 +11,6 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params, void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData1* restrict localData,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else #else
...@@ -28,7 +25,9 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -28,7 +25,9 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif #endif
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local float tempBuffer[FORCE_WORK_GROUP_SIZE/2]; __local AtomData1 localData[TILE_SIZE];
__local float localBornSum[FORCE_WORK_GROUP_SIZE];
__local float localTemp[TILE_SIZE];
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
...@@ -51,7 +50,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -51,7 +50,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
} }
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float bornSum = 0.0f; float bornSum = 0.0f;
float4 posq1 = posq[atom1]; float4 posq1 = posq[atom1];
...@@ -59,12 +58,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -59,12 +58,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
if (get_local_id(0) < TILE_SIZE) {
localData[get_local_id(0)].x = posq1.x; localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y; localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z; localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].radius = params1.x; localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y; localData[get_local_id(0)].scaledRadius = params1.y;
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0.0f); float4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0.0f);
...@@ -96,7 +96,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -96,7 +96,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
// Sum the forces and write results. // Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE) if (get_local_id(0) >= TILE_SIZE)
tempBuffer[tgx] = bornSum; localTemp[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...@@ -104,8 +104,9 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -104,8 +104,9 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
#else #else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
global_bornSum[offset] += bornSum+tempBuffer[tgx]; global_bornSum[offset] += bornSum+localTemp[tgx];
} }
// barrier not required here as localTemp is not accessed before encountering another barrier.
} }
else { else {
// This is an off-diagonal tile. // This is an off-diagonal tile.
...@@ -116,19 +117,18 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -116,19 +117,18 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
localData[get_local_id(0)].x = tempPosq.x; localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z; localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
float2 tempParams = global_params[j]; float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x; localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y; localData[get_local_id(0)].scaledRadius = tempParams.y;
} }
localData[get_local_id(0)].bornSum = 0.0f; localBornSum[get_local_id(0)] = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int tj = tgx%(TILE_SIZE/2); unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[baseLocalAtom+tj].x-posq1.x, localData[baseLocalAtom+tj].y-posq1.y, localData[baseLocalAtom+tj].z-posq1.z, 0.0f); float4 delta = (float4) (localData[tj].x-posq1.x, localData[tj].y-posq1.y, localData[tj].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
...@@ -136,13 +136,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -136,13 +136,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
#else #else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
#endif #endif
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
float2 params2 = (float2) (localData[baseLocalAtom+tj].radius, localData[baseLocalAtom+tj].scaledRadius); float2 params2 = (float2) (localData[tj].radius, localData[tj].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
{ {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y))); float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
...@@ -165,16 +165,16 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -165,16 +165,16 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2); (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
term += select(0.0f, 2.0f*(RECIP(params2.x)-l_ij), params2.x < params1.y-r); term += select(0.0f, 2.0f*(RECIP(params2.x)-l_ij), params2.x < params1.y-r);
localData[baseLocalAtom+tj+forceBufferOffset].bornSum += select(0.0f, term, includeInteraction && params2.x < rScaledRadiusI); localBornSum[tj+localForceOffset] += select(0.0f, term, includeInteraction && params2.x < rScaledRadiusI);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
tj = (tj+1)%(TILE_SIZE/2); tj = (tj+1) & (TILE_SIZE-1);
} }
// Sum the forces and write results. // Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE) if (get_local_id(0) >= TILE_SIZE)
tempBuffer[tgx] = bornSum; localTemp[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...@@ -187,22 +187,28 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa ...@@ -187,22 +187,28 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
// Do both loads before both stores to minimize store-load waits. // Do both loads before both stores to minimize store-load waits.
float sum1 = global_bornSum[offset1]; float sum1 = global_bornSum[offset1];
float sum2 = global_bornSum[offset2]; float sum2 = global_bornSum[offset2];
sum1 += bornSum + tempBuffer[tgx]; sum1 += bornSum + localTemp[tgx];
sum2 += localData[get_local_id(0)].bornSum + localData[get_local_id(0)+TILE_SIZE].bornSum; sum2 += localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE];
global_bornSum[offset1] = sum1; global_bornSum[offset1] = sum1;
global_bornSum[offset2] = sum2; global_bornSum[offset2] = sum2;
} }
barrier(CLK_LOCAL_MEM_FENCE);
} }
lasty = y; lasty = y;
pos++; pos++;
} }
} }
typedef struct {
float x, y, z, w;
float padding;
} PaddedUnalignedFloat4;
typedef struct { typedef struct {
float x, y, z; float x, y, z;
float q; float q;
float fx, fy, fz, fw;
float bornRadius; float bornRadius;
float temp_x, temp_y, temp_z, temp_w;
} AtomData2; } AtomData2;
/** /**
...@@ -212,7 +218,6 @@ typedef struct { ...@@ -212,7 +218,6 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce, void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else #else
...@@ -228,7 +233,8 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -228,7 +233,8 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
#endif #endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE/2]; __local AtomData2 localData[TILE_SIZE];
__local PaddedUnalignedFloat4 localForce[FORCE_WORK_GROUP_SIZE];
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
...@@ -251,7 +257,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -251,7 +257,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
} }
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f; float4 force = 0.0f;
float4 posq1 = posq[atom1]; float4 posq1 = posq[atom1];
...@@ -259,11 +265,13 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -259,11 +265,13 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
if (get_local_id(0) < TILE_SIZE) {
localData[get_local_id(0)].x = posq1.x; localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y; localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z; localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w; localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1; localData[get_local_id(0)].bornRadius = bornRadius1;
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS);
...@@ -300,8 +308,12 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -300,8 +308,12 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
// Sum the forces and write results. // Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE) if (get_local_id(0) >= TILE_SIZE) {
tempBuffer[tgx] = force; localData[tgx].temp_x = force.x;
localData[tgx].temp_y = force.y;
localData[tgx].temp_z = force.z;
localData[tgx].temp_w = force.w;
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...@@ -312,11 +324,14 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -312,11 +324,14 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits. // Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum = forceBuffers[offset]; float4 sum = forceBuffers[offset];
float global_sum = global_bornForce[offset]; float global_sum = global_bornForce[offset];
sum.xyz += force.xyz + tempBuffer[tgx].xyz; sum.x += force.x + localData[tgx].temp_x;
global_sum += force.w + tempBuffer[tgx].w; sum.y += force.y + localData[tgx].temp_y;
sum.z += force.z + localData[tgx].temp_z;
global_sum += force.w + localData[tgx].temp_w;
forceBuffers[offset] = sum; forceBuffers[offset] = sum;
global_bornForce[offset] = global_sum; global_bornForce[offset] = global_sum;
} }
// barrier not required here as localData[*]/temp_* is not accessed before encountering another barrier.
} }
else { else {
// This is an off-diagonal tile. // This is an off-diagonal tile.
...@@ -330,18 +345,18 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -330,18 +345,18 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
localData[get_local_id(0)].q = tempPosq.w; localData[get_local_id(0)].q = tempPosq.w;
localData[get_local_id(0)].bornRadius = global_bornRadii[j]; localData[get_local_id(0)].bornRadius = global_bornRadii[j];
} }
localData[get_local_id(0)].fx = 0.0f; localForce[get_local_id(0)].x = 0.0f;
localData[get_local_id(0)].fy = 0.0f; localForce[get_local_id(0)].y = 0.0f;
localData[get_local_id(0)].fz = 0.0f; localForce[get_local_id(0)].z = 0.0f;
localData[get_local_id(0)].fw = 0.0f; localForce[get_local_id(0)].w = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int tj = tgx%(TILE_SIZE/2); unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
float4 posq2 = (float4) (localData[baseLocalAtom+tj].x, localData[baseLocalAtom+tj].y, localData[baseLocalAtom+tj].z, localData[baseLocalAtom+tj].q); float4 posq2 = (float4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
...@@ -351,7 +366,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -351,7 +366,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
float bornRadius2 = localData[baseLocalAtom+tj].bornRadius; float bornRadius2 = localData[tj].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij); float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij); float expTerm = EXP(-D_ij);
...@@ -370,18 +385,22 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -370,18 +385,22 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
energy += select(0.0f, tempEnergy, includeInteraction); energy += select(0.0f, tempEnergy, includeInteraction);
delta.xyz *= select(0.0f, dEdR, includeInteraction); delta.xyz *= select(0.0f, dEdR, includeInteraction);
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
localData[baseLocalAtom+tj+forceBufferOffset].fx += delta.x; localForce[tj+localForceOffset].x += delta.x;
localData[baseLocalAtom+tj+forceBufferOffset].fy += delta.y; localForce[tj+localForceOffset].y += delta.y;
localData[baseLocalAtom+tj+forceBufferOffset].fz += delta.z; localForce[tj+localForceOffset].z += delta.z;
localData[baseLocalAtom+tj+forceBufferOffset].fw += select(0.0f, dGpol_dalpha2_ij*bornRadius1, includeInteraction); localForce[tj+localForceOffset].w += select(0.0f, dGpol_dalpha2_ij*bornRadius1, includeInteraction);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
tj = (tj+1)%(TILE_SIZE/2); tj = (tj+1) & (TILE_SIZE-1);
} }
// Sum the forces and write results. // Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE) if (get_local_id(0) >= TILE_SIZE) {
tempBuffer[tgx] = force; localData[tgx].temp_x = force.x;
localData[tgx].temp_y = force.y;
localData[tgx].temp_z = force.z;
localData[tgx].temp_w = force.w;
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...@@ -396,17 +415,20 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r ...@@ -396,17 +415,20 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
float4 sum2 = forceBuffers[offset2]; float4 sum2 = forceBuffers[offset2];
float global_sum1 = global_bornForce[offset1]; float global_sum1 = global_bornForce[offset1];
float global_sum2 = global_bornForce[offset2]; float global_sum2 = global_bornForce[offset2];
sum1.xyz += force.xyz + tempBuffer[tgx].xyz; sum1.x += force.x + localData[tgx].temp_x;
global_sum1 += force.w + tempBuffer[tgx].w; sum1.y += force.y + localData[tgx].temp_y;
sum2.x += localData[get_local_id(0)].fx + localData[get_local_id(0)+TILE_SIZE].fx; sum1.z += force.z + localData[tgx].temp_z;
sum2.y += localData[get_local_id(0)].fy + localData[get_local_id(0)+TILE_SIZE].fy; global_sum1 += force.w + localData[tgx].temp_w;
sum2.z += localData[get_local_id(0)].fz + localData[get_local_id(0)+TILE_SIZE].fz; sum2.x += localForce[get_local_id(0)].x + localForce[get_local_id(0)+TILE_SIZE].x;
global_sum2 += localData[get_local_id(0)].fw + localData[get_local_id(0)+TILE_SIZE].fw; sum2.y += localForce[get_local_id(0)].y + localForce[get_local_id(0)+TILE_SIZE].y;
sum2.z += localForce[get_local_id(0)].z + localForce[get_local_id(0)+TILE_SIZE].z;
global_sum2 += localForce[get_local_id(0)].w + localForce[get_local_id(0)+TILE_SIZE].w;
forceBuffers[offset1] = sum1; forceBuffers[offset1] = sum1;
forceBuffers[offset2] = sum2; forceBuffers[offset2] = sum2;
global_bornForce[offset1] = global_sum1; global_bornForce[offset1] = global_sum1;
global_bornForce[offset2] = global_sum2; global_bornForce[offset2] = global_sum2;
} }
barrier(CLK_LOCAL_MEM_FENCE);
} }
lasty = y; lasty = y;
pos++; pos++;
......
...@@ -22,7 +22,6 @@ __kernel void computeBornSum( ...@@ -22,7 +22,6 @@ __kernel void computeBornSum(
__global float* restrict global_bornSum, __global float* restrict global_bornSum,
#endif #endif
__global const float4* restrict posq, __global const float2* restrict global_params, __global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData1* restrict localData,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags, __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else #else
...@@ -40,6 +39,7 @@ __kernel void computeBornSum( ...@@ -40,6 +39,7 @@ __kernel void computeBornSum(
unsigned int end = (warp+1)*numTiles/totalWarps; unsigned int end = (warp+1)*numTiles/totalWarps;
#endif #endif
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local AtomData1 localData[FORCE_WORK_GROUP_SIZE];
__local float tempBuffer[FORCE_WORK_GROUP_SIZE]; __local float tempBuffer[FORCE_WORK_GROUP_SIZE];
__local int2 reservedBlocks[WARPS_PER_GROUP]; __local int2 reservedBlocks[WARPS_PER_GROUP];
__local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks; __local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
...@@ -344,7 +344,6 @@ __kernel void computeGBSAForce1( ...@@ -344,7 +344,6 @@ __kernel void computeGBSAForce1(
__global float4* restrict forceBuffers, __global float* restrict global_bornForce, __global float4* restrict forceBuffers, __global float* restrict global_bornForce,
#endif #endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags, __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else #else
...@@ -363,6 +362,7 @@ __kernel void computeGBSAForce1( ...@@ -363,6 +362,7 @@ __kernel void computeGBSAForce1(
#endif #endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local AtomData2 localData[FORCE_WORK_GROUP_SIZE];
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE]; __local float4 tempBuffer[FORCE_WORK_GROUP_SIZE];
__local int2 reservedBlocks[WARPS_PER_GROUP]; __local int2 reservedBlocks[WARPS_PER_GROUP];
__local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks; __local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
......
...@@ -12,7 +12,7 @@ typedef struct { ...@@ -12,7 +12,7 @@ typedef struct {
*/ */
__kernel void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions, __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
...@@ -30,6 +30,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global ...@@ -30,6 +30,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
#endif #endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local AtomData localData[TILE_SIZE];
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
......
#define TILE_SIZE 32 #define TILE_SIZE 32
// Cannot use float3 as OpenCL defines it to be 4 DWORD aligned. This would
// cause every element of array to have DWORD of padding to make it 4 DWORD
// aligned which wastes space and causes LDS bank conflicts as stride is no
// longer odd DWORDS.
typedef struct {
float x, y, z;
} UnalignedFloat3;
typedef struct { typedef struct {
float x, y, z; float x, y, z;
float q; float q;
float fx, fy, fz; float fx, fy, fz;
ATOM_PARAMETER_DATA ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
float padding;
#endif
} AtomData; } AtomData;
/** /**
...@@ -13,7 +24,7 @@ typedef struct { ...@@ -13,7 +24,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions, void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
...@@ -31,9 +42,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -31,9 +42,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#endif #endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local float tempBuffer[3*(FORCE_WORK_GROUP_SIZE/2)]; __local AtomData localData[TILE_SIZE];
__local UnalignedFloat3 localForce[FORCE_WORK_GROUP_SIZE];
#ifdef USE_EXCLUSIONS
__local unsigned int exclusionRange[2]; __local unsigned int exclusionRange[2];
__local int exclusionIndex[1]; __local int exclusionIndex[1];
#endif
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
...@@ -56,7 +70,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -56,7 +70,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
} }
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f; float4 force = 0.0f;
float4 posq1 = posq[atom1]; float4 posq1 = posq[atom1];
...@@ -79,12 +93,14 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -79,12 +93,14 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0); if (get_local_id(0) < TILE_SIZE) {
const unsigned int localAtomIndex = tgx;
localData[localAtomIndex].x = posq1.x; localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y; localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z; localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w; localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom; unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
...@@ -93,7 +109,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -93,7 +109,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = baseLocalAtom+j; unsigned int atom2 = baseLocalAtom+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -125,14 +141,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -125,14 +141,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
// Sum the forces and write results. // Sum the forces and write results.
int bufferIndex = 3*tgx;
if (get_local_id(0) >= TILE_SIZE) { if (get_local_id(0) >= TILE_SIZE) {
tempBuffer[bufferIndex] = force.x; localData[tgx].fx = force.x;
tempBuffer[bufferIndex+1] = force.y; localData[tgx].fy = force.y;
tempBuffer[bufferIndex+2] = force.z; localData[tgx].fz = force.z;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
force.x += localData[tgx].fx;
force.y += localData[tgx].fy;
force.z += localData[tgx].fz;
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
...@@ -140,15 +158,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -140,15 +158,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#endif #endif
// Cheaper to load/store float4 than float3. // Cheaper to load/store float4 than float3.
float4 sum = forceBuffers[offset]; float4 sum = forceBuffers[offset];
sum += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f); sum.xyz += force.xyz;
forceBuffers[offset] = sum; forceBuffers[offset] = sum;
} }
// barrier not required here as localData[*].temp is not accessed before encountering another barrier.
} }
else { else {
// This is an off-diagonal tile. // This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0); if (lasty != y && get_local_id(0) < TILE_SIZE) {
if (lasty != y && localAtomIndex < TILE_SIZE) { const unsigned int localAtomIndex = tgx;
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j]; float4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x; localData[localAtomIndex].x = tempPosq.x;
...@@ -157,26 +176,23 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -157,26 +176,23 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
localData[localAtomIndex].q = tempPosq.w; localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
localData[localAtomIndex].fx = 0.0f; localForce[get_local_id(0)].x = 0.0f;
localData[localAtomIndex].fy = 0.0f; localForce[get_local_id(0)].y = 0.0f;
localData[localAtomIndex].fz = 0.0f; localForce[get_local_id(0)].z = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF); unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
excl = (excl >> baseLocalAtom) & 0xFFFF; excl = (excl >> tj) | (excl << (TILE_SIZE - tj));
excl += excl << 16;
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif #endif
unsigned int tj = tgx%(TILE_SIZE/2);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = baseLocalAtom+tj; float4 posq2 = (float4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
...@@ -186,8 +202,9 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -186,8 +202,9 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
int atom2 = tj;
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+tj; atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; float dEdR = 0.0f;
#else #else
...@@ -200,29 +217,28 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -200,29 +217,28 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
localData[baseLocalAtom+tj+forceBufferOffset].fx += delta.x; localForce[tj+localForceOffset].x += delta.x;
localData[baseLocalAtom+tj+forceBufferOffset].fy += delta.y; localForce[tj+localForceOffset].y += delta.y;
localData[baseLocalAtom+tj+forceBufferOffset].fz += delta.z; localForce[tj+localForceOffset].z += delta.z;
#else #else
force.xyz -= dEdR1.xyz; force.xyz -= dEdR1.xyz;
localData[baseLocalAtom+tj+forceBufferOffset].fx += dEdR2.x; localForce[tj+localForceOffset].x += dEdR2.x;
localData[baseLocalAtom+tj+forceBufferOffset].fy += dEdR2.y; localForce[tj+localForceOffset].y += dEdR2.y;
localData[baseLocalAtom+tj+forceBufferOffset].fz += dEdR2.z; localForce[tj+localForceOffset].z += dEdR2.z;
#endif #endif
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl >>= 1; excl >>= 1;
#endif #endif
tj = (tj+1)%(TILE_SIZE/2); tj = (tj+1) & (TILE_SIZE-1);
} }
// Sum the forces and write results. // Sum the forces and write results.
int bufferIndex = 3*tgx;
if (get_local_id(0) >= TILE_SIZE) { if (get_local_id(0) >= TILE_SIZE) {
tempBuffer[bufferIndex] = force.x; localData[tgx].fx = force.x;
tempBuffer[bufferIndex+1] = force.y; localData[tgx].fy = force.y;
tempBuffer[bufferIndex+2] = force.z; localData[tgx].fz = force.z;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
...@@ -236,11 +252,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re ...@@ -236,11 +252,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits. // Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum1 = forceBuffers[offset1]; float4 sum1 = forceBuffers[offset1];
float4 sum2 = forceBuffers[offset2]; float4 sum2 = forceBuffers[offset2];
sum1 += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f); sum1.x += localData[tgx].fx + force.x;
sum2 += (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx, localData[get_local_id(0)].fy+localData[get_local_id(0)+TILE_SIZE].fy, localData[get_local_id(0)].fz+localData[get_local_id(0)+TILE_SIZE].fz, 0.0f); sum1.y += localData[tgx].fy + force.y;
sum1.z += localData[tgx].fz + force.z;
sum2.x += localForce[tgx].x + localForce[tgx+TILE_SIZE].x;
sum2.y += localForce[tgx].y + localForce[tgx+TILE_SIZE].y;
sum2.z += localForce[tgx].z + localForce[tgx+TILE_SIZE].z;
forceBuffers[offset1] = sum1; forceBuffers[offset1] = sum1;
forceBuffers[offset2] = sum2; forceBuffers[offset2] = sum2;
} }
barrier(CLK_LOCAL_MEM_FENCE);
} }
lasty = y; lasty = y;
pos++; pos++;
......
...@@ -10,6 +10,9 @@ typedef struct { ...@@ -10,6 +10,9 @@ typedef struct {
float q; float q;
float fx, fy, fz; float fx, fy, fz;
ATOM_PARAMETER_DATA ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
float padding;
#endif
} AtomData; } AtomData;
/** /**
...@@ -22,7 +25,7 @@ __kernel void computeNonbonded( ...@@ -22,7 +25,7 @@ __kernel void computeNonbonded(
__global float4* restrict forceBuffers, __global float4* restrict forceBuffers,
#endif #endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
...@@ -41,6 +44,7 @@ __kernel void computeNonbonded( ...@@ -41,6 +44,7 @@ __kernel void computeNonbonded(
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps; unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif #endif
float energy = 0.0f; float energy = 0.0f;
__local AtomData localData[FORCE_WORK_GROUP_SIZE];
__local float tempBuffer[3*FORCE_WORK_GROUP_SIZE]; __local float tempBuffer[3*FORCE_WORK_GROUP_SIZE];
__local unsigned int exclusionRange[2*WARPS_PER_GROUP]; __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP]; __local int exclusionIndex[WARPS_PER_GROUP];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment