Commit cd6cb66a authored by Peter Eastman's avatar Peter Eastman
Browse files

Restructured local memory in nonbonded kernel to reduce bank conflicts

parent 466f7ac1
...@@ -282,16 +282,23 @@ void OpenCLNonbondedUtilities::computeInteractions() { ...@@ -282,16 +282,23 @@ void OpenCLNonbondedUtilities::computeInteractions() {
cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions) const { cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions) const {
map<string, string> replacements; map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source; replacements["COMPUTE_INTERACTION"] = source;
int localDataSize = 2*sizeof(cl_float4);
stringstream localData;
for (int i = 0; i < (int) params.size(); i++) {
localData << params[i].getType() << " " << params[i].getName() << ";\n";
localDataSize += params[i].getSize();
}
if ((localDataSize/4)%2 == 0) {
localData << "float padding;\n";
localDataSize += 4;
}
replacements["ATOM_PARAMETER_DATA"] = localData.str();
stringstream args; stringstream args;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
args << ", __global "; args << ", __global ";
args << params[i].getType(); args << params[i].getType();
args << "* global_"; args << "* global_";
args << params[i].getName(); args << params[i].getName();
args << ", __local ";
args << params[i].getType();
args << "* local_";
args << params[i].getName();
} }
for (int i = 0; i < (int) arguments.size(); i++) { for (int i = 0; i < (int) arguments.size(); i++) {
if (arguments[i].getMemory().getInfo<CL_MEM_TYPE>() == CL_MEM_OBJECT_IMAGE2D) { if (arguments[i].getMemory().getInfo<CL_MEM_TYPE>() == CL_MEM_OBJECT_IMAGE2D) {
...@@ -311,18 +318,18 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -311,18 +318,18 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
replacements["PARAMETER_ARGUMENTS"] = args.str(); replacements["PARAMETER_ARGUMENTS"] = args.str();
stringstream loadLocal1; stringstream loadLocal1;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
loadLocal1 << "local_"; loadLocal1 << "localData[get_local_id(0)].";
loadLocal1 << params[i].getName(); loadLocal1 << params[i].getName();
loadLocal1 << "[get_local_id(0)] = "; loadLocal1 << " = ";
loadLocal1 << params[i].getName(); loadLocal1 << params[i].getName();
loadLocal1 << "1;\n"; loadLocal1 << "1;\n";
} }
replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str(); replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
stringstream loadLocal2; stringstream loadLocal2;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
loadLocal2 << "local_"; loadLocal2 << "localData[get_local_id(0)].";
loadLocal2 << params[i].getName(); loadLocal2 << params[i].getName();
loadLocal2 << "[get_local_id(0)] = global_"; loadLocal2 << " = global_";
loadLocal2 << params[i].getName(); loadLocal2 << params[i].getName();
loadLocal2 << "[j];\n"; loadLocal2 << "[j];\n";
} }
...@@ -342,9 +349,9 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -342,9 +349,9 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
load2j << params[i].getType(); load2j << params[i].getType();
load2j << " "; load2j << " ";
load2j << params[i].getName(); load2j << params[i].getName();
load2j << "2 = local_"; load2j << "2 = localData[atom2].";
load2j << params[i].getName(); load2j << params[i].getName();
load2j << "[atom2];\n"; load2j << ";\n";
} }
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str(); replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
map<string, string> defines; map<string, string> defines;
...@@ -374,32 +381,28 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -374,32 +381,28 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
// Set arguments to the Kernel. // Set arguments to the Kernel.
kernel.setArg<cl::Buffer>(0, context.getForceBuffers().getDeviceBuffer()); int index = 0;
kernel.setArg<cl::Buffer>(1, context.getEnergyBuffer().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
kernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(3, exclusions->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
kernel.setArg<cl::Buffer>(4, exclusionIndex->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer());
kernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); kernel.setArg<cl::Buffer>(index++, exclusionIndex->getDeviceBuffer());
kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); kernel.setArg(index++, OpenCLContext::ThreadBlockSize*(2*sizeof(cl_float4)+localDataSize), NULL);
kernel.setArg(7, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); kernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
int paramBase = 10;
if (useCutoff) { if (useCutoff) {
paramBase = 11; kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer());
kernel.setArg<cl::Buffer>(8, interactingTiles->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, interactionFlags->getDeviceBuffer());
kernel.setArg<cl::Buffer>(9, interactionFlags->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer());
kernel.setArg<cl::Buffer>(10, interactionCount->getDeviceBuffer());
} }
else { else {
kernel.setArg<cl::Buffer>(8, tiles->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, tiles->getDeviceBuffer());
kernel.setArg<cl_uint>(9, tiles->getSize()); kernel.setArg<cl_uint>(index++, tiles->getSize());
} }
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
kernel.setArg<cl::Memory>(i*2+paramBase, params[i].getMemory()); kernel.setArg<cl::Memory>(index++, params[i].getMemory());
kernel.setArg(i*2+paramBase+1, OpenCLContext::ThreadBlockSize*params[i].getSize(), NULL);
} }
paramBase += 2*params.size();
for (int i = 0; i < (int) arguments.size(); i++) { for (int i = 0; i < (int) arguments.size(); i++) {
kernel.setArg<cl::Memory>(i+paramBase, arguments[i].getMemory()); kernel.setArg<cl::Memory>(index++, arguments[i].getMemory());
} }
return kernel; return kernel;
} }
#define TILE_SIZE 32 #define TILE_SIZE 32
typedef struct {
float4 posq;
float4 force;
ATOM_PARAMETER_DATA
} AtomData;
/** /**
* Compute nonbonded interactions. * Compute nonbonded interactions.
*/ */
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, __kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __local float4* local_posq, __local float4* local_force, __local float4* tempBuffer, __global unsigned int* tiles, __global unsigned int* exclusionIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount __global unsigned int* interactionFlags, __global unsigned int* interactionCount
#else #else
...@@ -36,7 +42,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -36,7 +42,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1; localData[get_local_id(0)].posq = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE; unsigned int xi = x/TILE_SIZE;
...@@ -49,7 +55,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -49,7 +55,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = baseLocalAtom+j; int atom2 = baseLocalAtom+j;
float4 posq2 = local_posq[atom2]; float4 posq2 = localData[atom2].posq;
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -89,10 +95,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -89,10 +95,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if (lasty != y && get_local_id(0) < TILE_SIZE) { if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx; unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j]; localData[get_local_id(0)].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
local_force[get_local_id(0)] = 0.0f; localData[get_local_id(0)].force = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
...@@ -111,7 +117,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -111,7 +117,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = baseLocalAtom+tj; int atom2 = baseLocalAtom+tj;
float4 posq2 = local_posq[atom2]; float4 posq2 = localData[atom2].posq;
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -129,7 +135,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -129,7 +135,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
local_force[baseLocalAtom+tj+forceBufferOffset].xyz += delta.xyz; localData[baseLocalAtom+tj+forceBufferOffset].force.xyz += delta.xyz;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
excl >>= 1; excl >>= 1;
tj = (tj+1)%(TILE_SIZE/2); tj = (tj+1)%(TILE_SIZE/2);
...@@ -149,7 +155,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -149,7 +155,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz; forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz; forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+localData[get_local_id(0)].force.xyz+localData[get_local_id(0)+TILE_SIZE].force.xyz;
} }
lasty = y; lasty = y;
} }
......
#define TILE_SIZE 32 #define TILE_SIZE 32
typedef struct {
float4 posq;
float4 force;
ATOM_PARAMETER_DATA
} AtomData;
/** /**
* Compute nonbonded interactions. * Compute nonbonded interactions.
*/ */
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, __kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __local float4* local_posq, __local float4* local_force, __local float4* tempBuffer, __global unsigned int* tiles, __global unsigned int* exclusionIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount __global unsigned int* interactionFlags, __global unsigned int* interactionCount
#else #else
...@@ -37,7 +43,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -37,7 +43,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1; localData[get_local_id(0)].posq = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
unsigned int xi = x/TILE_SIZE; unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2; unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
...@@ -49,7 +55,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -49,7 +55,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+j; int atom2 = tbx+j;
float4 posq2 = local_posq[atom2]; float4 posq2 = localData[atom2].posq;
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -83,13 +89,13 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -83,13 +89,13 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if (lasty != y) { if (lasty != y) {
unsigned int j = y + tgx; unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j]; localData[get_local_id(0)].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
local_force[get_local_id(0)] = 0.0f; localData[get_local_id(0)].force = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos]; unsigned int flags = interactionFlags[pos];
if (!hasExclusions && flags != 0xFFFFFFFF) { if (!hasExclusions && flags != 0xFFFFFFFF && flags == 0) {
if (flags == 0) { if (flags == 0) {
// No interactions in this tile. // No interactions in this tile.
} }
...@@ -100,7 +106,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -100,7 +106,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if ((flags&(1<<j)) != 0) { if ((flags&(1<<j)) != 0) {
bool isExcluded = false; bool isExcluded = false;
int atom2 = tbx+j; int atom2 = tbx+j;
float4 posq2 = local_posq[atom2]; float4 posq2 = localData[atom2].posq;
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -131,7 +137,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -131,7 +137,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if (tgx % 16 == 0) if (tgx % 16 == 0)
tempBuffer[get_local_id(0)].xyz += tempBuffer[get_local_id(0)+8].xyz; tempBuffer[get_local_id(0)].xyz += tempBuffer[get_local_id(0)+8].xyz;
if (tgx == 0) if (tgx == 0)
local_force[tbx+j].xyz += tempBuffer[get_local_id(0)].xyz + tempBuffer[get_local_id(0)+16].xyz; localData[tbx+j].force.xyz += tempBuffer[get_local_id(0)].xyz + tempBuffer[get_local_id(0)+16].xyz;
} }
} }
} }
...@@ -154,7 +160,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -154,7 +160,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+tj; int atom2 = tbx+tj;
float4 posq2 = local_posq[atom2]; float4 posq2 = localData[atom2].posq;
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -172,7 +178,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -172,7 +178,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
local_force[tbx+tj].xyz += delta.xyz; localData[tbx+tj].force.xyz += delta.xyz;
excl >>= 1; excl >>= 1;
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
...@@ -187,7 +193,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -187,7 +193,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz += force.xyz; forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz; forceBuffers[offset2].xyz += localData[get_local_id(0)].force.xyz;
lasty = y; lasty = y;
} }
pos++; pos++;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment