Commit cd6cb66a authored by Peter Eastman's avatar Peter Eastman
Browse files

Restructured local memory in nonbonded kernel to reduce bank conflicts

parent 466f7ac1
......@@ -282,16 +282,23 @@ void OpenCLNonbondedUtilities::computeInteractions() {
cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions) const {
map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source;
int localDataSize = 2*sizeof(cl_float4);
stringstream localData;
for (int i = 0; i < (int) params.size(); i++) {
localData << params[i].getType() << " " << params[i].getName() << ";\n";
localDataSize += params[i].getSize();
}
if ((localDataSize/4)%2 == 0) {
localData << "float padding;\n";
localDataSize += 4;
}
replacements["ATOM_PARAMETER_DATA"] = localData.str();
stringstream args;
for (int i = 0; i < (int) params.size(); i++) {
args << ", __global ";
args << params[i].getType();
args << "* global_";
args << params[i].getName();
args << ", __local ";
args << params[i].getType();
args << "* local_";
args << params[i].getName();
}
for (int i = 0; i < (int) arguments.size(); i++) {
if (arguments[i].getMemory().getInfo<CL_MEM_TYPE>() == CL_MEM_OBJECT_IMAGE2D) {
......@@ -311,18 +318,18 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
replacements["PARAMETER_ARGUMENTS"] = args.str();
stringstream loadLocal1;
for (int i = 0; i < (int) params.size(); i++) {
loadLocal1 << "local_";
loadLocal1 << "localData[get_local_id(0)].";
loadLocal1 << params[i].getName();
loadLocal1 << "[get_local_id(0)] = ";
loadLocal1 << " = ";
loadLocal1 << params[i].getName();
loadLocal1 << "1;\n";
}
replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
stringstream loadLocal2;
for (int i = 0; i < (int) params.size(); i++) {
loadLocal2 << "local_";
loadLocal2 << "localData[get_local_id(0)].";
loadLocal2 << params[i].getName();
loadLocal2 << "[get_local_id(0)] = global_";
loadLocal2 << " = global_";
loadLocal2 << params[i].getName();
loadLocal2 << "[j];\n";
}
......@@ -342,9 +349,9 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
load2j << params[i].getType();
load2j << " ";
load2j << params[i].getName();
load2j << "2 = local_";
load2j << "2 = localData[atom2].";
load2j << params[i].getName();
load2j << "[atom2];\n";
load2j << ";\n";
}
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
map<string, string> defines;
......@@ -374,32 +381,28 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
// Set arguments to the Kernel.
kernel.setArg<cl::Buffer>(0, context.getForceBuffers().getDeviceBuffer());
kernel.setArg<cl::Buffer>(1, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
kernel.setArg<cl::Buffer>(3, exclusions->getDeviceBuffer());
kernel.setArg<cl::Buffer>(4, exclusionIndex->getDeviceBuffer());
kernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
kernel.setArg(7, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
int paramBase = 10;
int index = 0;
kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionIndex->getDeviceBuffer());
kernel.setArg(index++, OpenCLContext::ThreadBlockSize*(2*sizeof(cl_float4)+localDataSize), NULL);
kernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
if (useCutoff) {
paramBase = 11;
kernel.setArg<cl::Buffer>(8, interactingTiles->getDeviceBuffer());
kernel.setArg<cl::Buffer>(9, interactionFlags->getDeviceBuffer());
kernel.setArg<cl::Buffer>(10, interactionCount->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactionFlags->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer());
}
else {
kernel.setArg<cl::Buffer>(8, tiles->getDeviceBuffer());
kernel.setArg<cl_uint>(9, tiles->getSize());
kernel.setArg<cl::Buffer>(index++, tiles->getDeviceBuffer());
kernel.setArg<cl_uint>(index++, tiles->getSize());
}
for (int i = 0; i < (int) params.size(); i++) {
kernel.setArg<cl::Memory>(i*2+paramBase, params[i].getMemory());
kernel.setArg(i*2+paramBase+1, OpenCLContext::ThreadBlockSize*params[i].getSize(), NULL);
kernel.setArg<cl::Memory>(index++, params[i].getMemory());
}
paramBase += 2*params.size();
for (int i = 0; i < (int) arguments.size(); i++) {
kernel.setArg<cl::Memory>(i+paramBase, arguments[i].getMemory());
kernel.setArg<cl::Memory>(index++, arguments[i].getMemory());
}
return kernel;
}
#define TILE_SIZE 32
typedef struct {
float4 posq;
float4 force;
ATOM_PARAMETER_DATA
} AtomData;
/**
* Compute nonbonded interactions.
*/
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __local float4* local_posq, __local float4* local_force, __local float4* tempBuffer, __global unsigned int* tiles,
__global unsigned int* exclusionIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount
#else
......@@ -36,7 +42,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
localData[get_local_id(0)].posq = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
......@@ -49,7 +55,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = baseLocalAtom+j;
float4 posq2 = local_posq[atom2];
float4 posq2 = localData[atom2].posq;
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
......@@ -89,10 +95,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j];
localData[get_local_id(0)].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_force[get_local_id(0)] = 0.0f;
localData[get_local_id(0)].force = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
......@@ -111,7 +117,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = baseLocalAtom+tj;
float4 posq2 = local_posq[atom2];
float4 posq2 = localData[atom2].posq;
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
......@@ -129,7 +135,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
local_force[baseLocalAtom+tj+forceBufferOffset].xyz += delta.xyz;
localData[baseLocalAtom+tj+forceBufferOffset].force.xyz += delta.xyz;
barrier(CLK_LOCAL_MEM_FENCE);
excl >>= 1;
tj = (tj+1)%(TILE_SIZE/2);
......@@ -149,7 +155,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz;
forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+localData[get_local_id(0)].force.xyz+localData[get_local_id(0)+TILE_SIZE].force.xyz;
}
lasty = y;
}
......
#define TILE_SIZE 32
typedef struct {
float4 posq;
float4 force;
ATOM_PARAMETER_DATA
} AtomData;
/**
* Compute nonbonded interactions.
*/
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __local float4* local_posq, __local float4* local_force, __local float4* tempBuffer, __global unsigned int* tiles,
__global unsigned int* exclusionIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount
#else
......@@ -37,7 +43,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
localData[get_local_id(0)].posq = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
......@@ -49,7 +55,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 posq2 = localData[atom2].posq;
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
......@@ -83,13 +89,13 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if (lasty != y) {
unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j];
localData[get_local_id(0)].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_force[get_local_id(0)] = 0.0f;
localData[get_local_id(0)].force = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos];
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (!hasExclusions && flags != 0xFFFFFFFF && flags == 0) {
if (flags == 0) {
// No interactions in this tile.
}
......@@ -100,7 +106,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if ((flags&(1<<j)) != 0) {
bool isExcluded = false;
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 posq2 = localData[atom2].posq;
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
......@@ -131,7 +137,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)].xyz += tempBuffer[get_local_id(0)+8].xyz;
if (tgx == 0)
local_force[tbx+j].xyz += tempBuffer[get_local_id(0)].xyz + tempBuffer[get_local_id(0)+16].xyz;
localData[tbx+j].force.xyz += tempBuffer[get_local_id(0)].xyz + tempBuffer[get_local_id(0)+16].xyz;
}
}
}
......@@ -154,7 +160,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+tj;
float4 posq2 = local_posq[atom2];
float4 posq2 = localData[atom2].posq;
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
......@@ -172,7 +178,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
local_force[tbx+tj].xyz += delta.xyz;
localData[tbx+tj].force.xyz += delta.xyz;
excl >>= 1;
tj = (tj + 1) & (TILE_SIZE - 1);
}
......@@ -187,7 +193,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz;
forceBuffers[offset2].xyz += localData[get_local_id(0)].force.xyz;
lasty = y;
}
pos++;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment