"...src/SimTKReference/ReferenceProperDihedralBond.cpp" did not exist on "76e2849ccf0aea4dd118a77e8d7d7e66b1107ab0"
Commit 7b3a8266 authored by Peter Eastman's avatar Peter Eastman
Browse files

Reduced bank conflicts in nonbonded kernel

parent cd6cb66a
...@@ -282,7 +282,7 @@ void OpenCLNonbondedUtilities::computeInteractions() { ...@@ -282,7 +282,7 @@ void OpenCLNonbondedUtilities::computeInteractions() {
cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions) const { cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions) const {
map<string, string> replacements; map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source; replacements["COMPUTE_INTERACTION"] = source;
int localDataSize = 2*sizeof(cl_float4); int localDataSize = 7*sizeof(cl_float);
stringstream localData; stringstream localData;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
localData << params[i].getType() << " " << params[i].getName() << ";\n"; localData << params[i].getType() << " " << params[i].getName() << ";\n";
......
#define TILE_SIZE 32 #define TILE_SIZE 32
typedef struct { typedef struct {
float4 posq; float x, y, z;
float4 force; float q;
float fx, fy, fz;
ATOM_PARAMETER_DATA ATOM_PARAMETER_DATA
} AtomData; } AtomData;
...@@ -42,7 +43,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -42,7 +43,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
localData[get_local_id(0)].posq = posq1; localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE; unsigned int xi = x/TILE_SIZE;
...@@ -55,7 +59,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -55,7 +59,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = baseLocalAtom+j; int atom2 = baseLocalAtom+j;
float4 posq2 = localData[atom2].posq; float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -95,10 +99,16 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -95,10 +99,16 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if (lasty != y && get_local_id(0) < TILE_SIZE) { if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx; unsigned int j = y + tgx;
localData[get_local_id(0)].posq = posq[j]; float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
localData[get_local_id(0)].force = 0.0f; localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
...@@ -117,7 +127,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -117,7 +127,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = baseLocalAtom+tj; int atom2 = baseLocalAtom+tj;
float4 posq2 = localData[atom2].posq; float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -135,7 +145,9 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -135,7 +145,9 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
localData[baseLocalAtom+tj+forceBufferOffset].force.xyz += delta.xyz; localData[baseLocalAtom+tj+forceBufferOffset].fx += delta.x;
localData[baseLocalAtom+tj+forceBufferOffset].fy += delta.y;
localData[baseLocalAtom+tj+forceBufferOffset].fz += delta.z;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
excl >>= 1; excl >>= 1;
tj = (tj+1)%(TILE_SIZE/2); tj = (tj+1)%(TILE_SIZE/2);
...@@ -155,7 +167,8 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -155,7 +167,8 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz; forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+localData[get_local_id(0)].force.xyz+localData[get_local_id(0)+TILE_SIZE].force.xyz; float4 sum = (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx, localData[get_local_id(0)].fy+localData[get_local_id(0)+TILE_SIZE].fy, localData[get_local_id(0)].fz+localData[get_local_id(0)+TILE_SIZE].fz, 0.0f);
forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+sum.xyz;
} }
lasty = y; lasty = y;
} }
......
#define TILE_SIZE 32 #define TILE_SIZE 32
typedef struct { typedef struct {
float4 posq; float x, y, z;
float4 force; float q;
float fx, fy, fz;
ATOM_PARAMETER_DATA ATOM_PARAMETER_DATA
} AtomData; } AtomData;
...@@ -43,7 +44,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -43,7 +44,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
localData[get_local_id(0)].posq = posq1; localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
unsigned int xi = x/TILE_SIZE; unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2; unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
...@@ -55,7 +59,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -55,7 +59,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+j; int atom2 = tbx+j;
float4 posq2 = localData[atom2].posq; float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -89,10 +93,16 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -89,10 +93,16 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if (lasty != y) { if (lasty != y) {
unsigned int j = y + tgx; unsigned int j = y + tgx;
localData[get_local_id(0)].posq = posq[j]; float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
localData[get_local_id(0)].force = 0.0f; localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos]; unsigned int flags = interactionFlags[pos];
if (!hasExclusions && flags != 0xFFFFFFFF && flags == 0) { if (!hasExclusions && flags != 0xFFFFFFFF && flags == 0) {
...@@ -106,7 +116,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -106,7 +116,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if ((flags&(1<<j)) != 0) { if ((flags&(1<<j)) != 0) {
bool isExcluded = false; bool isExcluded = false;
int atom2 = tbx+j; int atom2 = tbx+j;
float4 posq2 = localData[atom2].posq; float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -136,8 +146,11 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -136,8 +146,11 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
tempBuffer[get_local_id(0)].xyz += tempBuffer[get_local_id(0)+4].xyz; tempBuffer[get_local_id(0)].xyz += tempBuffer[get_local_id(0)+4].xyz;
if (tgx % 16 == 0) if (tgx % 16 == 0)
tempBuffer[get_local_id(0)].xyz += tempBuffer[get_local_id(0)+8].xyz; tempBuffer[get_local_id(0)].xyz += tempBuffer[get_local_id(0)+8].xyz;
if (tgx == 0) if (tgx == 0) {
localData[tbx+j].force.xyz += tempBuffer[get_local_id(0)].xyz + tempBuffer[get_local_id(0)+16].xyz; localData[tbx+j].fx += tempBuffer[get_local_id(0)].x + tempBuffer[get_local_id(0)+16].x;
localData[tbx+j].fy += tempBuffer[get_local_id(0)].y + tempBuffer[get_local_id(0)+16].y;
localData[tbx+j].fz += tempBuffer[get_local_id(0)].z + tempBuffer[get_local_id(0)+16].z;
}
} }
} }
} }
...@@ -160,7 +173,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -160,7 +173,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+tj; int atom2 = tbx+tj;
float4 posq2 = localData[atom2].posq; float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -178,7 +191,9 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -178,7 +191,9 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
localData[tbx+tj].force.xyz += delta.xyz; localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
excl >>= 1; excl >>= 1;
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
...@@ -193,7 +208,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -193,7 +208,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz += force.xyz; forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2].xyz += localData[get_local_id(0)].force.xyz; forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
lasty = y; lasty = y;
} }
pos++; pos++;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment