Commit 7e9cf94b authored by Peter Eastman's avatar Peter Eastman
Browse files

Fixed an error on AMD's OpenCL implementation

parent e43f14bf
......@@ -56,7 +56,6 @@ void OpenCLCalcForcesAndEnergyKernel::beginForceComputation(ContextImpl& context
void OpenCLCalcForcesAndEnergyKernel::finishForceComputation(ContextImpl& context) {
cl.getNonbondedUtilities().computeInteractions();
cl.reduceBuffer(cl.getForceBuffers(), cl.getNumForceBuffers());
cl.getNonbondedUtilities().prepareInteractions();
}
void OpenCLCalcForcesAndEnergyKernel::beginEnergyComputation(ContextImpl& context) {
......
const unsigned int TileSize = 32;
#define TILE_SIZE 32
/**
* Compute nonbonded interactions.
......@@ -23,12 +23,12 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
while (pos < end) {
// Extract the coordinates of this tile
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TileSize;
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
bool hasExclusions = (x & 0x1);
x = (x>>17)*TileSize;
unsigned int baseLocalAtom = (get_local_id(0) < TileSize ? 0 : TileSize/2);
unsigned int tgx = get_local_id(0) & (TileSize-1);
unsigned int forceBufferOffset = (tgx < TileSize/2 ? 0 : TileSize);
x = (x>>17)*TILE_SIZE;
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
......@@ -39,12 +39,12 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
local_posq[get_local_id(0)] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TileSize;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx] >> baseLocalAtom;
#endif
for (unsigned int j = 0; j < TileSize/2; j++) {
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
......@@ -72,92 +72,84 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
// Sum the forces and write results.
if (get_local_id(0) >= TileSize)
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[get_local_id(0)] = force;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TileSize) {
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset].xyz += force.xyz+tempBuffer[get_local_id(0)+TileSize].xyz;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
}
}
else {
// This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TileSize) {
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_force[get_local_id(0)] = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos];
if (!hasExclusions && flags == 0) {
// No interactions in this tile.
}
else
#endif
{
// Compute the full set of interactions in this tile.
unsigned int xi = x/TileSize;
unsigned int yi = y/TileSize;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
// Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TileSize - tgx));
excl >>= baseLocalAtom;
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
excl >>= baseLocalAtom;
#endif
unsigned int tj = tgx%(TileSize/2);
for (unsigned int j = 0; j < TileSize/2; j++) {
unsigned int tj = tgx%(TILE_SIZE/2);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = baseLocalAtom+tj;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
int atom2 = baseLocalAtom+tj;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
float invR = 1.0f/r;
LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+tj;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
local_force[baseLocalAtom+tj+forceBufferOffset].xyz += delta.xyz;
barrier(CLK_LOCAL_MEM_FENCE);
excl >>= 1;
tj = (tj+1)%(TileSize/2);
}
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
float invR = 1.0f/r;
LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+tj;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
local_force[baseLocalAtom+tj+forceBufferOffset].xyz += delta.xyz;
barrier(CLK_LOCAL_MEM_FENCE);
excl >>= 1;
tj = (tj+1)%(TILE_SIZE/2);
}
// Sum the forces and write results.
if (get_local_id(0) >= TileSize)
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[get_local_id(0)] = force;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TileSize) {
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz+tempBuffer[get_local_id(0)+TileSize].xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TileSize].xyz;
forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz;
}
lasty = y;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment