Commit d2a5b3bb authored by Peter Eastman
Browse files

Minor performance optimizations

parent 592dc5a9
...@@ -25,9 +25,9 @@ if (!isExcluded || needCorrection) { ...@@ -25,9 +25,9 @@ if (!isExcluded || needCorrection) {
float sig2 = invR*sig; float sig2 = invR*sig;
sig2 *= sig2; sig2 *= sig2;
float sig6 = sig2*sig2*sig2; float sig6 = sig2*sig2*sig2;
float eps = sigmaEpsilon1.y*sigmaEpsilon2.y; float epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
tempForce = eps*(12.0f*sig6 - 6.0f)*sig6 + prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI); tempForce = epssig6*(12.0f*sig6 - 6.0f) + prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += eps*(sig6 - 1.0f)*sig6 + prefactor*erfcAlphaR; tempEnergy += epssig6*(sig6 - 1.0f) + prefactor*erfcAlphaR;
#else #else
tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI); tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += prefactor*erfcAlphaR; tempEnergy += prefactor*erfcAlphaR;
...@@ -49,9 +49,9 @@ if (!isExcluded || needCorrection) { ...@@ -49,9 +49,9 @@ if (!isExcluded || needCorrection) {
float sig2 = invR*sig; float sig2 = invR*sig;
sig2 *= sig2; sig2 *= sig2;
float sig6 = sig2*sig2*sig2; float sig6 = sig2*sig2*sig2;
float eps = sigmaEpsilon1.y*sigmaEpsilon2.y; float epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
tempForce = eps*(12.0f*sig6 - 6.0f)*sig6; tempForce = epssig6*(12.0f*sig6 - 6.0f);
tempEnergy += select(0.0f, eps*(sig6 - 1.0f)*sig6, includeInteraction); tempEnergy += select(0.0f, epssig6*(sig6 - 1.0f), includeInteraction);
#endif #endif
#if HAS_COULOMB #if HAS_COULOMB
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
......
...@@ -71,52 +71,38 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s ...@@ -71,52 +71,38 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE) for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
sum[i] = temp[i].y; sum[i] = temp[i].y;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int numValid = sum[BUFFER_SIZE-1];
barrier(CLK_LOCAL_MEM_FENCE);
// Compact the buffer. // Compact the buffer.
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE) for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
if (valid[i]) { if (valid[i]) {
temp[sum[i]-1] = buffer[i]; temp[sum[i]-1] = buffer[i];
sum[i] = valid[i];
valid[i] = false; valid[i] = false;
buffer[i] = (ushort2) 1;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int numValid = sum[BUFFER_SIZE-1];
#ifndef WARPS_ARE_ATOMIC
// Filter the list of tiles by comparing the distance from each atom to the other bounding box. // Filter the list of tiles by comparing the distance from each atom to the other bounding box.
// We only do this if we aren't already optimizing the computation using flags.
int tile;
int index = get_local_id(0)&(TILE_SIZE-1); int index = get_local_id(0)&(TILE_SIZE-1);
int group = get_local_id(0)/TILE_SIZE; int group = get_local_id(0)/TILE_SIZE;
__local int* flag = sum;
int lasty = -1;
float4 center, boxSize, pos; float4 center, boxSize, pos;
for (tile = 0; tile < numValid; ) { for (int tile = 0; tile < numValid; tile++) {
int x = temp[tile].x; int x = temp[tile].x;
int y = temp[tile].y; int y = temp[tile].y;
if (x == y) { if (x == y)
tile++;
continue; continue;
}
if (index == 0)
flag[group] = true;
barrier(CLK_LOCAL_MEM_FENCE);
// Load an atom position and the bounding box the other block. // Load an atom position and the bounding box the other block.
if (group == 0) { center = blockCenter[(group == 0 ? x : y)];
center = blockCenter[x]; boxSize = blockBoundingBox[(group == 0 ? x : y)];
boxSize = blockBoundingBox[x]; pos = posq[(group == 0 ? y : x)*TILE_SIZE+index];
if (y != lasty)
pos = posq[y*TILE_SIZE+index];
}
else {
if (y != lasty) {
center = blockCenter[y];
boxSize = blockBoundingBox[y];
}
pos = posq[x*TILE_SIZE+index];
}
lasty = y;
// Find the distance of the atom from the bounding box. // Find the distance of the atom from the bounding box.
...@@ -127,6 +113,7 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s ...@@ -127,6 +113,7 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
delta = max((float4) 0.0f, fabs(delta)-boxSize); delta = max((float4) 0.0f, fabs(delta)-boxSize);
__local ushort* flag = (__local ushort*) &buffer[tile];
if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared) if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared)
flag[group] = false; flag[group] = false;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -136,11 +123,11 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s ...@@ -136,11 +123,11 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
numValid--; numValid--;
if (get_local_id(0) == 0) if (get_local_id(0) == 0)
temp[tile] = temp[numValid]; temp[tile] = temp[numValid];
tile--;
} }
else
tile++;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
#endif
// Store it to global memory. // Store it to global memory.
......
...@@ -40,7 +40,6 @@ __kernel void computeNonbonded( ...@@ -40,7 +40,6 @@ __kernel void computeNonbonded(
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps; unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif #endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2*WARPS_PER_GROUP]; __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP]; __local int exclusionIndex[WARPS_PER_GROUP];
__local int2* reservedBlocks = (__local int2*) exclusionRange; __local int2* reservedBlocks = (__local int2*) exclusionRange;
...@@ -141,15 +140,13 @@ __kernel void computeNonbonded( ...@@ -141,15 +140,13 @@ __kernel void computeNonbonded(
// This is an off-diagonal tile. // This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0); const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y) { unsigned int j = y*TILE_SIZE + tgx;
unsigned int j = y*TILE_SIZE + tgx; float4 tempPosq = posq[j];
float4 tempPosq = posq[j]; localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].x = tempPosq.x; localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].y = tempPosq.y; localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].z = tempPosq.z; localData[localAtomIndex].q = tempPosq.w;
localData[localAtomIndex].q = tempPosq.w; LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData[localAtomIndex].fx = 0.0f; localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f; localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f; localData[localAtomIndex].fz = 0.0f;
...@@ -353,7 +350,6 @@ __kernel void computeNonbonded( ...@@ -353,7 +350,6 @@ __kernel void computeNonbonded(
} }
} }
#endif #endif
lasty = y;
pos++; pos++;
} while (pos < end); } while (pos < end);
energyBuffer[get_global_id(0)] += energy; energyBuffer[get_global_id(0)] += energy;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment