Commit bd7cce70 authored by Peter Eastman's avatar Peter Eastman
Browse files

Minor optimizations to OpenCL

parent 803af89b
......@@ -83,7 +83,7 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
numAtoms = numParticles;
paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
numThreadBlocks = 4*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
numThreadBlocks = 6*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
nonbonded = new OpenCLNonbondedUtilities(*this);
posq = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "posq", true);
velm = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "velm", true);
......
......@@ -3,13 +3,13 @@ if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUA
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
float invRSquared = 1.0f/r2;
float invRSquared = RECIP(r2);
float rScaledRadiusJ = r+obcParams2.y;
float rScaledRadiusI = r+obcParams1.y;
float l_ijJ = 1.0f/max(obcParams1.x, fabs(r-obcParams2.y));
float l_ijI = 1.0f/max(obcParams2.x, fabs(r-obcParams1.y));
float u_ijJ = 1.0f/rScaledRadiusJ;
float u_ijI = 1.0f/rScaledRadiusI;
float l_ijJ = RECIP(max(obcParams1.x, fabs(r-obcParams2.y)));
float l_ijI = RECIP(max(obcParams2.x, fabs(r-obcParams1.y)));
float u_ijJ = RECIP(rScaledRadiusJ);
float u_ijI = RECIP(rScaledRadiusI);
float l_ij2J = l_ijJ*l_ijJ;
float l_ij2I = l_ijI*l_ijI;
float u_ij2J = u_ijJ*u_ijJ;
......@@ -22,12 +22,8 @@ if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
float t3I = t2I*invR;
t1J *= invR;
t1I *= invR;
if (obcParams1.x < rScaledRadiusJ) {
float term = 0.125f*(1.0f+obcParams2.y*obcParams2.y*invRSquared)*t3J + 0.25f*t1J*invRSquared;
dEdR += bornForce1*term;
}
if (obcParams2.x < rScaledRadiusJ) {
float term = 0.125f*(1.0f+obcParams1.y*obcParams1.y*invRSquared)*t3I + 0.25f*t1I*invRSquared;
dEdR += bornForce2*term;
}
float term1 = 0.125f*(1.0f+obcParams2.y*obcParams2.y*invRSquared)*t3J + 0.25f*t1J*invRSquared;
float term2 = 0.125f*(1.0f+obcParams1.y*obcParams1.y*invRSquared)*t3I + 0.25f*t1I*invRSquared;
dEdR += (obcParams1.x < rScaledRadiusJ ? bornForce1*term1 : 0.0f);
dEdR += (obcParams2.x < rScaledRadiusJ ? bornForce2*term2 : 0.0f);
}
......@@ -71,15 +71,15 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2 params2 = (float2) (localData[baseLocalAtom+j].radius, localData[baseLocalAtom+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
float u_ij = 1.0f/rScaledRadiusJ;
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
bornSum += 2.0f*(1.0f/params1.x-l_ij);
bornSum += 2.0f*RECIP(params1.x-l_ij);
}
}
}
......@@ -139,27 +139,27 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2 params2 = (float2) (localData[baseLocalAtom+tj].radius, localData[baseLocalAtom+tj].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
float u_ij = 1.0f/rScaledRadiusJ;
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
bornSum += 2.0f*(1.0f/params1.x-l_ij);
bornSum += 2.0f*RECIP(params1.x-l_ij);
}
float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
float l_ij = 1.0f/max(params2.x, fabs(r-params1.y));
float u_ij = 1.0f/rScaledRadiusI;
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
term += 2.0f*(1.0f/params2.x-l_ij);
term += 2.0f*RECIP(params2.x-l_ij);
localData[baseLocalAtom+tj+forceBufferOffset].bornSum += term;
}
}
......
......@@ -71,15 +71,15 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
float u_ij = 1.0f/rScaledRadiusJ;
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
bornSum += 2.0f*(1.0f/params1.x-l_ij);
bornSum += 2.0f*RECIP(params1.x-l_ij);
}
}
}
......@@ -136,27 +136,27 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
float u_ij = 1.0f/rScaledRadiusJ;
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
bornSum += 2.0f*(1.0f/params1.x-l_ij);
bornSum += 2.0f*RECIP(params1.x-l_ij);
}
float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
float l_ij = 1.0f/max(params2.x, fabs(r-params1.y));
float u_ij = 1.0f/rScaledRadiusI;
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
term += 2.0f*(1.0f/params2.x-l_ij);
term += 2.0f*RECIP(params2.x-l_ij);
tempBuffer[get_local_id(0)] = term;
}
}
......@@ -204,27 +204,27 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
float u_ij = 1.0f/rScaledRadiusJ;
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
bornSum += 2.0f*(1.0f/params1.x-l_ij);
bornSum += 2.0f*RECIP(params1.x-l_ij);
}
float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
float l_ij = 1.0f/max(params2.x, fabs(r-params1.y));
float u_ij = 1.0f/rScaledRadiusI;
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
term += 2.0f*(1.0f/params2.x-l_ij);
term += 2.0f*RECIP(params2.x-l_ij);
localData[tbx+tj].bornSum += term;
}
}
......
......@@ -69,7 +69,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
float invR = 1.0f/r;
float invR = RECIP(r);
LOAD_ATOM2_PARAMETERS
atom2 = y+j;
#ifdef USE_SYMMETRIC
......
......@@ -15,9 +15,9 @@ __kernel void applySettle(int numClusters, float tol, __global float4* oldPos, _
float4 xp1 = posDelta[atoms.y];
float4 apos2 = oldPos[atoms.z];
float4 xp2 = posDelta[atoms.z];
float m0 = 1.0f/velm[atoms.x].w;
float m1 = 1.0f/velm[atoms.y].w;
float m2 = 1.0f/velm[atoms.z].w;
float m0 = RECIP(velm[atoms.x].w);
float m1 = RECIP(velm[atoms.y].w);
float m2 = RECIP(velm[atoms.z].w);
// Translate the molecule to the origin to improve numerical precision.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment