Commit bd7cce70 authored by Peter Eastman's avatar Peter Eastman
Browse files

Minor optimizations to OpenCL

parent 803af89b
...@@ -83,7 +83,7 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste ...@@ -83,7 +83,7 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
numAtoms = numParticles; numAtoms = numParticles;
paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize); paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize; numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
numThreadBlocks = 4*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); numThreadBlocks = 6*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
nonbonded = new OpenCLNonbondedUtilities(*this); nonbonded = new OpenCLNonbondedUtilities(*this);
posq = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "posq", true); posq = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "posq", true);
velm = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "velm", true); velm = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "velm", true);
......
...@@ -3,13 +3,13 @@ if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUA ...@@ -3,13 +3,13 @@ if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUA
#else #else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif #endif
float invRSquared = 1.0f/r2; float invRSquared = RECIP(r2);
float rScaledRadiusJ = r+obcParams2.y; float rScaledRadiusJ = r+obcParams2.y;
float rScaledRadiusI = r+obcParams1.y; float rScaledRadiusI = r+obcParams1.y;
float l_ijJ = 1.0f/max(obcParams1.x, fabs(r-obcParams2.y)); float l_ijJ = RECIP(max(obcParams1.x, fabs(r-obcParams2.y)));
float l_ijI = 1.0f/max(obcParams2.x, fabs(r-obcParams1.y)); float l_ijI = RECIP(max(obcParams2.x, fabs(r-obcParams1.y)));
float u_ijJ = 1.0f/rScaledRadiusJ; float u_ijJ = RECIP(rScaledRadiusJ);
float u_ijI = 1.0f/rScaledRadiusI; float u_ijI = RECIP(rScaledRadiusI);
float l_ij2J = l_ijJ*l_ijJ; float l_ij2J = l_ijJ*l_ijJ;
float l_ij2I = l_ijI*l_ijI; float l_ij2I = l_ijI*l_ijI;
float u_ij2J = u_ijJ*u_ijJ; float u_ij2J = u_ijJ*u_ijJ;
...@@ -22,12 +22,8 @@ if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { ...@@ -22,12 +22,8 @@ if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
float t3I = t2I*invR; float t3I = t2I*invR;
t1J *= invR; t1J *= invR;
t1I *= invR; t1I *= invR;
if (obcParams1.x < rScaledRadiusJ) { float term1 = 0.125f*(1.0f+obcParams2.y*obcParams2.y*invRSquared)*t3J + 0.25f*t1J*invRSquared;
float term = 0.125f*(1.0f+obcParams2.y*obcParams2.y*invRSquared)*t3J + 0.25f*t1J*invRSquared; float term2 = 0.125f*(1.0f+obcParams1.y*obcParams1.y*invRSquared)*t3I + 0.25f*t1I*invRSquared;
dEdR += bornForce1*term; dEdR += (obcParams1.x < rScaledRadiusJ ? bornForce1*term1 : 0.0f);
} dEdR += (obcParams2.x < rScaledRadiusJ ? bornForce2*term2 : 0.0f);
if (obcParams2.x < rScaledRadiusJ) {
float term = 0.125f*(1.0f+obcParams1.y*obcParams1.y*invRSquared)*t3I + 0.25f*t1I*invRSquared;
dEdR += bornForce2*term;
}
} }
...@@ -71,15 +71,15 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -71,15 +71,15 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2 params2 = (float2) (localData[baseLocalAtom+j].radius, localData[baseLocalAtom+j].scaledRadius); float2 params2 = (float2) (localData[baseLocalAtom+j].radius, localData[baseLocalAtom+j].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) { if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y)); float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = 1.0f/rScaledRadiusJ; float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = log(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r) if (params1.x < params2.x-r)
bornSum += 2.0f*(1.0f/params1.x-l_ij); bornSum += 2.0f*RECIP(params1.x-l_ij);
} }
} }
} }
...@@ -139,27 +139,27 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -139,27 +139,27 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2 params2 = (float2) (localData[baseLocalAtom+tj].radius, localData[baseLocalAtom+tj].scaledRadius); float2 params2 = (float2) (localData[baseLocalAtom+tj].radius, localData[baseLocalAtom+tj].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y)); float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = 1.0f/rScaledRadiusJ; float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = log(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r) if (params1.x < params2.x-r)
bornSum += 2.0f*(1.0f/params1.x-l_ij); bornSum += 2.0f*RECIP(params1.x-l_ij);
} }
float rScaledRadiusI = r+params1.y; float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) { if (params2.x < rScaledRadiusI) {
float l_ij = 1.0f/max(params2.x, fabs(r-params1.y)); float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = 1.0f/rScaledRadiusI; float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = log(u_ij / l_ij);
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2); (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r) if (params2.x < params1.x-r)
term += 2.0f*(1.0f/params2.x-l_ij); term += 2.0f*RECIP(params2.x-l_ij);
localData[baseLocalAtom+tj+forceBufferOffset].bornSum += term; localData[baseLocalAtom+tj+forceBufferOffset].bornSum += term;
} }
} }
......
...@@ -71,15 +71,15 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -71,15 +71,15 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius); float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) { if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y)); float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = 1.0f/rScaledRadiusJ; float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = log(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r) if (params1.x < params2.x-r)
bornSum += 2.0f*(1.0f/params1.x-l_ij); bornSum += 2.0f*RECIP(params1.x-l_ij);
} }
} }
} }
...@@ -136,27 +136,27 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -136,27 +136,27 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius); float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y)); float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = 1.0f/rScaledRadiusJ; float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = log(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r) if (params1.x < params2.x-r)
bornSum += 2.0f*(1.0f/params1.x-l_ij); bornSum += 2.0f*RECIP(params1.x-l_ij);
} }
float rScaledRadiusI = r+params1.y; float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) { if (params2.x < rScaledRadiusI) {
float l_ij = 1.0f/max(params2.x, fabs(r-params1.y)); float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = 1.0f/rScaledRadiusI; float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = log(u_ij / l_ij);
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2); (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r) if (params2.x < params1.x-r)
term += 2.0f*(1.0f/params2.x-l_ij); term += 2.0f*RECIP(params2.x-l_ij);
tempBuffer[get_local_id(0)] = term; tempBuffer[get_local_id(0)] = term;
} }
} }
...@@ -204,27 +204,27 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -204,27 +204,27 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius); float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y)); float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = 1.0f/rScaledRadiusJ; float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = log(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r) if (params1.x < params2.x-r)
bornSum += 2.0f*(1.0f/params1.x-l_ij); bornSum += 2.0f*RECIP(params1.x-l_ij);
} }
float rScaledRadiusI = r+params1.y; float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) { if (params2.x < rScaledRadiusI) {
float l_ij = 1.0f/max(params2.x, fabs(r-params1.y)); float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = 1.0f/rScaledRadiusI; float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = log(u_ij / l_ij);
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2); (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r) if (params2.x < params1.x-r)
term += 2.0f*(1.0f/params2.x-l_ij); term += 2.0f*RECIP(params2.x-l_ij);
localData[tbx+tj].bornSum += term; localData[tbx+tj].bornSum += term;
} }
} }
......
...@@ -69,7 +69,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -69,7 +69,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2); float r = sqrt(r2);
float invR = 1.0f/r; float invR = RECIP(r);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+j; atom2 = y+j;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
......
...@@ -15,9 +15,9 @@ __kernel void applySettle(int numClusters, float tol, __global float4* oldPos, _ ...@@ -15,9 +15,9 @@ __kernel void applySettle(int numClusters, float tol, __global float4* oldPos, _
float4 xp1 = posDelta[atoms.y]; float4 xp1 = posDelta[atoms.y];
float4 apos2 = oldPos[atoms.z]; float4 apos2 = oldPos[atoms.z];
float4 xp2 = posDelta[atoms.z]; float4 xp2 = posDelta[atoms.z];
float m0 = 1.0f/velm[atoms.x].w; float m0 = RECIP(velm[atoms.x].w);
float m1 = 1.0f/velm[atoms.y].w; float m1 = RECIP(velm[atoms.y].w);
float m2 = 1.0f/velm[atoms.z].w; float m2 = RECIP(velm[atoms.z].w);
// Translate the molecule to the origin to improve numerical precision. // Translate the molecule to the origin to improve numerical precision.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment