Commit 3d1b2186 authored by peastman's avatar peastman
Browse files

Further optimizations to NonbondedForce

parent 86d09347
{
#if USE_EWALD
bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
unsigned int includeInteraction = ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection);
const real alphaR = EWALD_ALPHA*r;
const real expAlphaRSqr = EXP(-alphaR*alphaR);
const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
......@@ -44,16 +45,14 @@ if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
}
#endif
tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += ljEnergy + prefactor*erfcAlphaR;
tempEnergy += includeInteraction ? ljEnergy + prefactor*erfcAlphaR : 0;
#else
tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += prefactor*erfcAlphaR;
tempEnergy += includeInteraction ? prefactor*erfcAlphaR : 0;
#endif
}
dEdR += tempForce*invR*invR;
}
dEdR += includeInteraction ? tempForce*invR*invR : 0;
#else
{
#ifdef USE_CUTOFF
unsigned int includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
#else
......@@ -91,5 +90,5 @@ if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
#endif
#endif
dEdR += includeInteraction ? tempForce*invR*invR : 0;
#endif
}
#endif
\ No newline at end of file
......@@ -228,57 +228,51 @@ extern "C" __global__ void computeNonbonded(
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
#endif
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
#else
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#endif
#else // !USE_SYMMETRIC
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
......@@ -431,53 +425,51 @@ extern "C" __global__ void computeNonbonded(
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
#endif
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
#else
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#endif
#else // !USE_SYMMETRIC
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC
}
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
......@@ -503,57 +495,51 @@ extern "C" __global__ void computeNonbonded(
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
#endif
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
#else
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#endif
#else // !USE_SYMMETRIC
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC
#ifdef USE_CUTOFF
}
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
......
......@@ -633,7 +633,7 @@ private:
std::map<std::string, std::string> pmeDefines;
std::vector<std::pair<int, int> > exceptionAtoms;
double ewaldSelfEnergy, dispersionCoefficient, alpha;
bool hasCoulomb, hasLJ;
bool hasCoulomb, hasLJ, usePmeQueue;
static const int PmeOrder = 5;
};
......
......@@ -1609,12 +1609,16 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup();
cl.addPreComputation(new SyncQueuePreComputation(cl, pmeQueue, recipForceGroup));
cl.addPostComputation(new SyncQueuePostComputation(cl, pmeSyncEvent, recipForceGroup));
string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
usePmeQueue = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
if (usePmeQueue) {
pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup();
cl.addPreComputation(new SyncQueuePreComputation(cl, pmeQueue, recipForceGroup));
cl.addPostComputation(new SyncQueuePostComputation(cl, pmeSyncEvent, recipForceGroup));
}
// Initialize the b-spline moduli.
......@@ -1794,7 +1798,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
}
if (pmeGrid != NULL && includeReciprocal) {
cl.setQueue(pmeQueue);
if (usePmeQueue)
cl.setQueue(pmeQueue);
setPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 4);
setInvPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 5);
cl.executeKernel(pmeUpdateBsplinesKernel, cl.getNumAtoms());
......@@ -1837,8 +1842,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeInterpolateForceKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
else
cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms());
pmeQueue.enqueueMarker(&pmeSyncEvent);
cl.restoreDefaultQueue();
if (usePmeQueue) {
pmeQueue.enqueueMarker(&pmeSyncEvent);
cl.restoreDefaultQueue();
}
}
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if (dispersionCoefficient != 0.0 && includeDirect) {
......
......@@ -573,6 +573,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["USE_EXCLUSIONS"] = "1";
if (isSymmetric)
defines["USE_SYMMETRIC"] = "1";
if (useCutoff && context.getSIMDWidth() < 32)
defines["PRUNE_BY_CUTOFF"] = "1";
defines["FORCE_WORK_GROUP_SIZE"] = context.intToString(forceThreadBlockSize);
defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
defines["CUTOFF"] = context.doubleToString(cutoff);
......
{
#ifdef USE_DOUBLE_PRECISION
unsigned long includeInteraction;
#else
unsigned int includeInteraction;
#endif
#if USE_EWALD
bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
includeInteraction = ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection);
const real alphaR = EWALD_ALPHA*r;
const real expAlphaRSqr = EXP(-alphaR*alphaR);
const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
......@@ -44,21 +50,14 @@ if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
}
#endif
tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += ljEnergy + prefactor*erfcAlphaR;
tempEnergy += select((real) 0, ljEnergy + prefactor*erfcAlphaR, includeInteraction);
#else
tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += prefactor*erfcAlphaR;
tempEnergy += select((real) 0, prefactor*erfcAlphaR, includeInteraction);
#endif
}
dEdR += tempForce*invR*invR;
}
#else
{
#ifdef USE_DOUBLE_PRECISION
unsigned long includeInteraction;
dEdR += select((real) 0, tempForce*invR*invR, includeInteraction);
#else
unsigned int includeInteraction;
#endif
#ifdef USE_CUTOFF
includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
#else
......@@ -97,5 +96,5 @@ if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
#endif
#endif
dEdR += select((real) 0, tempForce*invR*invR, includeInteraction);
#endif
}
#endif
\ No newline at end of file
......@@ -124,7 +124,7 @@ __kernel void computeNonbonded(
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
#ifdef PRUNE_BY_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
......@@ -155,7 +155,7 @@ __kernel void computeNonbonded(
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#ifdef USE_CUTOFF
#ifdef PRUNE_BY_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
......@@ -295,7 +295,9 @@ __kernel void computeNonbonded(
real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef PRUNE_BY_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
......@@ -324,7 +326,9 @@ __kernel void computeNonbonded(
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#ifdef PRUNE_BY_CUTOFF
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
SYNC_WARPS;
}
......@@ -343,7 +347,7 @@ __kernel void computeNonbonded(
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
#ifdef PRUNE_BY_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
......@@ -374,7 +378,7 @@ __kernel void computeNonbonded(
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#ifdef USE_CUTOFF
#ifdef PRUNE_BY_CUTOFF
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment