Commit 3d1b2186 authored by peastman's avatar peastman
Browse files

Further optimizations to NonbondedForce

parent 86d09347
{
#if USE_EWALD
bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
unsigned int includeInteraction = ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection);
const real alphaR = EWALD_ALPHA*r;
const real expAlphaRSqr = EXP(-alphaR*alphaR);
const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
......@@ -44,16 +45,14 @@ if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
}
#endif
tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += ljEnergy + prefactor*erfcAlphaR;
tempEnergy += includeInteraction ? ljEnergy + prefactor*erfcAlphaR : 0;
#else
tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += prefactor*erfcAlphaR;
tempEnergy += includeInteraction ? prefactor*erfcAlphaR : 0;
#endif
}
dEdR += tempForce*invR*invR;
}
dEdR += includeInteraction ? tempForce*invR*invR : 0;
#else
{
#ifdef USE_CUTOFF
unsigned int includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
#else
......@@ -91,5 +90,5 @@ if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
#endif
#endif
dEdR += includeInteraction ? tempForce*invR*invR : 0;
}
#endif
}
......@@ -228,9 +228,6 @@ extern "C" __global__ void computeNonbonded(
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
......@@ -276,9 +273,6 @@ extern "C" __global__ void computeNonbonded(
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
......@@ -431,7 +425,6 @@ extern "C" __global__ void computeNonbonded(
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
......@@ -477,7 +470,6 @@ extern "C" __global__ void computeNonbonded(
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC
}
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
......@@ -503,9 +495,6 @@ extern "C" __global__ void computeNonbonded(
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
......@@ -551,9 +540,6 @@ extern "C" __global__ void computeNonbonded(
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC
#ifdef USE_CUTOFF
}
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
......
......@@ -633,7 +633,7 @@ private:
std::map<std::string, std::string> pmeDefines;
std::vector<std::pair<int, int> > exceptionAtoms;
double ewaldSelfEnergy, dispersionCoefficient, alpha;
bool hasCoulomb, hasLJ;
bool hasCoulomb, hasLJ, usePmeQueue;
static const int PmeOrder = 5;
};
......
......@@ -1609,12 +1609,16 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
usePmeQueue = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
if (usePmeQueue) {
pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup();
cl.addPreComputation(new SyncQueuePreComputation(cl, pmeQueue, recipForceGroup));
cl.addPostComputation(new SyncQueuePostComputation(cl, pmeSyncEvent, recipForceGroup));
}
// Initialize the b-spline moduli.
......@@ -1794,6 +1798,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
}
if (pmeGrid != NULL && includeReciprocal) {
if (usePmeQueue)
cl.setQueue(pmeQueue);
setPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 4);
setInvPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 5);
......@@ -1837,9 +1842,11 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeInterpolateForceKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
else
cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms());
if (usePmeQueue) {
pmeQueue.enqueueMarker(&pmeSyncEvent);
cl.restoreDefaultQueue();
}
}
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if (dispersionCoefficient != 0.0 && includeDirect) {
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
......
......@@ -573,6 +573,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["USE_EXCLUSIONS"] = "1";
if (isSymmetric)
defines["USE_SYMMETRIC"] = "1";
if (useCutoff && context.getSIMDWidth() < 32)
defines["PRUNE_BY_CUTOFF"] = "1";
defines["FORCE_WORK_GROUP_SIZE"] = context.intToString(forceThreadBlockSize);
defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
defines["CUTOFF"] = context.doubleToString(cutoff);
......
{
#ifdef USE_DOUBLE_PRECISION
unsigned long includeInteraction;
#else
unsigned int includeInteraction;
#endif
#if USE_EWALD
bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
includeInteraction = ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection);
const real alphaR = EWALD_ALPHA*r;
const real expAlphaRSqr = EXP(-alphaR*alphaR);
const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
......@@ -44,21 +50,14 @@ if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
}
#endif
tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += ljEnergy + prefactor*erfcAlphaR;
tempEnergy += select((real) 0, ljEnergy + prefactor*erfcAlphaR, includeInteraction);
#else
tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += prefactor*erfcAlphaR;
tempEnergy += select((real) 0, prefactor*erfcAlphaR, includeInteraction);
#endif
}
dEdR += tempForce*invR*invR;
}
#else
{
#ifdef USE_DOUBLE_PRECISION
unsigned long includeInteraction;
dEdR += select((real) 0, tempForce*invR*invR, includeInteraction);
#else
unsigned int includeInteraction;
#endif
#ifdef USE_CUTOFF
includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
#else
......@@ -97,5 +96,5 @@ if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
#endif
#endif
dEdR += select((real) 0, tempForce*invR*invR, includeInteraction);
}
#endif
}
......@@ -124,7 +124,7 @@ __kernel void computeNonbonded(
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
#ifdef PRUNE_BY_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
......@@ -155,7 +155,7 @@ __kernel void computeNonbonded(
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#ifdef USE_CUTOFF
#ifdef PRUNE_BY_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
......@@ -295,7 +295,9 @@ __kernel void computeNonbonded(
real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef PRUNE_BY_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
......@@ -324,7 +326,9 @@ __kernel void computeNonbonded(
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#ifdef PRUNE_BY_CUTOFF
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
SYNC_WARPS;
}
......@@ -343,7 +347,7 @@ __kernel void computeNonbonded(
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
#ifdef PRUNE_BY_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
......@@ -374,7 +378,7 @@ __kernel void computeNonbonded(
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#ifdef USE_CUTOFF
#ifdef PRUNE_BY_CUTOFF
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment