Commit f11d445b authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to KirkwoodEDiff kernel

parent 21ab511a
...@@ -63,14 +63,6 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)( ...@@ -63,14 +63,6 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
float4 jCoord;
float jDipole[3];
float jQuadrupole[9];
float jInducedDipole[3];
float jInducedDipolePolar[3];
float jInducedDipoleS[3];
float jInducedDipolePolarS[3];
float totalEnergy = 0.0f; float totalEnergy = 0.0f;
float tinker_f = (cAmoebaSim.electric/cAmoebaSim.dielec); float tinker_f = (cAmoebaSim.electric/cAmoebaSim.dielec);
...@@ -81,9 +73,6 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)( ...@@ -81,9 +73,6 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
unsigned int y; unsigned int y;
bool bExclusionFlag; bool bExclusionFlag;
float forceSum[3];
float torqueSum[3];
float force[3]; float force[3];
float torqueI[3]; float torqueI[3];
float torqueJ[3]; float torqueJ[3];
...@@ -100,15 +89,20 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)( ...@@ -100,15 +89,20 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
KirkwoodEDiffParticle* psA = &sA[tbx]; KirkwoodEDiffParticle* psA = &sA[tbx];
unsigned int atomI = x + tgx; unsigned int atomI = x + tgx;
float4 iCoord = atomCoord[atomI]; KirkwoodEDiffParticle localParticle;
loadKirkwoodEDiffShared(&localParticle, atomI,
atomCoord,
labFrameDipole, labFrameQuadrupole,
inducedDipole, inducedDipolePolar,
inducedDipoleS, inducedDipolePolarS );
forceSum[0] = 0.0f; localParticle.force[0] = 0.0f;
forceSum[1] = 0.0f; localParticle.force[1] = 0.0f;
forceSum[2] = 0.0f; localParticle.force[2] = 0.0f;
torqueSum[0] = 0.0f; localParticle.torque[0] = 0.0f;
torqueSum[1] = 0.0f; localParticle.torque[1] = 0.0f;
torqueSum[2] = 0.0f; localParticle.torque[2] = 0.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency if (x == y) // Handle diagonals uniquely at 50% efficiency
{ {
...@@ -135,22 +129,8 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)( ...@@ -135,22 +129,8 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
unsigned int atomJ = (y + j); unsigned int atomJ = (y + j);
// load coords, charge, ...
calculateKirkwoodEDiffPairIxn_kernel( localParticle, psA[j],
loadKirkwoodEDiffData( &(psA[j]), &jCoord,
jDipole, jQuadrupole,
jInducedDipole, jInducedDipolePolar,
jInducedDipoleS, jInducedDipolePolarS );
calculateKirkwoodEDiffPairIxn_kernel( iCoord, jCoord,
cAmoebaSim.pDampingFactorAndThole[atomI].x, psA[j].damp,
cAmoebaSim.pDampingFactorAndThole[atomI].y, psA[j].thole,
&(labFrameDipole[3*atomI]), jDipole,
&(labFrameQuadrupole[9*atomI]), jQuadrupole,
&(inducedDipole[3*atomI]), jInducedDipole,
&(inducedDipolePolar[3*atomI]), jInducedDipolePolar,
&(inducedDipoleS[3*atomI]), jInducedDipoleS,
&(inducedDipolePolarS[3*atomI]), jInducedDipolePolarS,
pScale, dScale, pScale, dScale,
&energy, force, torqueI, torqueJ &energy, force, torqueI, torqueJ
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -162,9 +142,9 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)( ...@@ -162,9 +142,9 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
// torques include i == j contribution // torques include i == j contribution
torqueSum[0] += mask ? torqueI[0] : 0.0f; localParticle.torque[0] += mask ? torqueI[0] : 0.0f;
torqueSum[1] += mask ? torqueI[1] : 0.0f; localParticle.torque[1] += mask ? torqueI[1] : 0.0f;
torqueSum[2] += mask ? torqueI[2] : 0.0f; localParticle.torque[2] += mask ? torqueI[2] : 0.0f;
totalEnergy += mask ? 0.5f*energy : 0.0f; totalEnergy += mask ? 0.5f*energy : 0.0f;
...@@ -172,9 +152,9 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)( ...@@ -172,9 +152,9 @@ void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
mask = (atomI == atomJ) ? 0 : mask; mask = (atomI == atomJ) ? 0 : mask;
forceSum[0] += mask ? force[0] : 0.0f; localParticle.force[0] += mask ? force[0] : 0.0f;
forceSum[1] += mask ? force[1] : 0.0f; localParticle.force[1] += mask ? force[1] : 0.0f;
forceSum[2] += mask ? force[2] : 0.0f; localParticle.force[2] += mask ? force[2] : 0.0f;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -210,27 +190,12 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -210,27 +190,12 @@ if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int atomJ = (y + j); unsigned int atomJ = (y + j);
// load coords, charge, ...
loadKirkwoodEDiffData( &(psA[j]), &jCoord,
jDipole, jQuadrupole,
jInducedDipole, jInducedDipolePolar,
jInducedDipoleS, jInducedDipolePolarS );
float pScale; float pScale;
float dScale; float dScale;
getMaskedDScaleFactor( j, dScaleMask, &dScale ); getMaskedDScaleFactor( j, dScaleMask, &dScale );
getMaskedPScaleFactor( j, pScaleMask, &pScale ); getMaskedPScaleFactor( j, pScaleMask, &pScale );
calculateKirkwoodEDiffPairIxn_kernel( iCoord, jCoord, calculateKirkwoodEDiffPairIxn_kernel( localParticle, psA[j],
cAmoebaSim.pDampingFactorAndThole[atomI].x, psA[j].damp,
cAmoebaSim.pDampingFactorAndThole[atomI].y, psA[j].thole,
&(labFrameDipole[3*atomI]), jDipole,
&(labFrameQuadrupole[9*atomI]), jQuadrupole,
&(inducedDipole[3*atomI]), jInducedDipole,
&(inducedDipolePolar[3*atomI]), jInducedDipolePolar,
&(inducedDipoleS[3*atomI]), jInducedDipoleS,
&(inducedDipolePolarS[3*atomI]), jInducedDipolePolarS,
pScale, dScale, pScale, dScale,
&energy, force, torqueI, torqueJ &energy, force, torqueI, torqueJ
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -242,17 +207,17 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -242,17 +207,17 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// torques include i == j contribution // torques include i == j contribution
torqueSum[0] += mask ? torqueI[0] : 0.0f; localParticle.torque[0] += mask ? torqueI[0] : 0.0f;
torqueSum[1] += mask ? torqueI[1] : 0.0f; localParticle.torque[1] += mask ? torqueI[1] : 0.0f;
torqueSum[2] += mask ? torqueI[2] : 0.0f; localParticle.torque[2] += mask ? torqueI[2] : 0.0f;
totalEnergy += mask ? 0.5f*energy : 0.0f; totalEnergy += mask ? 0.5f*energy : 0.0f;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole // add to field at atomI the field due atomJ's charge/dipole/quadrupole
forceSum[0] += mask ? force[0] : 0.0f; localParticle.force[0] += mask ? force[0] : 0.0f;
forceSum[1] += mask ? force[1] : 0.0f; localParticle.force[1] += mask ? force[1] : 0.0f;
forceSum[2] += mask ? force[2] : 0.0f; localParticle.force[2] += mask ? force[2] : 0.0f;
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -280,19 +245,19 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -280,19 +245,19 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// scale and write results // scale and write results
scale3dArray( tinker_f, forceSum ); scale3dArray( tinker_f, localParticle.force );
scale3dArray( tinker_f, torqueSum ); scale3dArray( tinker_f, localParticle.torque );
#ifdef USE_OUTPUT_BUFFER_PER_WARP #ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, forceSum, outputForce ); load3dArrayBufferPerWarp( offset, localParticle.force, outputForce );
load3dArrayBufferPerWarp( offset, torqueSum, outputTorque ); load3dArrayBufferPerWarp( offset, localParticle.torque, outputTorque );
#else #else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
load3dArray( offset, forceSum, outputForce ); load3dArray( offset, localParticle.force, outputForce );
load3dArray( offset, torqueSum, outputTorque ); load3dArray( offset, localParticle.torque, outputTorque );
#endif #endif
...@@ -329,22 +294,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -329,22 +294,7 @@ if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int atomJ = y + tj; unsigned int atomJ = y + tj;
// load coords, charge, ... calculateKirkwoodEDiffPairIxn_kernel( localParticle, psA[tj],
loadKirkwoodEDiffData( &(psA[tj]), &jCoord,
jDipole, jQuadrupole,
jInducedDipole, jInducedDipolePolar,
jInducedDipoleS, jInducedDipolePolarS );
calculateKirkwoodEDiffPairIxn_kernel( iCoord, jCoord,
cAmoebaSim.pDampingFactorAndThole[atomI].x, psA[tj].damp,
cAmoebaSim.pDampingFactorAndThole[atomI].y, psA[tj].thole,
&(labFrameDipole[3*atomI]), jDipole,
&(labFrameQuadrupole[9*atomI]), jQuadrupole,
&(inducedDipole[3*atomI]), jInducedDipole,
&(inducedDipolePolar[3*atomI]), jInducedDipolePolar,
&(inducedDipoleS[3*atomI]), jInducedDipoleS,
&(inducedDipolePolarS[3*atomI]), jInducedDipolePolarS,
pScale, dScale, pScale, dScale,
&energy, force, &energy, force,
torqueI, torqueJ torqueI, torqueJ
...@@ -358,13 +308,13 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -358,13 +308,13 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// add force and torque to atom I due atom J // add force and torque to atom I due atom J
forceSum[0] += mask ? force[0] : 0.0f; localParticle.force[0] += mask ? force[0] : 0.0f;
forceSum[1] += mask ? force[1] : 0.0f; localParticle.force[1] += mask ? force[1] : 0.0f;
forceSum[2] += mask ? force[2] : 0.0f; localParticle.force[2] += mask ? force[2] : 0.0f;
torqueSum[0] += mask ? torqueI[0] : 0.0f; localParticle.torque[0] += mask ? torqueI[0] : 0.0f;
torqueSum[1] += mask ? torqueI[1] : 0.0f; localParticle.torque[1] += mask ? torqueI[1] : 0.0f;
torqueSum[2] += mask ? torqueI[2] : 0.0f; localParticle.torque[2] += mask ? torqueI[2] : 0.0f;
totalEnergy += mask ? energy : 0.0f; totalEnergy += mask ? energy : 0.0f;
...@@ -413,27 +363,12 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -413,27 +363,12 @@ if( atomI == targetAtom || atomJ == targetAtom ){
{ {
unsigned int atomJ = y + tj; unsigned int atomJ = y + tj;
// load coords, charge, ...
loadKirkwoodEDiffData( &(psA[tj]), &jCoord,
jDipole, jQuadrupole,
jInducedDipole, jInducedDipolePolar,
jInducedDipoleS, jInducedDipolePolarS );
float dScale; float dScale;
float pScale; float pScale;
getMaskedDScaleFactor( tj, dScaleMask, &dScale ); getMaskedDScaleFactor( tj, dScaleMask, &dScale );
getMaskedPScaleFactor( tj, pScaleMask, &pScale ); getMaskedPScaleFactor( tj, pScaleMask, &pScale );
calculateKirkwoodEDiffPairIxn_kernel( iCoord, jCoord, calculateKirkwoodEDiffPairIxn_kernel( localParticle, psA[tj],
cAmoebaSim.pDampingFactorAndThole[atomI].x, psA[tj].damp,
cAmoebaSim.pDampingFactorAndThole[atomI].y, psA[tj].thole,
&(labFrameDipole[3*atomI]), jDipole,
&(labFrameQuadrupole[9*atomI]), jQuadrupole,
&(inducedDipole[3*atomI]), jInducedDipole,
&(inducedDipolePolar[3*atomI]), jInducedDipolePolar,
&(inducedDipoleS[3*atomI]), jInducedDipoleS,
&(inducedDipolePolarS[3*atomI]), jInducedDipolePolarS,
pScale, dScale, pScale, dScale,
&energy, force, &energy, force,
torqueI, torqueJ torqueI, torqueJ
...@@ -447,13 +382,13 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -447,13 +382,13 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// add force and torque to atom I due atom J // add force and torque to atom I due atom J
forceSum[0] += mask ? force[0] : 0.0f; localParticle.force[0] += mask ? force[0] : 0.0f;
forceSum[1] += mask ? force[1] : 0.0f; localParticle.force[1] += mask ? force[1] : 0.0f;
forceSum[2] += mask ? force[2] : 0.0f; localParticle.force[2] += mask ? force[2] : 0.0f;
torqueSum[0] += mask ? torqueI[0] : 0.0f; localParticle.torque[0] += mask ? torqueI[0] : 0.0f;
torqueSum[1] += mask ? torqueI[1] : 0.0f; localParticle.torque[1] += mask ? torqueI[1] : 0.0f;
torqueSum[2] += mask ? torqueI[2] : 0.0f; localParticle.torque[2] += mask ? torqueI[2] : 0.0f;
totalEnergy += mask ? energy : 0.0f; totalEnergy += mask ? energy : 0.0f;
...@@ -492,8 +427,8 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -492,8 +427,8 @@ if( atomI == targetAtom || atomJ == targetAtom ){
// scale and write results // scale and write results
scale3dArray( tinker_f, forceSum ); scale3dArray( tinker_f, localParticle.force );
scale3dArray( tinker_f, torqueSum ); scale3dArray( tinker_f, localParticle.torque );
scale3dArray( tinker_f, sA[threadIdx.x].force ); scale3dArray( tinker_f, sA[threadIdx.x].force );
scale3dArray( tinker_f, sA[threadIdx.x].torque ); scale3dArray( tinker_f, sA[threadIdx.x].torque );
...@@ -502,8 +437,8 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -502,8 +437,8 @@ if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, forceSum, outputForce ); load3dArrayBufferPerWarp( offset, localParticle.force, outputForce );
load3dArrayBufferPerWarp( offset, torqueSum, outputTorque ); load3dArrayBufferPerWarp( offset, localParticle.torque, outputTorque );
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms); offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
...@@ -512,8 +447,8 @@ if( atomI == targetAtom || atomJ == targetAtom ){ ...@@ -512,8 +447,8 @@ if( atomI == targetAtom || atomJ == targetAtom ){
#else #else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
load3dArray( offset, forceSum, outputForce ); load3dArray( offset, localParticle.force, outputForce );
load3dArray( offset, torqueSum, outputTorque ); load3dArray( offset, localParticle.torque, outputTorque );
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms); offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment