"platforms/reference/vscode:/vscode.git/clone" did not exist on "61d15a2a12346e5d41503124d6fe8b2f98a155f0"
Commit 2e451b9d authored by Peter Eastman's avatar Peter Eastman
Browse files

Deleted the old CUDA platform

parent 352e2fc7
// Device helper for a single atomI/atomJ pair. The final function name is produced by the
// SUB_METHOD_NAME macro, which is defined outside this chunk.
//
// Builds a 3-vector (ftm21, ftm22, ftm23) and a scalar energy term from the charge (q),
// lab-frame dipole and lab-frame quadrupole of both atoms. Induced dipoles are not read here.
//
// Parameters:
//   atomI, atomJ   - particle records; only q, labFrameDipole and labFrameQuadrupole are read.
//                    Neither record is written (the force accumulation at the end is commented out).
//   delta          - x/y/z are used as the separation components xr/yr/zr; w is read as rr1 only
//                    when APPLY_SCALE is defined (presumably 1/r -- TODO confirm against the caller).
//   bn             - x/y/z/w are used as the coefficients bn1..bn4.
//   bn5            - fifth coefficient; used only in gf1.
//   forceFactor    - multiplies the energy contribution only; it is not applied to force[].
//   scalingFactors - (APPLY_SCALE builds only) read at index MScaleIndex.
//   force          - output; elements 0..2 are OVERWRITTEN (assigned, not accumulated).
//   energy         - in/out; the pair term is added to *energy.
static __device__ void SUB_METHOD_NAME( calculatePmeDirectElectrostaticPairIxnF1, _kernel)( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
float4 delta, float4 bn, float bn5, float forceFactor,
#ifdef APPLY_SCALE
const float* scalingFactors,
#endif
float force[3], float* energy ){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
#ifdef APPLY_SCALE
float rr1 = delta.w;
#endif
// set the permanent multipole values (charge, dipole, quadrupole) for atom I
float ci = atomI.q;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
// The last diagonal element is derived from the other two diagonal elements instead of being
// read from index 5 (presumably relies on the quadrupole being traceless -- TODO confirm).
//float qi9 = atomI.labFrameQuadrupole[5];
float qi9 = -(atomI.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[3]);
// same quantities for atom J
float ck = atomJ.q;
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2];
float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2];
float qk5 = atomJ.labFrameQuadrupole[3];
float qk6 = atomJ.labFrameQuadrupole[4];
// float qk9 = atomJ.labFrameQuadrupole[5];
float qk9 = -(atomJ.labFrameQuadrupole[0] + atomJ.labFrameQuadrupole[3]);
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
float bn4 = bn.w;
// In APPLY_SCALE builds every bn coefficient is reduced by offset times a power of rr1,
// where offset = 1 - scalingFactors[MScaleIndex].
#ifdef APPLY_SCALE
float offset = 1.0f-scalingFactors[MScaleIndex];
float rr3 = rr1*rr1*rr1;
float gf4 = 2.0f*(bn2 - 3.0f*offset*rr3*rr1*rr1);
#else
float gf4 = 2.0f*bn2;
#endif
// quadrupole(I) x dipole(J) and quadrupole(J) x dipole(I) products; their difference seeds ftm2*
float qidk1 = qi1*dk1 + qi2*dk2 + qi3*dk3;
float qkdi1 = qk1*di1 + qk2*di2 + qk3*di3;
float ftm21 = gf4*(qkdi1-qidk1);
float qidk2 = qi2*dk1 + qi5*dk2 + qi6*dk3;
float qkdi2 = qk2*di1 + qk5*di2 + qk6*di3;
float ftm22 = gf4*(qkdi2-qidk2);
float qidk3 = qi3*dk1 + qi6*dk2 + qi9*dk3;
float qkdi3 = qk3*di1 + qk6*di2 + qk9*di3;
float ftm23 = gf4*(qkdi3-qidk3);
// quadrupole tensors applied to the separation vector
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
float qkr1 = qk1*xr + qk2*yr + qk3*zr;
float qkr2 = qk2*xr + qk5*yr + qk6*zr;
float qkr3 = qk3*xr + qk6*yr + qk9*zr;
#ifdef APPLY_SCALE
float gf7 = 4.0f*(bn3 - 15.0f*offset*rr3*rr3*rr1);
#else
float gf7 = 4.0f*bn3;
#endif
float qiqkr1 = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
float qkqir1 = qk1*qir1 + qk2*qir2 + qk3*qir3;
ftm21 += gf7*(qiqkr1+qkqir1);
float qiqkr2 = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
float qkqir2 = qk2*qir1 + qk5*qir2 + qk6*qir3;
ftm22 += gf7*(qiqkr2+qkqir2);
float qiqkr3 = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;
float qkqir3 = qk3*qir1 + qk6*qir2 + qk9*qir3;
ftm23 += gf7*(qiqkr3+qkqir3);
// calculate the scalar products for permanent components
float gl6 = di1*dk1 + di2*dk2 + di3*dk3;
float gl7 = 2.0f*( qir1*dk1 + qir2*dk2 + qir3*dk3 - ( qkr1*di1 + qkr2*di2 + qkr3*di3 ) );
float gl5 = -4.0f*(qir1*qkr1 + qir2*qkr2 + qir3*qkr3);
// full 3x3 contraction of the two symmetric quadrupoles, written out term by term
float gl8 = 2.0f*(qi1*qk1 + qi2*qk2 + qi3*qk3 + qi2*qk2 + qi5*qk5 + qi6*qk6 + qi3*qk3 + qi6*qk6 + qi9*qk9 );
float sc3 = di1*xr + di2*yr + di3*zr;
float sc5 = qir1*xr + qir2*yr + qir3*zr;
float sc4 = dk1*xr + dk2*yr + dk3*zr;
float sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
float gl0 = ci*ck;
float gl1 = ck*sc3 - ci*sc4;
float gl2 = ci*sc6 + ck*sc5 - sc3*sc4;
float gl3 = sc3*sc6 - sc4*sc5;
float gl4 = sc5*sc6;
// Energy contribution; forceFactor multiplies the whole expression in both build variants.
#ifdef APPLY_SCALE
//forceTorqueEnergy->w += forceFactor*(-offset*rr1*gl0 + (bn1-offset*rr3)*(gl1+gl6) + (bn2-offset*(3.0f*rr3*rr1*rr1))*(gl2+gl7+gl8) + (bn3-offset*(15.0f*rr3*rr3*rr1))*(gl3+gl5) + (bn4-offset*(105.0f*rr3*rr3*rr3))*gl4);
*energy += forceFactor*(-offset*rr1*gl0 + (bn1-offset*rr3)*(gl1+gl6) + (bn2-offset*(3.0f*rr3*rr1*rr1))*(gl2+gl7+gl8) + (bn3-offset*(15.0f*rr3*rr3*rr1))*(gl3+gl5) + (bn4-offset*(105.0f*rr3*rr3*rr3))*gl4);
#else
//forceTorqueEnergy->w += bn1*(gl1+gl6) + bn2*(gl2+gl7+gl8) + bn3*(gl3+gl5) + bn4*gl4;
*energy += forceFactor*(bn1*(gl1+gl6) + bn2*(gl2+gl7+gl8) + bn3*(gl3+gl5) + bn4*gl4);
#endif
// gf1 multiplies the separation vector; this is the only place bn5 is used
float gf1 = bn1*gl0 + bn2*(gl1+gl6) + bn3*(gl2+gl7+gl8) + bn4*(gl3+gl5) + bn5*gl4;
#ifdef APPLY_SCALE
gf1 -= offset*(rr3*gl0 + (3.0f*rr3*rr1*rr1)*(gl1+gl6) + (15.0f*rr3*rr3*rr1)*(gl2+gl7+gl8) + (105.0f*rr3*rr3*rr3)*(gl3+gl5) + (945.0f*rr3*rr3*rr3*rr1*rr1)*gl4);
#endif
ftm21 += gf1*xr;
ftm22 += gf1*yr;
ftm23 += gf1*zr;
// gf2 multiplies atom I's dipole
#ifdef APPLY_SCALE
float gf2 = -ck*bn1 + sc4*bn2 - sc6*bn3 - offset*(-ck*rr3 + sc4*(3.0f*rr3*rr1*rr1) - sc6*(15.0f*rr3*rr3*rr1));
#else
float gf2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
#endif
ftm21 += gf2*di1;
ftm22 += gf2*di2;
ftm23 += gf2*di3;
// gf5 multiplies qir (atom I's quadrupole applied to the separation vector)
#ifdef APPLY_SCALE
float gf5 = 2.0f*(-ck*bn2+sc4*bn3-sc6*bn4 - offset*(-ck*(3.0f*rr3*rr1*rr1)+sc4*(15.0f*rr3*rr3*rr1)-sc6*(105.0f*rr3*rr3*rr3)));
#else
float gf5 = 2.0f*(-ck*bn2+sc4*bn3-sc6*bn4);
#endif
ftm21 += gf5*qir1;
ftm22 += gf5*qir2;
ftm23 += gf5*qir3;
// gf3 multiplies atom J's dipole
#ifdef APPLY_SCALE
float gf3 = ci*bn1 + sc3*bn2 + sc5*bn3 - offset*(ci*rr3 + sc3*(3.0f*rr3*rr1*rr1) + sc5*(15.0f*rr3*rr3*rr1));
#else
float gf3 = ci*bn1 + sc3*bn2 + sc5*bn3;
#endif
ftm21 += gf3*dk1;
ftm22 += gf3*dk2;
ftm23 += gf3*dk3;
// gf6 multiplies qkr (atom J's quadrupole applied to the separation vector)
#ifdef APPLY_SCALE
float gf6 = 2.0f*(-ci*bn2-sc3*bn3-sc5*bn4 - offset*(-ci*(3.0f*rr3*rr1*rr1)-sc3*(15.0f*rr3*rr3*rr1)-sc5*(105.0f*rr3*rr3*rr3)));
#else
float gf6 = 2.0f*(-ci*bn2-sc3*bn3-sc5*bn4);
#endif
ftm21 += gf6*qkr1;
ftm22 += gf6*qkr2;
ftm23 += gf6*qkr3;
// Result is assigned (not added) to the caller-supplied array.
force[0] = ftm21;
force[1] = ftm22;
force[2] = ftm23;
// Disabled: earlier version accumulated directly into atomI.force / atomJ.force.
/*
if( forceFactor == 1.0f ){
atomJ.force[0] -= ftm21;
atomJ.force[1] -= ftm22;
atomJ.force[2] -= ftm23;
}
atomI.force[0] += ftm21;
atomI.force[1] += ftm22;
atomI.force[2] += ftm23;
*/
return;
}
// Device helper for a single atomI/atomJ pair. The final function name is produced by the
// SUB_METHOD_NAME macro, which is defined outside this chunk.
//
// Builds a 3-vector (ftm21, ftm22, ftm23) and several energy terms from the permanent
// multipoles (q, labFrameDipole, labFrameQuadrupole) and the two induced-dipole arrays
// (inducedDipole, inducedDipoleP) of both atoms. The first half handles atom I's permanent
// multipoles against atom J's induced dipoles; the half after the "//K" marker handles
// atom J's permanent multipoles against atom I's induced dipoles.
//
// Parameters:
//   atomI, atomJ   - particle records; q, labFrameDipole, labFrameQuadrupole, inducedDipole,
//                    inducedDipoleP, damp and thole are read. Neither record is written.
//   delta          - x/y/z are used as xr/yr/zr; w is used as rr1 (presumably 1/r -- TODO confirm).
//   bn             - x/y/z/w are used as the coefficients bn1..bn4.
//   forceFactor    - multiplies every energy contribution; it is not applied to force[].
//   scalingFactors - (APPLY_SCALE builds only) read at PScaleIndex, DScaleIndex and UScaleIndex.
//   force          - in/out; the result is ADDED to elements 0..2.
//   energy         - in/out; energy terms are added to / subtracted from *energy.
//
// Also reads the file-scope cAmoebaSim.polarizationType: extra terms that couple the induced
// dipoles of both atoms are included only when it equals 0.
//
// NOTE(review): several locals (sci4, sci3, sci1, sci8) are modified in place between uses in
// the non-APPLY_SCALE build, so the statement order in this function must not be changed.
static __device__ void SUB_METHOD_NAME( calculatePmeDirectElectrostaticPairIxnF2, _kernel )(
PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
float4 delta, float4 bn, float forceFactor,
#ifdef APPLY_SCALE
const float* scalingFactors,
#endif
float force[3], float* energy ){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
float rr1 = delta.w;
// set the permanent multipole and induced dipole values;
float ci = atomI.q;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
// Last diagonal element is derived from the other two instead of being read from index 5.
// float qi9 = atomI.labFrameQuadrupole[5];
float qi9 = -(atomI.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[3]);
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
float bn4 = bn.w;
// Damping exponent: starts as the product of the two per-atom damp values. If that product is
// non-zero it is replaced by -min(thole_I, thole_J) / (rr1*damp)^3; values below -50 are reset
// to 0, which makes all scale3/5/7 factors below equal 1 and skips the derivative blocks.
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
float ratio = 1.0f/(rr1*damp);
damp = -pgamma*ratio*ratio*ratio;
damp = damp < -50.0f ? 0.0f : damp;
}
float scale5 = (damp == 0.0f) ? 1.0f : (1.0f - (1.0f-damp)*expf(damp));
// rr5 = 3 * rr1^5
float rr5 = rr1*rr1;
rr5 = 3.0f*rr1*rr5*rr5;
// psc*/dsc*/usc* = rr-power * (1 - scale * scalingFactor); the d and u variants exist only in
// APPLY_SCALE builds, otherwise the single psc* value is used everywhere.
#ifdef APPLY_SCALE
float psc5 = rr5*(1.0f - scale5*scalingFactors[PScaleIndex]);
float dsc5 = rr5*(1.0f - scale5*scalingFactors[DScaleIndex]);
float usc5 = rr5*(1.0f - scale5*scalingFactors[UScaleIndex]);
#else
float psc5 = rr5*(1.0f - scale5);
#endif
// quadrupole(I) applied to atom J's induced dipoles (inducedDipole and inducedDipoleP)
float qiuk1 = qi1*atomJ.inducedDipole[0] + qi2*atomJ.inducedDipole[1] + qi3*atomJ.inducedDipole[2];
float qiukp1 = qi1*atomJ.inducedDipoleP[0] + qi2*atomJ.inducedDipoleP[1] + qi3*atomJ.inducedDipoleP[2];
float ftm21 = -bn2*(qiuk1+qiukp1);
#ifdef APPLY_SCALE
ftm21 += qiuk1*psc5 + qiukp1*dsc5;
#else
ftm21 += (qiuk1 + qiukp1)*psc5;
#endif
float qiuk2 = qi2*atomJ.inducedDipole[0] + qi5*atomJ.inducedDipole[1] + qi6*atomJ.inducedDipole[2];
float qiukp2 = qi2*atomJ.inducedDipoleP[0] + qi5*atomJ.inducedDipoleP[1] + qi6*atomJ.inducedDipoleP[2];
float ftm22 = -bn2*(qiuk2+qiukp2);
#ifdef APPLY_SCALE
ftm22 += ((qiuk2)*psc5 + (qiukp2)*dsc5);
#else
ftm22 += (qiuk2 + qiukp2)*psc5;
#endif
float qiuk3 = qi3*atomJ.inducedDipole[0] + qi6*atomJ.inducedDipole[1] + qi9*atomJ.inducedDipole[2];
float qiukp3 = qi3*atomJ.inducedDipoleP[0] + qi6*atomJ.inducedDipoleP[1] + qi9*atomJ.inducedDipoleP[2];
float ftm23 = -bn2*(qiuk3+qiukp3);
#ifdef APPLY_SCALE
ftm23 += ((qiuk3)*psc5 + (qiukp3)*dsc5);
#else
ftm23 += (qiuk3 + qiukp3)*psc5;
#endif
// NOTE(review): expf(damp) was already evaluated once inside the scale5 expression above.
float expdamp = expf(damp);
float scale3 = (damp == 0.0f) ? 1.0f : (1.0f - expdamp);
float rr3 = rr1*rr1*rr1;
#ifdef APPLY_SCALE
float psc3 = rr3*(1.0f - scale3*scalingFactors[PScaleIndex]);
float dsc3 = rr3*(1.0f - scale3*scalingFactors[DScaleIndex]);
float usc3 = rr3*(1.0f - scale3*scalingFactors[UScaleIndex]);
#else
float psc3 = rr3*(1.0f - scale3);
#endif
float scale7 = (damp == 0.0f) ? 1.0f : (1.0f - (1.0f-damp+0.6f*damp*damp)*expdamp);
#ifdef APPLY_SCALE
float psc7 = (15.0f*rr3*rr3*rr1)*(1.0f - scale7*scalingFactors[PScaleIndex]);
float dsc7 = (15.0f*rr3*rr3*rr1)*(1.0f - scale7*scalingFactors[DScaleIndex]);
#else
float psc7 = (15.0f*rr3*rr3*rr1)*(1.0f - scale7);
#endif
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
float sc3 = di1*xr + di2*yr + di3*zr;
float sc5 = qir1*xr + qir2*yr + qir3*zr;
float gfi3 = ci*bn1 + sc3*bn2 + sc5*bn3;
// prefactor1 is a scratch variable that is reassigned many times below.
float prefactor1;
prefactor1 = 0.5f*(ci*psc3 + sc3*psc5 + sc5*psc7 - gfi3);
ftm21 -= prefactor1*atomJ.inducedDipole[0];
ftm22 -= prefactor1*atomJ.inducedDipole[1];
ftm23 -= prefactor1*atomJ.inducedDipole[2];
// In non-APPLY_SCALE builds the same prefactor1 is reused for the inducedDipoleP terms.
#ifdef APPLY_SCALE
prefactor1 = 0.5f*(ci*dsc3 + sc3*dsc5 + sc5*dsc7 - gfi3);
#endif
ftm21 -= prefactor1*atomJ.inducedDipoleP[0];
ftm22 -= prefactor1*atomJ.inducedDipoleP[1];
ftm23 -= prefactor1*atomJ.inducedDipoleP[2];
float sci4 = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
// Energy uses sci4 here, before it is merged with scip4 in the non-APPLY_SCALE build below.
//forceTorqueEnergy->w += 0.5f*((psc3-bn1)*(ci*sci4) + (psc5-bn2)*(sc3*sci4) + (psc7-bn3)*(sci4*sc5));
*energy += forceFactor*0.5f*sci4*((psc3-bn1)*ci + (psc5-bn2)*sc3 + (psc7-bn3)*sc5);
float scip4 = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
// Extra induced-dipole coupling term, included only for polarizationType == 0
// (presumably the mutual-polarization setting -- TODO confirm the enum meaning).
if( cAmoebaSim.polarizationType == 0 ){
#ifdef APPLY_SCALE
prefactor1 = 0.5f*( bn2 - usc5 );
#else
prefactor1 = 0.5f*( bn2 - psc5 );
#endif
ftm21 += prefactor1*( (sci4*atomI.inducedDipoleP[0] + scip4*atomI.inducedDipole[0]) );
ftm22 += prefactor1*( (sci4*atomI.inducedDipoleP[1] + scip4*atomI.inducedDipole[1]) );
ftm23 += prefactor1*( (sci4*atomI.inducedDipoleP[2] + scip4*atomI.inducedDipole[2]) );
}
#ifdef APPLY_SCALE
prefactor1 = 0.5f*( bn2*(sci4+scip4) - (sci4*psc5+scip4*dsc5) );
#else
// From here on (non-APPLY_SCALE build) sci4 holds sci4 + scip4.
sci4 += scip4;
prefactor1 = 0.5f*sci4*( bn2 - psc5 );
#endif
ftm21 += prefactor1*di1;
ftm22 += prefactor1*di2;
ftm23 += prefactor1*di3;
#ifdef APPLY_SCALE
float gfi5 = bn3*(sci4+scip4) - (sci4*psc7+scip4*dsc7);
#else
float gfi5 = sci4*(bn3 - psc7);
#endif
ftm21 += gfi5*qir1;
ftm22 += gfi5*qir2;
ftm23 += gfi5*qir3;
float sci7 = qir1*atomJ.inducedDipole[0] + qir2*atomJ.inducedDipole[1] + qir3*atomJ.inducedDipole[2];
//forceTorqueEnergy->w += (bn2-psc5)*sci7;
*energy += forceFactor*(bn2-psc5)*sci7;
float scip7 = qir1*atomJ.inducedDipoleP[0] + qir2*atomJ.inducedDipoleP[1] + qir3*atomJ.inducedDipoleP[2];
// gli*/glip* are declared here and reassigned (not redeclared) in the second half below.
#ifdef APPLY_SCALE
float gli1 = -ci*sci4;
float gli2 = -sc3*sci4 + 2.0f*sci7;
float gli3 = -sci4*sc5;
float glip1 = -ci*scip4;
float glip2 = -sc3*scip4 + 2.0f*scip7;
float glip3 = -scip4*sc5;
#else
float gli1 = -ci*sci4;
float gli2 = -sc3*sci4 + 2.0f*(sci7 + scip7);
float gli3 = -sci4*sc5;
#endif
#ifdef APPLY_SCALE
float gfi1 = (bn2*(gli1+glip1) + bn3*(gli2+glip2) + bn4*(gli3+glip3));
gfi1 -= (rr1*rr1)*( 3.0f*(gli1*psc3 + glip1*dsc3) + 5.0f*(gli2*psc5 + glip2*dsc5 ) + 7.0f*(gli3*psc7+glip3*dsc7) );
#else
float gfi1 = bn2*gli1 + bn3*gli2 + bn4*gli3;
gfi1 -= (rr1*rr1)*( 3.0f*gli1*psc3 + 5.0f*gli2*psc5 + 7.0f*gli3*psc7);
#endif
gfi1 *= 0.5f;
ftm21 += gfi1*xr;
ftm22 += gfi1*yr;
ftm23 += gfi1*zr;
// Terms that depend on the damping exponent; skipped when damp is 0 (undamped or below cutoff).
if( damp != 0.0f ){
// NOTE(review): expdamp and rr3 shadow the identically computed outer locals.
float expdamp = expf(damp);
float temp3 = -1.5f*damp*expdamp*rr1*rr1;
float temp5 = -damp;
float temp7 = -0.2f - 0.6f*damp;
float ddsc31 = temp3*xr;
float ddsc32 = temp3*yr;
float ddsc33 = temp3*zr;
float ddsc51 = temp5*ddsc31;
float ddsc52 = temp5*ddsc32;
float ddsc53 = temp5*ddsc33;
float ddsc71 = temp7*ddsc51;
float ddsc72 = temp7*ddsc52;
float ddsc73 = temp7*ddsc53;
float rr3 = rr1*rr1*rr1;
// temp3/temp5/temp7 are reused from here on with a different meaning (weights for ddsc3/5/7).
#ifdef APPLY_SCALE
temp3 = (gli1*scalingFactors[PScaleIndex] + glip1*scalingFactors[DScaleIndex]);
temp5 = (3.0f*rr1*rr1)*(gli2*scalingFactors[PScaleIndex] + glip2*scalingFactors[DScaleIndex]);
temp7 = (15.0f*rr3*rr1)*(gli3*scalingFactors[PScaleIndex] + glip3*scalingFactors[DScaleIndex]);
#else
temp3 = gli1;
temp5 = (3.0f*rr1*rr1)*gli2;
temp7 = (15.0f*rr3*rr1)*gli3;
#endif
ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51 + temp7*ddsc71);
ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52 + temp7*ddsc72);
ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53 + temp7*ddsc73);
}
// Second half: atom J's permanent multipoles with atom I's induced dipoles.
//K
float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2];
float qk5 = atomJ.labFrameQuadrupole[3];
float qk6 = atomJ.labFrameQuadrupole[4];
//float qk9 = atomJ.labFrameQuadrupole[5];
float qk9 = -(qk1 + qk5);
float qkui1 = qk1*atomI.inducedDipole[0] + qk2*atomI.inducedDipole[1] + qk3*atomI.inducedDipole[2];
float qkuip1 = qk1*atomI.inducedDipoleP[0] + qk2*atomI.inducedDipoleP[1] + qk3*atomI.inducedDipoleP[2];
ftm21 += bn2*(qkui1+qkuip1);
#ifdef APPLY_SCALE
ftm21 -= (qkui1*psc5 + qkuip1*dsc5);
#else
ftm21 -= (qkui1 + qkuip1)*psc5;
#endif
float qkui2 = qk2*atomI.inducedDipole[0] + qk5*atomI.inducedDipole[1] + qk6*atomI.inducedDipole[2];
float qkuip2 = qk2*atomI.inducedDipoleP[0] + qk5*atomI.inducedDipoleP[1] + qk6*atomI.inducedDipoleP[2];
ftm22 += bn2*(qkui2+qkuip2);
#ifdef APPLY_SCALE
ftm22 -= ((qkui2)*psc5 + (qkuip2)*dsc5);
#else
ftm22 -= (qkui2 + qkuip2)*psc5;
#endif
float qkui3 = qk3*atomI.inducedDipole[0] + qk6*atomI.inducedDipole[1] + qk9*atomI.inducedDipole[2];
float qkuip3 = qk3*atomI.inducedDipoleP[0] + qk6*atomI.inducedDipoleP[1] + qk9*atomI.inducedDipoleP[2];
ftm23 += bn2*(qkui3+qkuip3);
#ifdef APPLY_SCALE
ftm23 -= ((qkui3)*psc5 + (qkuip3)*dsc5);
#else
ftm23 -= (qkui3 + qkuip3)*psc5;
#endif
float qkr1 = qk1*xr + qk2*yr + qk3*zr;
float qkr2 = qk2*xr + qk5*yr + qk6*zr;
float qkr3 = qk3*xr + qk6*yr + qk9*zr;
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2];
float sc4 = dk1*xr + dk2*yr + dk3*zr;
float sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
float ck = atomJ.q;
float gfi2 = (-ck*bn1 + sc4*bn2 - sc6*bn3);
prefactor1 = 0.5f*(ck*psc3 - sc4*psc5 + sc6*psc7 + gfi2);
ftm21 += prefactor1*atomI.inducedDipole[0];
ftm22 += prefactor1*atomI.inducedDipole[1];
ftm23 += prefactor1*atomI.inducedDipole[2];
// In non-APPLY_SCALE builds the same prefactor1 is reused for the inducedDipoleP terms.
#ifdef APPLY_SCALE
prefactor1 = 0.5f*(ck*dsc3 - sc4*dsc5 + sc6*dsc7 + gfi2);
#endif
ftm21 += prefactor1*atomI.inducedDipoleP[0];
ftm22 += prefactor1*atomI.inducedDipoleP[1];
ftm23 += prefactor1*atomI.inducedDipoleP[2];
float sci3 = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
// Energy uses sci3 here, before it is merged with scip3 in the non-APPLY_SCALE build below.
*energy += forceFactor*0.5f*sci3*( ck*(bn1-psc3) - sc4*(bn2-psc5) + sc6*(bn3-psc7) );
float scip3 = atomI.inducedDipoleP[0]*xr + atomI.inducedDipoleP[1]*yr + atomI.inducedDipoleP[2]*zr;
if( cAmoebaSim.polarizationType == 0 ){
#ifdef APPLY_SCALE
prefactor1 = 0.5f*( bn2 - usc5 );
#else
prefactor1 = 0.5f*( bn2 - psc5 );
#endif
ftm21 += prefactor1*( sci3*atomJ.inducedDipoleP[0] + scip3*atomJ.inducedDipole[0] );
ftm22 += prefactor1*( sci3*atomJ.inducedDipoleP[1] + scip3*atomJ.inducedDipole[1] );
ftm23 += prefactor1*( sci3*atomJ.inducedDipoleP[2] + scip3*atomJ.inducedDipole[2] );
}
// sci34 is assigned only when polarizationType == 0 and is read later only under that same condition.
float sci34;
if( cAmoebaSim.polarizationType == 0 ){
// sci4/scip4 are recomputed in this scope because the outer sci4 may already include scip4
// (non-APPLY_SCALE build); these declarations shadow the outer ones.
float sci4 = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float scip4 = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
sci34 = (sci3*scip4+scip3*sci4);
#ifdef APPLY_SCALE
gfi1 = sci34*(usc5*(5.0f*rr1*rr1) -bn3 );
#else
gfi1 = sci34*(psc5*(5.0f*rr1*rr1) -bn3 );
#endif
} else {
gfi1 = 0.0f;
}
#ifdef APPLY_SCALE
prefactor1 = 0.5f*( bn2*(sci3+scip3) - (sci3*psc5+scip3*dsc5) );
#else
// From here on (non-APPLY_SCALE build) sci3 holds sci3 + scip3.
sci3 += scip3;
prefactor1 = 0.5f*sci3*( bn2 - psc5 );
#endif
ftm21 += prefactor1*dk1;
ftm22 += prefactor1*dk2;
ftm23 += prefactor1*dk3;
#ifdef APPLY_SCALE
float gfi6 = -bn3*(sci3+scip3) + (sci3*psc7+scip3*dsc7);
#else
float gfi6 = sci3*( psc7 - bn3);
#endif
ftm21 += gfi6*qkr1;
ftm22 += gfi6*qkr2;
ftm23 += gfi6*qkr3;
float sci1 = atomI.inducedDipole[0]*dk1 + atomI.inducedDipole[1]*dk2 + atomI.inducedDipole[2]*dk3 + di1*atomJ.inducedDipole[0] + di2*atomJ.inducedDipole[1] + di3*atomJ.inducedDipole[2];
//forceTorqueEnergy->w += 0.5f*( sci1*(bn1-psc3) );
*energy += forceFactor*0.5f*( sci1*(bn1-psc3) );
float sci8 = qkr1*atomI.inducedDipole[0] + qkr2*atomI.inducedDipole[1] + qkr3*atomI.inducedDipole[2];
// Note the sign: this term is subtracted from *energy.
//forceTorqueEnergy->w += sci8*(bn2-psc5);
*energy -= forceFactor*sci8*(bn2-psc5);
float scip1 = atomI.inducedDipoleP[0]*dk1 + atomI.inducedDipoleP[1]*dk2 + atomI.inducedDipoleP[2]*dk3 + di1*atomJ.inducedDipoleP[0] + di2*atomJ.inducedDipoleP[1] + di3*atomJ.inducedDipoleP[2];
// Non-APPLY_SCALE build: fold the P term into sci1 only after the energy above has been accumulated.
#ifndef APPLY_SCALE
sci1 += scip1;
#endif
float scip2 = atomI.inducedDipole[0]*atomJ.inducedDipoleP[0] +
atomI.inducedDipole[1]*atomJ.inducedDipoleP[1] +
atomI.inducedDipole[2]*atomJ.inducedDipoleP[2] +
atomJ.inducedDipole[0]*atomI.inducedDipoleP[0] +
atomJ.inducedDipole[1]*atomI.inducedDipoleP[1] +
atomJ.inducedDipole[2]*atomI.inducedDipoleP[2];
float scip8 = qkr1*atomI.inducedDipoleP[0] + qkr2*atomI.inducedDipoleP[1] + qkr3*atomI.inducedDipoleP[2];
#ifndef APPLY_SCALE
sci8 += scip8;
#endif
// Reassign the gli*/glip* scratch values for the second half.
gli1 = ck*sci3 + sci1;
gli2 = -(sci3*sc4 + 2.0f*sci8);
gli3 = sci3*sc6;
#ifdef APPLY_SCALE
glip1 = ck*scip3 + scip1;
glip2 = -(scip3*sc4 + 2.0f*scip8);
glip3 = scip3*sc6;
#endif
// gfi1 already holds the sci34 term (or 0) assigned above, so these are accumulations.
#ifdef APPLY_SCALE
gfi1 += (bn2*(gli1+glip1) + bn3*(gli2+glip2) + bn4*(gli3+glip3));
gfi1 -= (rr1*rr1)*( 3.0f*(gli1*psc3 + glip1*dsc3) + 5.0f*(gli2*psc5 + glip2*dsc5 ) + 7.0f*(gli3*psc7+glip3*dsc7) );
#else
gfi1 += (bn2*gli1 + bn3*gli2 + bn4*gli3);
gfi1 -= (rr1*rr1)*( 3.0f*gli1*psc3 + 5.0f*gli2*psc5 + 7.0f*gli3*psc7 );
#endif
if( cAmoebaSim.polarizationType == 0 ){
#ifdef APPLY_SCALE
gfi1 += scip2*(bn2 - (3.0f*rr1*rr1)*usc3);
#else
gfi1 += scip2*(bn2 - (3.0f*rr1*rr1)*psc3);
#endif
}
gfi1 *= 0.5f;
ftm21 += gfi1*xr;
ftm22 += gfi1*yr;
ftm23 += gfi1*zr;
// Damping-dependent terms for the second half; same structure as the block before "//K".
if( damp != 0.0f ){
// NOTE(review): expdamp and rr3 shadow the identically computed outer locals.
float expdamp = expf(damp);
float temp3 = -1.5f*damp*expdamp*rr1*rr1;
float temp5 = -damp;
float temp7 = -0.2f - 0.6f*damp;
float ddsc31 = temp3*xr;
float ddsc32 = temp3*yr;
float ddsc33 = temp3*zr;
float ddsc51 = temp5*ddsc31;
float ddsc52 = temp5*ddsc32;
float ddsc53 = temp5*ddsc33;
float ddsc71 = temp7*ddsc51;
float ddsc72 = temp7*ddsc52;
float ddsc73 = temp7*ddsc53;
float rr3 = rr1*rr1*rr1;
#ifdef APPLY_SCALE
temp3 = gli1*scalingFactors[PScaleIndex] + glip1*scalingFactors[DScaleIndex];
temp5 = (3.0f*rr1*rr1)*( gli2*scalingFactors[PScaleIndex] + glip2*scalingFactors[DScaleIndex]);
temp7 = (15.0f*rr3*rr1)*(gli3*scalingFactors[PScaleIndex] + glip3*scalingFactors[DScaleIndex]);
#else
temp3 = gli1;
temp5 = (3.0f*rr1*rr1)*gli2;
temp7 = (15.0f*rr3*rr1)*(gli3);
#endif
ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51 + temp7*ddsc71);
ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52 + temp7*ddsc72);
ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53 + temp7*ddsc73);
// Induced-induced contribution (uses scip2 and sci34); only for polarizationType == 0.
if( cAmoebaSim.polarizationType == 0 ){
#ifdef APPLY_SCALE
temp3 = scalingFactors[UScaleIndex]*scip2;
temp5 = -(3.0f*rr1*rr1)*scalingFactors[UScaleIndex]*sci34;
#else
temp3 = scip2;
temp5 = -(3.0f*rr1*rr1)*sci34;
#endif
ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51);
ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52);
ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53);
}
}
// Result is added to the caller-supplied array.
force[0] += ftm21;
force[1] += ftm22;
force[2] += ftm23;
// Disabled: earlier versions accumulated into atomI/atomJ.force or forceTorqueEnergy.
/*
if( forceFactor == 1.0f ){
atomJ.force[0] -= ftm21;
atomJ.force[1] -= ftm22;
atomJ.force[2] -= ftm23;
}
atomI.force[0] += ftm21;
atomI.force[1] += ftm22;
atomI.force[2] += ftm23;
*/
/*
forceTorqueEnergy->x += ftm21;
forceTorqueEnergy->y += ftm22;
forceTorqueEnergy->z += ftm23;
*/
return;
}
// Device helper for a single atomI/atomJ pair. The final function name is produced by the
// SUB_METHOD_NAME macro, which is defined outside this chunk.
//
// Builds a 3-vector (ttm21, ttm22, ttm23) from cross products of the permanent multipoles
// (q, labFrameDipole, labFrameQuadrupole) of both atoms and the separation vector, and ADDS
// it to atomI.torque[0..2]. atomJ is only read. No energy is produced here.
//
// Parameters:
//   atomI          - labFrameDipole and labFrameQuadrupole are read; torque[0..2] is updated.
//   atomJ          - q, labFrameDipole and labFrameQuadrupole are read.
//   delta          - x/y/z are used as xr/yr/zr; w is read as rr1 only when APPLY_SCALE is
//                    defined (presumably 1/r -- TODO confirm against the caller).
//   bn             - x/y/z/w are used as the coefficients bn1..bn4.
//   scalingFactors - (APPLY_SCALE builds only) read at index MScaleIndex.
static __device__ void SUB_METHOD_NAME( calculatePmeDirectElectrostaticPairIxnT1, _kernel )(
PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
const float4 delta, const float4 bn
#ifdef APPLY_SCALE
, const float* scalingFactors
#endif
){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
#ifdef APPLY_SCALE
float rr1 = delta.w;
#endif
// set the permanent multipole values; atom I's charge is not needed in this function
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
// Last diagonal element is derived from the other two instead of being read from index 5.
//float qi9 = atomI.labFrameQuadrupole[5];
float qi9 = -(atomI.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[3]);
float ck = atomJ.q;
float dk1 = atomJ.labFrameDipole[0];
float dk2 = atomJ.labFrameDipole[1];
float dk3 = atomJ.labFrameDipole[2];
float qk1 = atomJ.labFrameQuadrupole[0];
float qk2 = atomJ.labFrameQuadrupole[1];
float qk3 = atomJ.labFrameQuadrupole[2];
float qk5 = atomJ.labFrameQuadrupole[3];
float qk6 = atomJ.labFrameQuadrupole[4];
//float qk9 = atomJ.labFrameQuadrupole[5];
float qk9 = -(atomJ.labFrameQuadrupole[0] + atomJ.labFrameQuadrupole[3]);
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
float bn4 = bn.w;
// APPLY_SCALE builds: rr3/rr5/rr7/rr9 are rr1^3, 3*rr1^5, 15*rr1^7, 105*rr1^9 and
// scale = 1 - scalingFactors[MScaleIndex]; each bn term below is reduced by scale*rrN.
// (No damping function is evaluated in this function.)
#ifdef APPLY_SCALE
float rr2 = rr1*rr1;
float rr3 = rr1*rr2;
float rr5 = 3.0f*rr3*rr2;
float rr7 = 5.0f*rr5*rr2;
float rr9 = 7.0f*rr7*rr2;
float scale = 1.0f-scalingFactors[MScaleIndex];
float prefactor = scale*rr3 - bn1;
#else
float prefactor = -bn1;
#endif
// prefactor is a scratch variable that is reassigned for each group of terms.
// dipole(I) x dipole(J)
float dixdk1 = di2*dk3 - di3*dk2;
float ttm21 = prefactor*dixdk1;
float dixdk2 = di3*dk1 - di1*dk3;
float ttm22 = prefactor*dixdk2;
float dixdk3 = di1*dk2 - di2*dk1;
float ttm23 = prefactor*dixdk3;
// quadrupoles applied to the separation vector
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
float qkr1 = qk1*xr + qk2*yr + qk3*zr;
float qkr2 = qk2*xr + qk5*yr + qk6*zr;
float qkr3 = qk3*xr + qk6*yr + qk9*zr;
float qiqkr1 = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
float qiqkr2 = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
float qiqkr3 = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;
float rxqikr1 = yr*qiqkr3 - zr*qiqkr2;
float qkrxqir1 = qkr2*qir3 - qkr3*qir2;
#ifdef APPLY_SCALE
prefactor = 4.0f*(bn3 - scale*rr7);
#else
prefactor = 4.0f*bn3;
#endif
ttm21 -= prefactor*(rxqikr1+qkrxqir1);
float rxqikr2 = zr*qiqkr1 - xr*qiqkr3;
float qkrxqir2 = qkr3*qir1 - qkr1*qir3;
ttm22 -= prefactor*(rxqikr2+qkrxqir2);
float rxqikr3 = xr*qiqkr2 - yr*qiqkr1;
float qkrxqir3 = qkr1*qir2 - qkr2*qir1;
ttm23 -= prefactor*(rxqikr3+qkrxqir3);
float qidk1 = qi1*dk1 + qi2*dk2 + qi3*dk3;
float qidk2 = qi2*dk1 + qi5*dk2 + qi6*dk3;
float qidk3 = qi3*dk1 + qi6*dk2 + qi9*dk3;
float dixqkr1 = di2*qkr3 - di3*qkr2;
float dkxqir1 = dk2*qir3 - dk3*qir2;
float rxqidk1 = yr*qidk3 - zr*qidk2;
// qixqk*: antisymmetric combination of the two quadrupole tensors, one component per axis
float qixqk1 = qi2*qk3 + qi5*qk6 + qi6*qk9 - qi3*qk2 - qi6*qk5 - qi9*qk6;
#ifdef APPLY_SCALE
prefactor = 2.0f*(bn2 - scale*rr5);
#else
prefactor = 2.0f*bn2;
#endif
ttm21 += prefactor*(dixqkr1+dkxqir1+rxqidk1-2.0f*qixqk1);
float dixqkr2 = di3*qkr1 - di1*qkr3;
float dkxqir2 = dk3*qir1 - dk1*qir3;
float rxqidk2 = zr*qidk1 - xr*qidk3;
float qixqk2 = qi3*qk1 + qi6*qk2 + qi9*qk3 - qi1*qk3 - qi2*qk6 - qi3*qk9;
ttm22 += prefactor*(dixqkr2+dkxqir2+rxqidk2-2.0f*qixqk2);
float dixqkr3 = di1*qkr2 - di2*qkr1;
float dkxqir3 = dk1*qir2 - dk2*qir1;
float rxqidk3 = xr*qidk2 - yr*qidk1;
float qixqk3 = qi1*qk2 + qi2*qk5 + qi3*qk6 - qi2*qk1 - qi5*qk2 - qi6*qk3;
ttm23 += prefactor*(dixqkr3+dkxqir3+rxqidk3-2.0f*qixqk3);
float sc4 = dk1*xr + dk2*yr + dk3*zr;
float sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
float gf2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
#ifdef APPLY_SCALE
float gfr2 = -ck*rr3 + sc4*rr5 - sc6*rr7;
prefactor = (gf2 - scale*gfr2);
#else
prefactor = gf2;
#endif
// dipole(I) x separation vector
ttm21 += prefactor*(di2*zr - di3*yr);
ttm22 += prefactor*(di3*xr - di1*zr);
ttm23 += prefactor*(di1*yr - di2*xr);
float gf5 = (-ck*bn2+sc4*bn3-sc6*bn4);
#ifdef APPLY_SCALE
float gfr5 = (-ck*rr5+sc4*rr7-sc6*rr9);
prefactor = 2.0f*(gf5 - scale*gfr5);
#else
prefactor = 2.0f*gf5;
#endif
// separation vector x qir
float rxqir1 = yr*qir3 - zr*qir2;
float rxqir2 = zr*qir1 - xr*qir3;
float rxqir3 = xr*qir2 - yr*qir1;
ttm21 -= prefactor*rxqir1;
ttm22 -= prefactor*rxqir2;
ttm23 -= prefactor*rxqir3;
// Accumulate directly into atom I's torque.
atomI.torque[0] += ttm21;
atomI.torque[1] += ttm22;
atomI.torque[2] += ttm23;
// Disabled: alternative that wrote to a separate torque[] output array.
/*
torque[0] = ttm21;
torque[1] = ttm22;
torque[2] = ttm23;
*/
return;
}
// Device helper for a single atomI/atomJ pair. The final function name is produced by the
// SUB_METHOD_NAME macro, which is defined outside this chunk.
//
// Builds a 3-vector (ttm2i1, ttm2i2, ttm2i3) from atom I's permanent dipole/quadrupole and
// atom J's induced dipoles (inducedDipole and inducedDipoleP), and ADDS it to
// atomI.torque[0..2]. No energy is produced here.
//
// Parameters:
//   atomI          - labFrameDipole, labFrameQuadrupole, damp and thole are read;
//                    torque[0..2] is updated.
//   atomJ          - inducedDipole, inducedDipoleP, damp and thole are read.
//   delta          - x/y/z are used as xr/yr/zr; w is used as rr1 (presumably 1/r -- TODO confirm).
//   bn             - x/y/z are used as the coefficients bn1..bn3; bn.w is not read.
//   scalingFactors - (APPLY_SCALE builds only) read at DScaleIndex and PScaleIndex.
static __device__ void SUB_METHOD_NAME( calculatePmeDirectElectrostaticPairIxnT2, _kernel)(
PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
const float4 delta, const float4 bn
#ifdef APPLY_SCALE
, const float* scalingFactors
#endif
){
float xr = delta.x;
float yr = delta.y;
float zr = delta.z;
float rr1 = delta.w;
// set the permanent multipole and induced dipole values;
float di1 = atomI.labFrameDipole[0];
float di2 = atomI.labFrameDipole[1];
float di3 = atomI.labFrameDipole[2];
float qi1 = atomI.labFrameQuadrupole[0];
float qi2 = atomI.labFrameQuadrupole[1];
float qi3 = atomI.labFrameQuadrupole[2];
float qi5 = atomI.labFrameQuadrupole[3];
float qi6 = atomI.labFrameQuadrupole[4];
// Last diagonal element is derived from the other two instead of being read from index 5.
//float qi9 = atomI.labFrameQuadrupole[5];
float qi9 = -(atomI.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[3]);
float bn1 = bn.x;
float bn2 = bn.y;
float bn3 = bn.z;
// apply Thole polarization damping to scale factors
// The scale factors stay at 1.0 unless the product of the per-atom damp values is non-zero
// AND the resulting exponent is greater than -50 (expf is skipped otherwise).
float scale3 = 1.0f;
float scale5 = 1.0f;
float scale7 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
float ratio = 1.0f/(rr1*damp);
damp = -pgamma*ratio*ratio*ratio;
if( damp > -50.0f ){
float expdamp = expf(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - (1.0f-damp)*expdamp;
scale7 = 1.0f - (1.0f-damp+0.6f*damp*damp)*expdamp;
}
}
float rr3 = rr1*rr1*rr1;
// psc*/dsc* = (rr1^3, 3*rr1^5, 15*rr1^7) * (1 - scale * scalingFactor); the dsc* variants exist
// only in APPLY_SCALE builds, otherwise psc* is applied to both induced-dipole arrays.
#ifdef APPLY_SCALE
float dsc3 = rr3*(1.0f - scale3*scalingFactors[DScaleIndex]);
float dsc5 = (3.0f*rr3*rr1*rr1)* (1.0f - scale5*scalingFactors[DScaleIndex]);
float dsc7 = (15.0f*rr3*rr3*rr1)*(1.0f - scale7*scalingFactors[DScaleIndex]);
float psc3 = rr3*(1.0f - scale3*scalingFactors[PScaleIndex]);
float psc5 = (3.0f*rr3*rr1*rr1)*(1.0f - scale5*scalingFactors[PScaleIndex]);
float psc7 = (15.0f*rr3*rr3*rr1)*(1.0f - scale7*scalingFactors[PScaleIndex]);
#else
float psc3 = rr3*(1.0f - scale3);
float psc5 = (3.0f*rr3*rr1*rr1)*(1.0f - scale5);
float psc7 = (15.0f*rr3*rr3*rr1)*(1.0f - scale7);
#endif
// prefactor1 (and prefactor2 in APPLY_SCALE builds) are scratch values reassigned below;
// prefactor1 weights the inducedDipole terms and prefactor2 the inducedDipoleP terms.
float prefactor1 = 0.5f*(psc3 - bn1);
#ifdef APPLY_SCALE
float prefactor2 = 0.5f*(dsc3 - bn1);
#endif
// dipole(I) x induced dipole(J)
float dixuk1 = di2*atomJ.inducedDipole[2] - di3*atomJ.inducedDipole[1];
float dixukp1 = di2*atomJ.inducedDipoleP[2] - di3*atomJ.inducedDipoleP[1];
#ifdef APPLY_SCALE
float ttm2i1 = prefactor1*dixuk1 + prefactor2*dixukp1;
#else
float ttm2i1 = prefactor1*(dixuk1 + dixukp1);
#endif
float dixuk2 = di3*atomJ.inducedDipole[0] - di1*atomJ.inducedDipole[2];
float dixukp2 = di3*atomJ.inducedDipoleP[0] - di1*atomJ.inducedDipoleP[2];
#ifdef APPLY_SCALE
float ttm2i2 = prefactor1*dixuk2 + prefactor2*dixukp2;
#else
float ttm2i2 = prefactor1*(dixuk2 + dixukp2);
#endif
float dixuk3 = di1*atomJ.inducedDipole[1] - di2*atomJ.inducedDipole[0];
float dixukp3 = di1*atomJ.inducedDipoleP[1] - di2*atomJ.inducedDipoleP[0];
#ifdef APPLY_SCALE
float ttm2i3 = prefactor1*dixuk3 + prefactor2*dixukp3;
#else
float ttm2i3 = prefactor1*(dixuk3 + dixukp3);
#endif
// projections of atom J's induced dipoles onto the separation vector
float sci4 = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float scip4 = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
float gti2 = bn2*(sci4+scip4);
#ifdef APPLY_SCALE
float gtri2 = (sci4*psc5+scip4*dsc5);
#else
float gtri2 = psc5*(sci4+scip4);
#endif
prefactor1 = 0.5f*(gti2 - gtri2);
// dipole(I) x separation vector
ttm2i1 += prefactor1*( di2*zr - di3*yr );
ttm2i2 += prefactor1*( di3*xr - di1*zr );
ttm2i3 += prefactor1*( di1*yr - di2*xr );
float qir1 = qi1*xr + qi2*yr + qi3*zr;
float qir2 = qi2*xr + qi5*yr + qi6*zr;
float qir3 = qi3*xr + qi6*yr + qi9*zr;
#ifdef APPLY_SCALE
prefactor1 = sci4*psc7 + scip4*dsc7 - bn3*(sci4+scip4);
#else
prefactor1 = psc7*(sci4+scip4) - bn3*(sci4+scip4);
#endif
// separation vector x qir
ttm2i1 += prefactor1*( yr*qir3 - zr*qir2 );
ttm2i2 += prefactor1*( zr*qir1 - xr*qir3 );
ttm2i3 += prefactor1*( xr*qir2 - yr*qir1 );
// quadrupole(I) applied to atom J's induced dipoles
float qiuk1 = qi1*atomJ.inducedDipole[0] + qi2*atomJ.inducedDipole[1] + qi3*atomJ.inducedDipole[2];
float qiuk2 = qi2*atomJ.inducedDipole[0] + qi5*atomJ.inducedDipole[1] + qi6*atomJ.inducedDipole[2];
float qiuk3 = qi3*atomJ.inducedDipole[0] + qi6*atomJ.inducedDipole[1] + qi9*atomJ.inducedDipole[2];
float qiukp1 = qi1*atomJ.inducedDipoleP[0] + qi2*atomJ.inducedDipoleP[1] + qi3*atomJ.inducedDipoleP[2];
float qiukp2 = qi2*atomJ.inducedDipoleP[0] + qi5*atomJ.inducedDipoleP[1] + qi6*atomJ.inducedDipoleP[2];
float qiukp3 = qi3*atomJ.inducedDipoleP[0] + qi6*atomJ.inducedDipoleP[1] + qi9*atomJ.inducedDipoleP[2];
prefactor1 = (bn2 - psc5);
#ifdef APPLY_SCALE
prefactor2 = (bn2 - dsc5);
#endif
// induced dipole(J) x qir, and separation vector x qiuk, one axis at a time
float ukxqir1 = atomJ.inducedDipole[1]*qir3 - atomJ.inducedDipole[2]*qir2;
float ukxqirp1 = atomJ.inducedDipoleP[1]*qir3 - atomJ.inducedDipoleP[2]*qir2;
float rxqiuk1 = yr*qiuk3 - zr*qiuk2;
float rxqiukp1 = yr*qiukp3 - zr*qiukp2;
#ifdef APPLY_SCALE
ttm2i1 += prefactor1*(ukxqir1 + rxqiuk1) + prefactor2*(ukxqirp1 + rxqiukp1);
#else
ttm2i1 += prefactor1*( ukxqir1 + rxqiuk1 + ukxqirp1 + rxqiukp1 );
#endif
float ukxqir2 = atomJ.inducedDipole[2]*qir1 - atomJ.inducedDipole[0]*qir3;
float ukxqirp2 = atomJ.inducedDipoleP[2]*qir1 - atomJ.inducedDipoleP[0]*qir3;
float rxqiuk2 = zr*qiuk1 - xr*qiuk3;
float rxqiukp2 = zr*qiukp1 - xr*qiukp3;
#ifdef APPLY_SCALE
ttm2i2 += prefactor1*(ukxqir2 + rxqiuk2) + prefactor2*(ukxqirp2 + rxqiukp2);
#else
ttm2i2 += prefactor1*( ukxqir2 + rxqiuk2 + ukxqirp2 + rxqiukp2 );
#endif
float ukxqir3 = atomJ.inducedDipole[0]*qir2 - atomJ.inducedDipole[1]*qir1;
float ukxqirp3 = atomJ.inducedDipoleP[0]*qir2 - atomJ.inducedDipoleP[1]*qir1;
float rxqiuk3 = xr*qiuk2 - yr*qiuk1;
float rxqiukp3 = xr*qiukp2 - yr*qiukp1;
#ifdef APPLY_SCALE
ttm2i3 += prefactor1*(ukxqir3 + rxqiuk3) + prefactor2*(ukxqirp3 + rxqiukp3);
#else
ttm2i3 += prefactor1*(ukxqir3 + rxqiuk3 + ukxqirp3 + rxqiukp3 );
#endif
// Accumulate directly into atom I's torque.
atomI.torque[0] += ttm2i1;
atomI.torque[1] += ttm2i2;
atomI.torque[2] += ttm2i3;
// Disabled: alternative that accumulated into a separate torque[] output array.
/*
torque[0] += ttm2i1;
torque[1] += ttm2i2;
torque[2] += ttm2i3;
*/
return;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
// File-local copies of the simulation parameter structs in CUDA constant memory.
// They are filled from the host by SetCalculateAmoebaCudaPmeFixedEFieldSim() and read back by
// GetCalculateAmoebaCudaPmeFixedEFieldSim(); kernels in this file read them directly.
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
// Upload the host-side simulation descriptors into this file's __constant__
// copies (cSim and cAmoebaSim). RTERROR is invoked after each copy with the
// returned status.
void SetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu)
{
    // base simulation parameters -> cSim
    cudaError_t status = cudaMemcpyToSymbol(cSim, &amoebaGpu->gpuContext->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");

    // AMOEBA-specific parameters -> cAmoebaSim
    status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
    RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}
// Read this file's __constant__ copies (cSim and cAmoebaSim) back into the
// host-side simulation descriptors. RTERROR is invoked after each copy with
// the returned status.
void GetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu)
{
    // cSim -> base simulation parameters
    cudaError_t status = cudaMemcpyFromSymbol(&amoebaGpu->gpuContext->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");

    // cAmoebaSim -> AMOEBA-specific parameters
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
    RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
// Reduce `outputBuffers` partial field buffers (each `fieldComponents` floats
// long, stored back to back in fieldIn) into fieldOut, adding the matching
// EFieldReciprocal entry and the self term (term * lab-frame dipole component).
// fieldOut is overwritten. Every thread strides over the components by the
// total number of launched threads.
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* EFieldReciprocal, float* fieldIn, float* fieldOut )
{
    // coefficient of the self term: (4/3) * alphaEwald^3 / sqrt(pi)
    const float selfCoefficient = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;

    for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < fieldComponents; idx += gridDim.x * blockDim.x)
    {
        // start from reciprocal-space field plus self term
        float sum = EFieldReciprocal[idx] + selfCoefficient*cAmoebaSim.pLabFrameDipole[idx];

        float* buffer = fieldIn + idx;
        unsigned int remaining = outputBuffers;

        // four buffers at a time
        for (; remaining >= 4; remaining -= 4)
        {
            sum += buffer[0] + buffer[fieldComponents] + buffer[2*fieldComponents] + buffer[3*fieldComponents];
            buffer += fieldComponents*4;
        }

        // then a pair, if at least two are left
        if (remaining >= 2)
        {
            sum += buffer[0] + buffer[fieldComponents];
            buffer += fieldComponents*2;
            remaining -= 2;
        }

        // then the final odd buffer
        if (remaining > 0)
        {
            sum += buffer[0];
        }

        fieldOut[idx] = sum;
    }
}
// Reduce `outputBuffers` partial field buffers (each `fieldComponents` floats
// long, stored back to back in fieldIn), add the self term
// (term * lab-frame dipole component) and ACCUMULATE the result into fieldOut,
// i.e. whatever fieldOut already holds is kept and added to.
// Every thread strides over the components by the total number of launched threads.
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn, float* fieldOut )
{
    // coefficient of the self term: (4/3) * alphaEwald^3 / sqrt(pi)
    const float selfCoefficient = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;

    for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < fieldComponents; idx += gridDim.x * blockDim.x)
    {
        // start from the self term
        float sum = selfCoefficient*cAmoebaSim.pLabFrameDipole[idx];

        float* buffer = fieldIn + idx;
        unsigned int remaining = outputBuffers;

        // four buffers at a time
        for (; remaining >= 4; remaining -= 4)
        {
            sum += buffer[0] + buffer[fieldComponents] + buffer[2*fieldComponents] + buffer[3*fieldComponents];
            buffer += fieldComponents*4;
        }

        // then a pair, if at least two are left
        if (remaining >= 2)
        {
            sum += buffer[0] + buffer[fieldComponents];
            buffer += fieldComponents*2;
            remaining -= 2;
        }

        // then the final odd buffer
        if (remaining > 0)
        {
            sum += buffer[0];
        }

        fieldOut[idx] += sum;
    }
}
// Reduce the per-buffer direct-space work arrays into the field arrays:
//   psWorkArray_3_2 (+ psE_Field + self term) -> psE_FieldPolar
//   psWorkArray_3_1 (+ self term)             -> accumulated into psE_Field
// The polar reduction must be launched first: it reads psE_Field as its
// reciprocal-space input, and the second kernel then accumulates into psE_Field.
static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
{
    gpuContext gpu = amoebaGpu->gpuContext;

    const unsigned int fieldComponents = gpu->sim.paddedNumberOfAtoms*3;
    const unsigned int bufferCount     = gpu->sim.outputBuffers;

    float* eField          = amoebaGpu->psE_Field->_pDevData;
    float* eFieldPolar     = amoebaGpu->psE_FieldPolar->_pDevData;
    float* directField     = amoebaGpu->psWorkArray_3_1->_pDevData;
    float* directFieldPolar = amoebaGpu->psWorkArray_3_2->_pDevData;

    // E_FieldPolar = E_Field (reciprocal) + E_FieldPolar (direct) + self
    kReducePmeEFieldPolar_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                                   fieldComponents, bufferCount,
                                   eField, directFieldPolar, eFieldPolar );
    LAUNCHERROR("kReducePmeE_Fields1");

    // E_Field = E_Field (reciprocal) + E_Field (direct) + self
    kReducePmeEField_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                              fieldComponents, bufferCount,
                              directField, eField );
    LAUNCHERROR("kReducePmeE_Fields2");
}
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
#undef GK
#undef INCLUDE_FIXED_FIELD_BUFFERS
#define INCLUDE_FIXED_FIELD_BUFFERS
#include "kCalculateAmoebaCudaFixedFieldParticle.h"
#undef INCLUDE_FIXED_FIELD_BUFFERS
// Add atomJ's scratch accumulators (tempBuffer, then tempBufferP) component by
// component into atomI's. atomJ is only read.
__device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& atomJ ){
    for( int component = 0; component < 3; component++ ){
        atomI.tempBuffer[component] += atomJ.tempBuffer[component];
    }
    for( int component = 0; component < 3; component++ ){
        atomI.tempBufferP[component] += atomJ.tempBufferP[component];
    }
}
// Real-space (direct) Ewald contribution to the fixed electric field for one
// atom pair.
//
// @param atomI, atomJ  the two particles (position, charge, lab-frame dipole and
//                      quadrupole, damp and thole parameters are read)
// @param dscale        scale factor applied to the damping terms for the
//                      .x/.y outputs
// @param pscale        scale factor applied to the damping terms for the
//                      .z/.w outputs
// @param fields        output, one float4 per Cartesian component k = 0,1,2:
//                        fields[k].x  built from atomJ's multipoles, dscale
//                        fields[k].y  built from atomI's multipoles, dscale
//                        fields[k].z  built from atomJ's multipoles, pscale
//                        fields[k].w  built from atomI's multipoles, pscale
//                      All twelve values are set to 0 when the pair is beyond
//                      the cutoff.
//
// The output is always overwritten, never accumulated into.
// Nothing guards against r2 == 0 (atomI and atomJ at the same position, e.g.
// the same atom): the divisions by r/r2 then produce inf/nan, so callers must
// discard that result themselves.
__device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
float dscale, float pscale, float4 fields[3]){
// compute the real space portion of the Ewald summation
// displacement from atomI to atomJ
float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y;
float zr = atomJ.z - atomI.z;
// periodic boundary conditions
// (shift each component by a whole number of box lengths, rounding to nearest)
xr -= floorf(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
yr -= floorf(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
zr -= floorf(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr*yr + zr*zr;
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrtf(r2);
// calculate the error function damping terms
// bn0..bn3 are built by recurrence: each step multiplies alsq2n by alsq2 and divides by r2
float ralpha = cSim.alphaEwald*r;
float bn0 = erfcf(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = expf(-(ralpha*ralpha));
alsq2n *= alsq2;
float bn1 = (bn0+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
float bn3 = (5.0f*bn2+alsq2n*exp2a)/r2;
// compute the error function scaled and unscaled terms
// scale3/5/7 stay at 1 (no damping) unless both damp factors are nonzero
float scale3 = 1.0f;
float scale5 = 1.0f;
float scale7 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float ratio = (r/damp);
ratio = ratio*ratio*ratio;
// use the smaller of the two thole values
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
// from here on 'damp' holds the (negative) exponent, not the product of damp factors
damp = -pgamma*ratio;
// exponents at or below -50 are treated as fully undamped (scales remain 1)
if( damp > -50.0f) {
float expdamp = expf(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - expdamp*(1.0f-damp);
scale7 = 1.0f - expdamp*(1.0f-damp+(0.6f*damp*damp));
}
}
float dsc3 = dscale*scale3;
float dsc5 = dscale*scale5;
float dsc7 = dscale*scale7;
float psc3 = pscale*scale3;
float psc5 = pscale*scale5;
float psc7 = pscale*scale7;
float r3 = (r*r2);
float r5 = (r3*r2);
float r7 = (r5*r2);
// d-scaled and p-scaled radial factors that are subtracted from the Ewald terms below
float drr3 = (1.0f-dsc3)/r3;
float drr5 = 3.0f * (1.0f-dsc5)/r5;
float drr7 = 15.0f * (1.0f-dsc7)/r7;
float prr3 = (1.0f-psc3) / r3;
float prr5 = 3.0f *(1.0f-psc5)/r5;
float prr7 = 15.0f*(1.0f-psc7)/r7;
// contractions of atomI's dipole and quadrupole with the displacement
float dir = atomI.labFrameDipole_X*xr + atomI.labFrameDipole_Y*yr + atomI.labFrameDipole_Z*zr;
float qix = atomI.labFrameQuadrupole_XX*xr + atomI.labFrameQuadrupole_XY*yr + atomI.labFrameQuadrupole_XZ*zr;
float qiy = atomI.labFrameQuadrupole_XY*xr + atomI.labFrameQuadrupole_YY*yr + atomI.labFrameQuadrupole_YZ*zr;
float qiz = atomI.labFrameQuadrupole_XZ*xr + atomI.labFrameQuadrupole_YZ*yr + atomI.labFrameQuadrupole_ZZ*zr;
float qir = qix*xr + qiy*yr + qiz*zr;
// contractions of atomJ's dipole and quadrupole with the displacement
float dkr = atomJ.labFrameDipole_X*xr + atomJ.labFrameDipole_Y*yr + atomJ.labFrameDipole_Z*zr;
float qkx = atomJ.labFrameQuadrupole_XX*xr + atomJ.labFrameQuadrupole_XY*yr + atomJ.labFrameQuadrupole_XZ*zr;
float qky = atomJ.labFrameQuadrupole_XY*xr + atomJ.labFrameQuadrupole_YY*yr + atomJ.labFrameQuadrupole_YZ*zr;
float qkz = atomJ.labFrameQuadrupole_XZ*xr + atomJ.labFrameQuadrupole_YZ*yr + atomJ.labFrameQuadrupole_ZZ*zr;
float qkr = qkx*xr + qky*yr + qkz*zr;
// fim*/fkm*: terms using the Ewald bn factors (from atomJ's / atomI's multipoles)
float fim0 = -xr*(bn1*atomJ.q-bn2*dkr+bn3*qkr) - bn1*atomJ.labFrameDipole_X + 2.0f*bn2*qkx;
float fim1 = -yr*(bn1*atomJ.q-bn2*dkr+bn3*qkr) - bn1*atomJ.labFrameDipole_Y + 2.0f*bn2*qky;
float fim2 = -zr*(bn1*atomJ.q-bn2*dkr+bn3*qkr) - bn1*atomJ.labFrameDipole_Z + 2.0f*bn2*qkz;
float fkm0 = xr*(bn1*atomI.q+bn2*dir+bn3*qir) - bn1*atomI.labFrameDipole_X - 2.0f*bn2*qix;
float fkm1 = yr*(bn1*atomI.q+bn2*dir+bn3*qir) - bn1*atomI.labFrameDipole_Y - 2.0f*bn2*qiy;
float fkm2 = zr*(bn1*atomI.q+bn2*dir+bn3*qir) - bn1*atomI.labFrameDipole_Z - 2.0f*bn2*qiz;
// fid*/fkd*: same form with the d-scaled factors drr3/drr5/drr7
float fid0 = -xr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_X + 2.0f*drr5*qkx;
float fid1 = -yr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_Y + 2.0f*drr5*qky;
float fid2 = -zr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_Z + 2.0f*drr5*qkz;
float fkd0 = xr*(drr3*atomI.q+drr5*dir+drr7*qir) - drr3*atomI.labFrameDipole_X - 2.0f*drr5*qix;
float fkd1 = yr*(drr3*atomI.q+drr5*dir+drr7*qir) - drr3*atomI.labFrameDipole_Y - 2.0f*drr5*qiy;
float fkd2 = zr*(drr3*atomI.q+drr5*dir+drr7*qir) - drr3*atomI.labFrameDipole_Z - 2.0f*drr5*qiz;
// fip*/fkp*: same form with the p-scaled factors prr3/prr5/prr7
float fip0 = -xr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_X + 2.0f*prr5*qkx;
float fip1 = -yr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_Y + 2.0f*prr5*qky;
float fip2 = -zr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_Z + 2.0f*prr5*qkz;
float fkp0 = xr*(prr3*atomI.q+prr5*dir+prr7*qir) - prr3*atomI.labFrameDipole_X - 2.0f*prr5*qix;
float fkp1 = yr*(prr3*atomI.q+prr5*dir+prr7*qir) - prr3*atomI.labFrameDipole_Y - 2.0f*prr5*qiy;
float fkp2 = zr*(prr3*atomI.q+prr5*dir+prr7*qir) - prr3*atomI.labFrameDipole_Z - 2.0f*prr5*qiz;
// store this interaction's field contributions (the caller does the accumulation)
fields[0].x = fim0 - fid0;
fields[1].x = fim1 - fid1;
fields[2].x = fim2 - fid2;
fields[0].y = fkm0 - fkd0;
fields[1].y = fkm1 - fkd1;
fields[2].y = fkm2 - fkd2;
fields[0].z = fim0 - fip0;
fields[1].z = fim1 - fip1;
fields[2].z = fim2 - fip2;
fields[0].w = fkm0 - fkp0;
fields[1].w = fkm1 - fkp1;
fields[2].w = fkm2 - fkp2;
} else {
// pair is beyond the cutoff: no contribution
fields[0].x = 0.0f;
fields[0].y = 0.0f;
fields[0].z = 0.0f;
fields[0].w = 0.0f;
fields[1].x = 0.0f;
fields[1].y = 0.0f;
fields[1].z = 0.0f;
fields[1].w = 0.0f;
fields[2].x = 0.0f;
fields[2].y = 0.0f;
fields[2].z = 0.0f;
fields[2].w = 0.0f;
}
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateAmoebaCudaPmeFixedEField.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateAmoebaCudaPmeFixedEField.h"
/**---------------------------------------------------------------------------------------
Report whether a number is a nan or infinity
@param number number to test
@return 1 if number is nan or infinity; else return 0
--------------------------------------------------------------------------------------- */
/**---------------------------------------------------------------------------------------
   Compute the direct-space part of the fixed electric field using PME:
   clear the work arrays, run the pairwise kernel (per-warp or per-tile output
   buffering, selected by gpu->bOutputBufferPerWarp) and reduce the results.
   @param amoebaGpu        amoebaGpu context
   --------------------------------------------------------------------------------------- */
static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
{
    // chosen on the first call and reused afterwards (0 == not chosen yet)
    static unsigned int threadsPerBlock = 0;

    gpuContext gpu = amoebaGpu->gpuContext;

    kClearFields_3( amoebaGpu, 2 );

    if( threadsPerBlock == 0 ){
        // upper limit depends on the device generation
        unsigned int maxThreads = 64;
        if (gpu->sm_version >= SM_20) {
            maxThreads = 384;
        } else if (gpu->sm_version >= SM_12) {
            maxThreads = 192;
        }
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }

    float* fieldBuffer      = amoebaGpu->psWorkArray_3_1->_pDevData;
    float* fieldPolarBuffer = amoebaGpu->psWorkArray_3_2->_pDevData;

    // one FixedFieldParticle of dynamic shared memory per thread
    if (!gpu->bOutputBufferPerWarp){
        kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                             gpu->sim.pInteractingWorkUnit,
                                                             fieldBuffer,
                                                             fieldPolarBuffer );
    } else {
        kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                             gpu->sim.pInteractingWorkUnit,
                                                             fieldBuffer,
                                                             fieldPolarBuffer );
    }
    LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");

    kReducePmeDirectE_Fields( amoebaGpu );
}
// Entry point for the PME fixed electric field: runs
// kCalculateAmoebaPMEFixedMultipoles first, then the direct-space
// calculation and reduction.
void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
{
    // step 1: kCalculateAmoebaPMEFixedMultipoles
    kCalculateAmoebaPMEFixedMultipoles( amoebaGpu );

    // step 2: direct-space field + reduction
    cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaScaleFactors.h"
// Direct-space fixed E-field kernel over a list of tile work units.
//
// The final kernel name is produced by METHOD_NAME, so this body is compiled
// once per including configuration (USE_OUTPUT_BUFFER_PER_WARP selects how the
// output offset is formed).
//
// @param workUnit           encoded tiles; each entry is decoded by decodeCell
//                           into (x, y, exclusion flag)
// @param outputEField       output buffer(s) for the field sums
// @param outputEFieldPolar  output buffer(s) for the polar field sums
//
// Launch requirements visible in this body:
//   - dynamic shared memory holding at least one FixedFieldParticle per thread
//     (sA is indexed by threadIdx.x)
//   - threads are grouped in runs of GRID consecutive threads ("warp" below);
//     each group handles work units [pos, end) out of cSim.pInteractionCount[0]
//   - NOTE(review): the shared-memory reduction below uses fixed steps
//     1,2,4,8 and +16, so it presumably requires GRID == 32 -- confirm
//   - NOTE(review): lanes exchange data through shared memory with no explicit
//     barrier; this presumably relies on the lanes of a group executing in
//     lockstep -- confirm for the target architecture
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(384, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(192, 1)
#else
__launch_bounds__(64, 1)
#endif
void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
unsigned int* workUnit,
float* outputEField,
float* outputEFieldPolar){
extern __shared__ FixedFieldParticle sA[];
// split the work units evenly over all thread groups
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
// y of the previous off-diagonal tile; 0xFFFFFFFF == none yet
unsigned int lasty = 0xFFFFFFFF;
while (pos < end)
{
unsigned int x;
unsigned int y;
bool bExclusionFlag;
float dScaleValue;
float pScaleValue;
int dScaleMask;
int2 pScaleMask;
// extract cell coordinates
decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
// tgx: lane within the group; tbx: first thread of this group within the block
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
// psA: this group's slice of the shared particle array
FixedFieldParticle* psA = &sA[tbx];
unsigned int atomI = x + tgx;
// each lane keeps its own atom (x + tgx) in a local copy
FixedFieldParticle localParticle;
loadFixedFieldShared( &localParticle, atomI );
float fieldSum[3];
float fieldPolarSum[3];
fieldSum[0] = 0.0f;
fieldSum[1] = 0.0f;
fieldSum[2] = 0.0f;
fieldPolarSum[0] = 0.0f;
fieldPolarSum[1] = 0.0f;
fieldPolarSum[2] = 0.0f;
if (x == y)
{
// diagonal tile: the shared slice holds the same atoms as the local copies
// load coordinates, charge, ...
loadFixedFieldShared( &(sA[threadIdx.x]), atomI );
if( bExclusionFlag ){
unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
// no exclusions in this tile: every pair uses scale factors of 1
dScaleValue = pScaleValue = 1.0f;
}
for (unsigned int j = 0; j < GRID; j++)
{
if( bExclusionFlag ){
getMaskedDScaleFactor( j, dScaleMask, &dScaleValue );
getMaskedPScaleFactor( j, pScaleMask, &pScaleValue );
}
float4 ijField[3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField);
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag
// (the flag also drops pairs where either index is >= cSim.atoms)
unsigned int match = ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 1 : 0;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum[0] += match ? 0.0f : ijField[0].x;
fieldSum[1] += match ? 0.0f : ijField[1].x;
fieldSum[2] += match ? 0.0f : ijField[2].x;
fieldPolarSum[0] += match ? 0.0f : ijField[0].z;
fieldPolarSum[1] += match ? 0.0f : ijField[1].z;
fieldPolarSum[2] += match ? 0.0f : ijField[2].z;
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputEField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputEFieldPolar );
#else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputEField );
load3dArray( offset, fieldPolarSum, outputEFieldPolar );
#endif
} else {
// off-diagonal tile: the shared slice holds the y atoms; reload only when y changed
// NOTE(review): lasty is only updated in this branch, so after a diagonal tile
// it still refers to the previous off-diagonal tile -- presumably safe given the
// work unit ordering; confirm
if (lasty != y ) {
// load coordinates, charge, ...
loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx) );
}
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0) {
// No interactions in this block.
} else {
// zero shared fields
zeroFixedFieldParticleSharedField( &(sA[threadIdx.x]) );
if( bExclusionFlag ) {
unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
dScaleValue = pScaleValue = 1.0f;
}
for (unsigned int j = 0; j < GRID; j++){
// bit j of flags enables iteration j
if ((flags&(1<<j)) != 0) {
// all bits set: each lane visits a different atom (tj rotates per iteration);
// otherwise every lane visits the same atom j
unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;
if( bExclusionFlag ){
getMaskedDScaleFactor( jIdx, dScaleMask, &dScaleValue );
getMaskedPScaleFactor( jIdx, pScaleMask, &pScaleValue );
}
float4 ijField[3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField);
// drop pairs where either index is >= cSim.atoms
unsigned int outOfBounds = ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 1 : 0;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum[0] += outOfBounds ? 0.0f : ijField[0].x;
fieldSum[1] += outOfBounds ? 0.0f : ijField[1].x;
fieldSum[2] += outOfBounds ? 0.0f : ijField[2].x;
fieldPolarSum[0] += outOfBounds ? 0.0f : ijField[0].z;
fieldPolarSum[1] += outOfBounds ? 0.0f : ijField[1].z;
fieldPolarSum[2] += outOfBounds ? 0.0f : ijField[2].z;
if( flags == 0xFFFFFFFF ){
// each lane targets a distinct shared entry, so it can add directly
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
psA[jIdx].eField[0] += outOfBounds ? 0.0f : ijField[0].y;
psA[jIdx].eField[1] += outOfBounds ? 0.0f : ijField[1].y;
psA[jIdx].eField[2] += outOfBounds ? 0.0f : ijField[2].y;
psA[jIdx].eFieldP[0] += outOfBounds ? 0.0f : ijField[0].w;
psA[jIdx].eFieldP[1] += outOfBounds ? 0.0f : ijField[1].w;
psA[jIdx].eFieldP[2] += outOfBounds ? 0.0f : ijField[2].w;
} else {
// all lanes target the same atom j: stage each lane's contribution in its own
// tempBuffer, sum pairwise across lanes (steps 1,2,4,8), then lane 0 adds the
// two remaining halves (offset 16) into atom j's shared field
sA[threadIdx.x].tempBuffer[0] = outOfBounds ? 0.0f : ijField[0].y;
sA[threadIdx.x].tempBuffer[1] = outOfBounds ? 0.0f : ijField[1].y;
sA[threadIdx.x].tempBuffer[2] = outOfBounds ? 0.0f : ijField[2].y;
sA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[0].w;
sA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[1].w;
sA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[2].w;
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].eField[0] += sA[threadIdx.x].tempBuffer[0] + sA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].eField[1] += sA[threadIdx.x].tempBuffer[1] + sA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].eField[2] += sA[threadIdx.x].tempBuffer[2] + sA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].eFieldP[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].eFieldP[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].eFieldP[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2];
}
}
}
// advance the rotating index on every iteration, enabled or not
tj = (tj + 1) & (GRID - 1);
} // j-loop block
// Write results
// (both the x-side sums held locally and the y-side sums held in shared memory)
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputEField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputEFieldPolar );
offset = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField, outputEField );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputEField );
load3dArray( offset, fieldPolarSum, outputEFieldPolar );
offset = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
load3dArray( offset, sA[threadIdx.x].eField, outputEField );
load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
#endif
} // end of pInteractionFlag block
lasty = y;
} // x == y block
pos++;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
#include "openmm/OpenMMException.h"
#include <stdio.h>
using namespace std;
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
// Upload the host-side simulation descriptors into this file's __constant__
// copies (cSim and cAmoebaSim). RTERROR is invoked after each copy with the
// returned status.
void SetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
{
    // base simulation parameters -> cSim
    cudaError_t status = cudaMemcpyToSymbol(cSim, &amoebaGpu->gpuContext->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "SetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");

    // AMOEBA-specific parameters -> cAmoebaSim
    status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
    RTERROR(status, "SetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}
// Read this file's __constant__ copies (cSim and cAmoebaSim) back into the
// host-side simulation descriptors. RTERROR is invoked after each copy with
// the returned status.
void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
{
    // cSim -> base simulation parameters
    cudaError_t status = cudaMemcpyFromSymbol(&amoebaGpu->gpuContext->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");

    // cAmoebaSim -> AMOEBA-specific parameters
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
    RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
#undef INCLUDE_MI_FIELD_BUFFERS
#define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#ifdef INCLUDE_MI_FIELD_BUFFERS
// Add atomJ's scratch accumulators (tempBuffer, then tempBufferP) component by
// component into atomI's. atomJ is only read.
__device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){
    for( int component = 0; component < 3; component++ ){
        atomI.tempBuffer[component] += atomJ.tempBuffer[component];
    }
    for( int component = 0; component < 3; component++ ){
        atomI.tempBufferP[component] += atomJ.tempBufferP[component];
    }
}
#endif
// Compute the pair geometry and the two scalar prefactors used for the
// mutual induced field of one atom pair.
//
// @param atomI, atomJ  the two particles (position, damp and thole are read)
// @param uscale        scale factor applied to the damping terms
// @param delta         output: x,y,z = displacement from atomI to atomJ after
//                      the periodic shift; w = rr3 - bn1 (0 beyond the cutoff)
// @param preFactor2    output: bn2 - rr5 (0 beyond the cutoff)
//
// delta->x/y/z are written even when the pair is beyond the cutoff.
// Nothing guards against r2 == 0; the divisions then produce inf/nan.
// NOTE(review): delta.w and preFactor2 match the inputs expected by
// calculateMutualInducedFieldPairIxn_kernel -- presumably they are used
// together; confirm against the caller.
__device__ void setupMutualInducedFieldPairIxn_kernel( const MutualInducedParticle& atomI, const MutualInducedParticle& atomJ,
const float uscale, float4* delta, float* preFactor2 ) {
// compute the real space portion of the Ewald summation
delta->x = atomJ.x - atomI.x;
delta->y = atomJ.y - atomI.y;
delta->z = atomJ.z - atomI.z;
// periodic boundary conditions
// (shift each component by a whole number of box lengths, rounding to nearest)
delta->x -= floorf(delta->x*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
delta->y -= floorf(delta->y*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
delta->z -= floorf(delta->z*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = (delta->x*delta->x) + (delta->y*delta->y) + (delta->z*delta->z);
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrtf(r2);
// calculate the error function damping terms
// bn0..bn2 are built by recurrence: each step multiplies alsq2n by alsq2 and divides by r2
float ralpha = cSim.alphaEwald*r;
float bn0 = erfcf(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = expf(-(ralpha*ralpha));
alsq2n *= alsq2;
float bn1 = (bn0+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
// compute the error function scaled and unscaled terms
// scale3/5 stay at 1 (no damping) unless both damp factors are nonzero
float scale3 = 1.0f;
float scale5 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float ratio = (r/damp);
ratio = ratio*ratio*ratio;
// use the smaller of the two thole values
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
// from here on 'damp' holds the (negative) exponent, not the product of damp factors
damp = -pgamma*ratio;
// exponents at or below -50 are treated as fully undamped (scales remain 1)
if( damp > -50.0f) {
float expdamp = expf(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - expdamp*(1.0f-damp);
}
}
float dsc3 = uscale*scale3;
float dsc5 = uscale*scale5;
float r3 = (r*r2);
float r5 = (r3*r2);
float rr3 = (1.0f-dsc3)/r3;
float rr5 = 3.0f*(1.0f-dsc5)/r5;
// the two prefactors handed back to the caller
delta->w = rr3 - bn1;
*preFactor2 = bn2 - rr5;
} else {
// pair is beyond the cutoff: both prefactors are zero
delta->w = *preFactor2 = 0.0f;
}
}
// Accumulate into fieldSum the vector
//     preFactor2 * (inducedDipole . delta.xyz) * delta.xyz  +  delta.w * inducedDipole
// fieldSum is added to, not overwritten.
__device__ void calculateMutualInducedFieldPairIxn_kernel( const float inducedDipole[3], const float4 delta, const float preFactor2, float fieldSum[3] ) {

    // projection of the dipole on the displacement, times preFactor2
    const float projected = preFactor2*(inducedDipole[0]*delta.x + inducedDipole[1]*delta.y + inducedDipole[2]*delta.z);

    fieldSum[0] += projected*delta.x + delta.w*inducedDipole[0];
    fieldSum[1] += projected*delta.y + delta.w*inducedDipole[1];
    fieldSum[2] += projected*delta.z + delta.w*inducedDipole[2];
}
// Store in fieldSum the vector
//     preFactor2 * (inducedDipole . delta.xyz) * delta.xyz  +  delta.w * inducedDipole
// Unlike calculateMutualInducedFieldPairIxn_kernel, fieldSum is overwritten.
__device__ void calculateMutualInducedFieldPairIxnNoAdd_kernel( const float inducedDipole[3], const float4 delta, const float preFactor2, float fieldSum[3] ) {

    // projection of the dipole on the displacement, times preFactor2
    const float projected = preFactor2*(inducedDipole[0]*delta.x + inducedDipole[1]*delta.y + inducedDipole[2]*delta.z);

    fieldSum[0] = projected*delta.x + delta.w*inducedDipole[0];
    fieldSum[1] = projected*delta.y + delta.w*inducedDipole[1];
    fieldSum[2] = projected*delta.z + delta.w*inducedDipole[2];
}
// Real-space (direct) Ewald contribution to the mutual induced field for one
// atom pair.
//
// @param atomI, atomJ  the two particles (position, damp, thole, inducedDipole
//                      and inducedDipolePolar are read)
// @param uscale        scale factor applied to the damping terms
// @param fields        output, one float4 per Cartesian component k = 0,1,2:
//                        fields[k].x  built from atomJ.inducedDipole
//                        fields[k].y  built from atomI.inducedDipole
//                        fields[k].z  built from atomJ.inducedDipolePolar
//                        fields[k].w  built from atomI.inducedDipolePolar
//                      All twelve values are set to 0 when the pair is beyond
//                      the cutoff.
//
// The output is always overwritten, never accumulated into.
// Nothing guards against r2 == 0; the divisions then produce inf/nan.
__device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
float uscale, float4 fields[3] ){
// compute the real space portion of the Ewald summation
// displacement from atomI to atomJ
float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y;
float zr = atomJ.z - atomI.z;
// periodic boundary conditions
// (shift each component by a whole number of box lengths, rounding to nearest)
xr -= floorf(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
yr -= floorf(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
zr -= floorf(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr* yr + zr*zr;
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrtf(r2);
// calculate the error function damping terms
// bn0..bn2 are built by recurrence: each step multiplies alsq2n by alsq2 and divides by r2
float ralpha = cSim.alphaEwald*r;
float bn0 = erfcf(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = expf(-(ralpha*ralpha));
alsq2n *= alsq2;
float bn1 = (bn0+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
// compute the error function scaled and unscaled terms
// scale3/5 stay at 1 (no damping) unless both damp factors are nonzero
float scale3 = 1.0f;
float scale5 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float ratio = (r/damp);
ratio = ratio*ratio*ratio;
// use the smaller of the two thole values
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
// from here on 'damp' holds the (negative) exponent, not the product of damp factors
damp = -pgamma*ratio;
// exponents at or below -50 are treated as fully undamped (scales remain 1)
if( damp > -50.0f) {
float expdamp = expf(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - expdamp*(1.0f-damp);
}
}
float dsc3 = uscale*scale3;
float dsc5 = uscale*scale5;
float r3 = (r*r2);
float r5 = (r3*r2);
float rr3 = (1.0f-dsc3)/r3;
float rr5 = 3.0f*(1.0f-dsc5)/r5;
// the same two prefactors are shared by all four outputs below
float preFactor1 = rr3 - bn1;
float preFactor2 = bn2 - rr5;
// .x: from atomJ.inducedDipole
float dukr = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float preFactor3 = preFactor2*dukr;
fields[0].x = preFactor3*xr + preFactor1*atomJ.inducedDipole[0];
fields[1].x = preFactor3*yr + preFactor1*atomJ.inducedDipole[1];
fields[2].x = preFactor3*zr + preFactor1*atomJ.inducedDipole[2];
// .y: from atomI.inducedDipole
float duir = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
preFactor3 = preFactor2*duir;
fields[0].y = preFactor3*xr + preFactor1*atomI.inducedDipole[0];
fields[1].y = preFactor3*yr + preFactor1*atomI.inducedDipole[1];
fields[2].y = preFactor3*zr + preFactor1*atomI.inducedDipole[2];
// .z: from atomJ.inducedDipolePolar
float pukr = atomJ.inducedDipolePolar[0]*xr + atomJ.inducedDipolePolar[1]*yr + atomJ.inducedDipolePolar[2]*zr;
preFactor3 = preFactor2*pukr;
fields[0].z = preFactor3*xr + preFactor1*atomJ.inducedDipolePolar[0];
fields[1].z = preFactor3*yr + preFactor1*atomJ.inducedDipolePolar[1];
fields[2].z = preFactor3*zr + preFactor1*atomJ.inducedDipolePolar[2];
// .w: from atomI.inducedDipolePolar
float puir = atomI.inducedDipolePolar[0]*xr + atomI.inducedDipolePolar[1]*yr + atomI.inducedDipolePolar[2]*zr;
preFactor3 = preFactor2*puir;
fields[0].w = preFactor3*xr + preFactor1*atomI.inducedDipolePolar[0];
fields[1].w = preFactor3*yr + preFactor1*atomI.inducedDipolePolar[1];
fields[2].w = preFactor3*zr + preFactor1*atomI.inducedDipolePolar[2];
} else {
// pair is beyond the cutoff: no contribution
fields[0].x = 0.0f;
fields[0].y = 0.0f;
fields[0].z = 0.0f;
fields[0].w = 0.0f;
fields[1].x = 0.0f;
fields[1].y = 0.0f;
fields[1].z = 0.0f;
fields[1].w = 0.0f;
fields[2].x = 0.0f;
fields[2].y = 0.0f;
fields[2].z = 0.0f;
fields[2].w = 0.0f;
}
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateAmoebaCudaPmeMutualInducedField.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateAmoebaCudaPmeMutualInducedField.h"
// Scale the first 3*cSim.atoms entries of both field arrays, in place, by the
// matching polarizability entry. Every thread strides over the entries by the
// total number of launched threads.
// Note: the numberOfAtoms argument is not used; the bound comes from cSim.atoms.
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kInitializeMutualInducedField_kernel(
                   int numberOfAtoms,
                   float* fixedEField,
                   float* fixedEFieldPolar,
                   float* polarizability )
{
    for( int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < 3*cSim.atoms; idx += blockDim.x*gridDim.x )
    {
        fixedEField[idx]      *= polarizability[idx];
        fixedEFieldPolar[idx] *= polarizability[idx];
    }
}
// Sum each of two input arrays over all entries, take the larger of the two
// sums and store
//     epsilon[0] = 48.033324 * sqrt( largerSum / (numberOfEntries/3) )
// (the constant is presumably a unit conversion -- TODO confirm).
//
// @param numberOfEntries  number of floats in each input array; note that
//                         numberOfEntries/3 is an integer division and is
//                         used as a divisor
// @param arrayOfDeltas1   first input array
// @param arrayOfDeltas2   second input array
// @param epsilon          output; only epsilon[0] is written
//
// Launch requirements:
//   - dynamic shared memory of blockDim.x * sizeof(float2)
//   - the reduction is performed per block and thread 0 of every block writes
//     epsilon[0]; each block covers the whole input (block-sized stride), so
//     the result is the same no matter how many blocks are launched. A single
//     block is sufficient.
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kReduceMutualInducedFieldDelta_kernel(int numberOfEntries, float* arrayOfDeltas1, float* arrayOfDeltas2, float* epsilon )
{
    extern __shared__ float2 delta[];

    delta[threadIdx.x].x = 0.0f;
    delta[threadIdx.x].y = 0.0f;

    // load deltas
    // The walk starts at threadIdx.x (not at a global thread index), so the
    // stride has to be the block size: striding by blockDim.x*gridDim.x would
    // skip entries whenever more than one block is launched.
    unsigned int pos = threadIdx.x;
    while( pos < numberOfEntries )
    {
        delta[threadIdx.x].x += arrayOfDeltas1[pos];
        delta[threadIdx.x].y += arrayOfDeltas2[pos];
        pos += blockDim.x;
    }
    __syncthreads();

    // sum the deltas
    // pairwise tree reduction in shared memory; the bounds check keeps it valid
    // for block sizes that are not a power of two
    for (int offset = 1; offset < blockDim.x; offset *= 2 )
    {
        if (threadIdx.x + offset < blockDim.x && (threadIdx.x & (2*offset-1)) == 0)
        {
            delta[threadIdx.x].x += delta[threadIdx.x+offset].x;
            delta[threadIdx.x].y += delta[threadIdx.x+offset].y;
        }
        __syncthreads();
    }

    // set epsilons
    if (threadIdx.x == 0)
    {
        epsilon[0] = delta[0].x > delta[0].y ? delta[0].x : delta[0].y;
        epsilon[0] = 48.033324f*sqrtf( epsilon[0]/( (float) (numberOfEntries/3)) );
    }
}
/**
 * Apply one successive-over-relaxation update to both sets of induced dipoles.
 *
 * For each of the 3*cSim.atoms entries:
 *   - the self term (4/3)*alphaEwald^3/sqrtPi times the current dipole is added
 *     to the incoming matrix product,
 *   - the trial dipole is fixedEField + polarizability*(matrix product + self term),
 *   - the stored dipole is moved toward the trial value by the factor polarSOR (0.55).
 *
 * @param polarizability   per-entry polarizability
 * @param inducedDipole    induced dipoles, updated in place
 * @param inducedDipoleP   second set of induced dipoles, updated in place
 * @param fixedEField      fixed-field contribution for inducedDipole
 * @param fixedEFieldP     fixed-field contribution for inducedDipoleP
 * @param matrixProduct    on input the matrix product for inducedDipole;
 *                         on output the squared change of each dipole entry (epsilon**2)
 * @param matrixProductP   same as matrixProduct, for inducedDipoleP
 */
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kSorUpdateMutualInducedField_kernel(
float* polarizability,
float* inducedDipole, float* inducedDipoleP,
float* fixedEField, float* fixedEFieldP,
float* matrixProduct, float* matrixProductP )
{
    const float term     = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
    const float polarSOR = 0.55f;

    // total number of threads in the launch
    const unsigned int stride = blockDim.x*gridDim.x;

    for( int index = blockIdx.x*blockDim.x + threadIdx.x; index < 3*cSim.atoms; index += stride )
    {
        const float oldDipole  = inducedDipole[index];
        const float oldDipoleP = inducedDipoleP[index];

        // matrix product plus self term
        const float field  = matrixProduct[index]  + term*oldDipole;
        const float fieldP = matrixProductP[index] + term*oldDipoleP;

        // unrelaxed dipoles
        const float trialDipole  = fixedEField[index]  + polarizability[index]*field;
        const float trialDipoleP = fixedEFieldP[index] + polarizability[index]*fieldP;

        // relaxed dipoles
        const float newDipole  = oldDipole  + polarSOR*( trialDipole  - oldDipole );
        const float newDipoleP = oldDipoleP + polarSOR*( trialDipoleP - oldDipoleP );

        inducedDipole[index]  = newDipole;
        inducedDipoleP[index] = newDipoleP;

        // squared change, summed later to get epsilon
        matrixProduct[index]  = ( newDipole  - oldDipole  )*( newDipole  - oldDipole  );
        matrixProductP[index] = ( newDipoleP - oldDipoleP )*( newDipoleP - oldDipoleP );
    }
}
/**
 * Reduce the two work arrays into the caller's output arrays.
 *
 * psWorkArray_3_1 is reduced into outputArray and psWorkArray_3_2 into
 * outputPolarArray, each with its own kReduceFields_kernel launch over
 * 3*paddedNumberOfAtoms entries.
 *
 * @param amoebaGpu         amoebaGpu context
 * @param outputArray       destination for the reduction of psWorkArray_3_1
 * @param outputPolarArray  destination for the reduction of psWorkArray_3_2
 */
static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
{
    gpuContext context = amoebaGpu->gpuContext;

    // psWorkArray_3_1 -> outputArray
    kReduceFields_kernel<<<context->sim.nonbond_blocks, context->sim.bsf_reduce_threads_per_block>>>(
                               context->sim.paddedNumberOfAtoms*3,
                               context->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_1->_pDevData,
                               outputArray->_pDevData,
                               0 );
    LAUNCHERROR("kReducePmeMI_Fields1");

    // psWorkArray_3_2 -> outputPolarArray
    kReduceFields_kernel<<<context->sim.nonbond_blocks, context->sim.bsf_reduce_threads_per_block>>>(
                               context->sim.paddedNumberOfAtoms*3,
                               context->sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_2->_pDevData,
                               outputPolarArray->_pDevData,
                               0 );
    LAUNCHERROR("kReducePmeMI_Fields2");
}
/**---------------------------------------------------------------------------------------

   Compute the mutual induced field: clear the work arrays, launch the cutoff
   kernel (per-warp output buffer variant when gpu->bOutputBufferPerWarp is set),
   then reduce the work arrays into the output arrays.

   The threads/block value is computed on the first call and cached in a
   function-local static for all later calls.

   @param amoebaGpu         amoebaGpu context
   @param outputArray       receives the reduced psWorkArray_3_1
   @param outputPolarArray  receives the reduced psWorkArray_3_2

   --------------------------------------------------------------------------------------- */
static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuContext amoebaGpu,
                                                                  CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
{
    static unsigned int threadsPerBlock = 0;

    gpuContext gpu = amoebaGpu->gpuContext;

    kClearFields_3( amoebaGpu, 2 );

    // first call only: choose threads/block, capped by compute capability
    if( threadsPerBlock == 0 ){
        const unsigned int maxThreads = (gpu->sm_version >= SM_20) ? 384 :
                                        (gpu->sm_version >= SM_12) ? 128 : 64;
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }

    // one MutualInducedParticle of dynamic shared memory per thread
    const size_t sharedBytes = sizeof(MutualInducedParticle)*threadsPerBlock;

    if( !gpu->bOutputBufferPerWarp ){
        kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sharedBytes>>>(
                                                                 gpu->sim.pInteractingWorkUnit,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_2->_pDevData );
    } else {
        kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sharedBytes>>>(
                                                                 gpu->sim.pInteractingWorkUnit,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_2->_pDevData );
    }
    LAUNCHERROR("kCalculateAmoebaPmeMutualInducedField");

    kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray );
}
/**---------------------------------------------------------------------------------------

   Compute the mutual induced field by successive over-relaxation (SOR).

   Steps:
     1. scale E_Field and E_FieldPolar by the polarizability and copy them into
        InducedDipole / InducedDipolePolar as the starting guess;
     2. if amoebaSim.polarizationType is non-zero (direct polarization), compute the
        induced dipole field once, flag the calculation as done and converged, and return;
     3. otherwise iterate: matrix multiply, induced dipole field, SOR update, epsilon
        reduction, until epsilon drops below mutualInducedTargetEpsilon or the
        iteration cap is passed.

   On exit mutualInducedDone, mutualInducedConverged and mutualInducedCurrentEpsilon
   are set on amoebaGpu. mutualInducedConverged is 1 only if the final epsilon is
   below the target.

   @param amoebaGpu        amoebaGpu context

   @throws OpenMM::OpenMMException if a device copy fails or the epsilon is a nan

   --------------------------------------------------------------------------------------- */
static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu )
{
    // ---------------------------------------------------------------------------------------

    gpuContext gpu = amoebaGpu->gpuContext;

    // ---------------------------------------------------------------------------------------

    // set [ E_Field & E_FieldPolar ] to [ E_Field & E_FieldPolar ]*Polarizability
    // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar ]*Polarizability

    kInitializeMutualInducedField_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block >>>(
         gpu->natoms,
         amoebaGpu->psE_Field->_pDevData,
         amoebaGpu->psE_FieldPolar->_pDevData,
         amoebaGpu->psPolarizability->_pDevData );
    LAUNCHERROR("AmoebaPmeMutualInducedFieldSetup");

    // a failed copy would leave the starting dipoles undefined, so report it
    // instead of ignoring the return code
    const size_t dipoleBytes = 3*gpu->sim.paddedNumberOfAtoms*sizeof( float );
    if( cudaMemcpy( amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psE_Field->_pDevData, dipoleBytes, cudaMemcpyDeviceToDevice ) != cudaSuccess ||
        cudaMemcpy( amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, dipoleBytes, cudaMemcpyDeviceToDevice ) != cudaSuccess ){
        throw OpenMM::OpenMMException("PME induced dipole calculation failed to initialize the induced dipoles." );
    }

    // if polarization type is direct, set flags signalling done and return

    if( amoebaGpu->amoebaSim.polarizationType )
    {
        amoebaGpu->mutualInducedDone      = 1;
        amoebaGpu->mutualInducedConverged = 1;
        kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
        return;
    }

    // ---------------------------------------------------------------------------------------

    int done      = 0;
    int converged = 0;
    int iteration = 1;

    while( !done ){

        // apply SOR

        cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0], amoebaGpu->psWorkVector[1] );
        kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );

        // post matrix multiply

        kSorUpdateMutualInducedField_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block >>>(
           amoebaGpu->psPolarizability->_pDevData,
           amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
           amoebaGpu->psE_Field->_pDevData,       amoebaGpu->psE_FieldPolar->_pDevData,
           amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData );
        LAUNCHERROR("kSorUpdatePmeMutualInducedField");

        // get total epsilon -- performing sums on gpu

        kReduceMutualInducedFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 2*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>(
           3*gpu->natoms, amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData,
           amoebaGpu->psCurrentEpsilon->_pDevData );
        LAUNCHERROR("kReducePmeMutualInducedFieldDelta");

        // Debye=48.033324f

        amoebaGpu->psCurrentEpsilon->Download();
        float currentEpsilon                   = amoebaGpu->psCurrentEpsilon->_pSysData[0];
        amoebaGpu->mutualInducedCurrentEpsilon = currentEpsilon;

        // throw exception if nan detected (a nan never compares equal to itself)

        if( amoebaGpu->mutualInducedCurrentEpsilon != amoebaGpu->mutualInducedCurrentEpsilon ){
            throw OpenMM::OpenMMException("PME induced dipole calculation detected nans." );
        }

        // convergence is decided by the epsilon test alone, so a solve that reaches
        // the target on the last permitted pass is still reported as converged

        converged = ( amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ) ? 1 : 0;

        // NOTE: the cap is tested with '>' after the pass has run, so up to
        // mutualInducedMaxIterations+1 passes are performed; left unchanged

        if( converged || iteration > amoebaGpu->mutualInducedMaxIterations ){
            done = 1;
        }

        iteration++;
    }

    amoebaGpu->mutualInducedDone      = done;
    amoebaGpu->mutualInducedConverged = converged;
}
/**
 * Entry point for the PME mutual induced field calculation.
 *
 * Runs the SOR solver when amoebaGpu->mutualInducedIterativeMethod is 0;
 * any other value results in no work being done.
 *
 * @param amoebaGpu  amoebaGpu context
 */
void cudaComputeAmoebaPmeMutualInducedField( amoebaGpuContext amoebaGpu )
{
    // only method 0 (SOR) is handled here
    if( amoebaGpu->mutualInducedIterativeMethod != 0 ){
        return;
    }

    cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpu );
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment