Deleted the old CUDA platform

2e451b9d · Peter Eastman · 352e2fc7 · 352e2fc7 · 352e2fc7 · 352e2fc7
Commit 2e451b9d authored Dec 13, 2012 by Peter Eastman
7 changed files
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostaticF1.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostaticF1.h
-
-static __device__ void SUB_METHOD_NAME( calculatePmeDirectElectrostaticPairIxnF1, _kernel)( PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
-                                                                                            float4 delta, float4 bn, float bn5, float forceFactor,
-#ifdef APPLY_SCALE
-                                                                                            const float* scalingFactors,
-#endif
-                                                                                            float force[3], float* energy ){
-    /* Permanent-multipole pair term: combines the charge (q), labFrameDipole and
-       labFrameQuadrupole of atomI and atomJ into one force vector and an energy.
-       delta.x/y/z are used as the separation components xr/yr/zr; delta.w is read
-       as rr1 only when APPLY_SCALE is defined (presumably 1/r - confirm with caller).
-       bn.x..bn.w and bn5 are the radial coefficients bn1..bn5 supplied by the caller.
-       With APPLY_SCALE, offset = 1 - scalingFactors[MScaleIndex] and offset-weighted
-       powers of rr1 are subtracted from every bn coefficient.
-       force[0..2] is overwritten (assigned, not accumulated); *energy is accumulated.
-       forceFactor multiplies only the energy contribution, never the force. */
-    float xr                 = delta.x;
-    float yr                 = delta.y;
-    float zr                 = delta.z;
-#ifdef APPLY_SCALE
-    float rr1                = delta.w;
-#endif
-
-    // load the permanent multipole values of both atoms (no induced dipoles are read in this routine)
-
-    float ci                 = atomI.q;
-
-    float di1                = atomI.labFrameDipole[0];
-    float di2                = atomI.labFrameDipole[1];
-    float di3                = atomI.labFrameDipole[2];
-
-    float qi1                = atomI.labFrameQuadrupole[0];
-    float qi2                = atomI.labFrameQuadrupole[1];
-    float qi3                = atomI.labFrameQuadrupole[2];
-    float qi5                = atomI.labFrameQuadrupole[3];
-    float qi6                = atomI.labFrameQuadrupole[4];
-    //float qi9                = atomI.labFrameQuadrupole[5];
-    float qi9                   = -(atomI.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[3]); // rebuilt from entries [0] and [3] instead of reading [5]; presumably relies on a traceless quadrupole - TODO confirm
-
-    float ck                 = atomJ.q;
-    float dk1                = atomJ.labFrameDipole[0];
-    float dk2                = atomJ.labFrameDipole[1];
-    float dk3                = atomJ.labFrameDipole[2];
-
-    float qk1                = atomJ.labFrameQuadrupole[0];
-    float qk2                = atomJ.labFrameQuadrupole[1];
-    float qk3                = atomJ.labFrameQuadrupole[2];
-    float qk5                = atomJ.labFrameQuadrupole[3];
-    float qk6                = atomJ.labFrameQuadrupole[4];
-//    float qk9                = atomJ.labFrameQuadrupole[5];
-    float qk9                = -(atomJ.labFrameQuadrupole[0] + atomJ.labFrameQuadrupole[3]); // same reconstruction as qi9
-
-    float bn1                = bn.x;
-    float bn2                = bn.y;
-    float bn3                = bn.z;
-    float bn4                = bn.w;
-
-#ifdef APPLY_SCALE
-    float offset             = 1.0f-scalingFactors[MScaleIndex]; // weight of the rr1-power terms subtracted from each bn coefficient below
-    float rr3                = rr1*rr1*rr1;
-    float gf4                = 2.0f*(bn2 - 3.0f*offset*rr3*rr1*rr1); // rr3*rr1*rr1 == rr1^5
-#else
-    float gf4                = 2.0f*bn2;
-#endif
-    float qidk1              = qi1*dk1 + qi2*dk2 + qi3*dk3; // qi* are used as the symmetric matrix [[qi1,qi2,qi3],[qi2,qi5,qi6],[qi3,qi6,qi9]]; this is row 1 times dk
-    float qkdi1              = qk1*di1 + qk2*di2 + qk3*di3;
-    float ftm21              = gf4*(qkdi1-qidk1); // ftm21/22/23 accumulate the three force components
-
-    float qidk2              = qi2*dk1 + qi5*dk2 + qi6*dk3;
-    float qkdi2              = qk2*di1 + qk5*di2 + qk6*di3;
-    float ftm22              = gf4*(qkdi2-qidk2);
-
-    float qidk3              = qi3*dk1 + qi6*dk2 + qi9*dk3;
-    float qkdi3              = qk3*di1 + qk6*di2 + qk9*di3;
-    float ftm23              = gf4*(qkdi3-qidk3);
-
-    float qir1               = qi1*xr + qi2*yr + qi3*zr; // quadrupole matrix of atomI applied to (xr,yr,zr)
-    float qir2               = qi2*xr + qi5*yr + qi6*zr;
-    float qir3               = qi3*xr + qi6*yr + qi9*zr;
-
-    float qkr1               = qk1*xr + qk2*yr + qk3*zr; // quadrupole matrix of atomJ applied to (xr,yr,zr)
-    float qkr2               = qk2*xr + qk5*yr + qk6*zr;
-    float qkr3               = qk3*xr + qk6*yr + qk9*zr;
-
-#ifdef APPLY_SCALE
-    float gf7                = 4.0f*(bn3 - 15.0f*offset*rr3*rr3*rr1); // rr3*rr3*rr1 == rr1^7
-#else
-    float gf7                = 4.0f*bn3;
-#endif
-    float qiqkr1             = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
-    float qkqir1             = qk1*qir1 + qk2*qir2 + qk3*qir3;
-    ftm21                   += gf7*(qiqkr1+qkqir1);
-
-    float qiqkr2             = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
-    float qkqir2             = qk2*qir1 + qk5*qir2 + qk6*qir3;
-    ftm22                   += gf7*(qiqkr2+qkqir2);
-
-    float qiqkr3             = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;
-    float qkqir3             = qk3*qir1 + qk6*qir2 + qk9*qir3;
-    ftm23                   += gf7*(qiqkr3+qkqir3);
-
-    // scalar products of the permanent components (gl0..gl8 feed both the energy and gf1)
-
-    float gl6                = di1*dk1   + di2*dk2   + di3*dk3;
-    float gl7                =  2.0f*( qir1*dk1  + qir2*dk2  + qir3*dk3 - ( qkr1*di1  + qkr2*di2  + qkr3*di3 ) );
-    float gl5                = -4.0f*(qir1*qkr1 + qir2*qkr2 + qir3*qkr3);
-
-    float gl8                =  2.0f*(qi1*qk1 + qi2*qk2 + qi3*qk3 + qi2*qk2 + qi5*qk5 + qi6*qk6 + qi3*qk3 + qi6*qk6 + qi9*qk9 ); // element-wise product summed over all nine matrix entries, so off-diagonal products appear twice
-
-    float sc3                = di1*xr  + di2*yr  + di3*zr;
-    float sc5                = qir1*xr + qir2*yr + qir3*zr;
-    float sc4                = dk1*xr  + dk2*yr  + dk3*zr;
-    float sc6                = qkr1*xr + qkr2*yr + qkr3*zr;
-
-    float gl0                = ci*ck;
-    float gl1                = ck*sc3 - ci*sc4;
-    float gl2                = ci*sc6 + ck*sc5 - sc3*sc4;
-    float gl3                = sc3*sc6 - sc4*sc5;
-    float gl4                = sc5*sc6;
-
-#ifdef APPLY_SCALE
-    //forceTorqueEnergy->w    += forceFactor*(-offset*rr1*gl0 + (bn1-offset*rr3)*(gl1+gl6) + (bn2-offset*(3.0f*rr3*rr1*rr1))*(gl2+gl7+gl8) + (bn3-offset*(15.0f*rr3*rr3*rr1))*(gl3+gl5) + (bn4-offset*(105.0f*rr3*rr3*rr3))*gl4);
-    *energy                 += forceFactor*(-offset*rr1*gl0 + (bn1-offset*rr3)*(gl1+gl6) + (bn2-offset*(3.0f*rr3*rr1*rr1))*(gl2+gl7+gl8) + (bn3-offset*(15.0f*rr3*rr3*rr1))*(gl3+gl5) + (bn4-offset*(105.0f*rr3*rr3*rr3))*gl4); // gl0 carries only the -offset*rr1 term here; no bn coefficient multiplies gl0 in the energy
-#else
-    //forceTorqueEnergy->w    += bn1*(gl1+gl6) + bn2*(gl2+gl7+gl8) + bn3*(gl3+gl5) + bn4*gl4;
-    *energy                 += forceFactor*(bn1*(gl1+gl6) + bn2*(gl2+gl7+gl8) + bn3*(gl3+gl5) + bn4*gl4); // gl0 (ci*ck) does not enter the energy in this branch
-    
-#endif
-
-    float gf1                = bn1*gl0 + bn2*(gl1+gl6) + bn3*(gl2+gl7+gl8) + bn4*(gl3+gl5) + bn5*gl4; // coefficient of the (xr,yr,zr) direction; the only place bn5 is used
-#ifdef APPLY_SCALE
-          gf1               -= offset*(rr3*gl0 + (3.0f*rr3*rr1*rr1)*(gl1+gl6) + (15.0f*rr3*rr3*rr1)*(gl2+gl7+gl8) + (105.0f*rr3*rr3*rr3)*(gl3+gl5) + (945.0f*rr3*rr3*rr3*rr1*rr1)*gl4);
-#endif
-    ftm21                   += gf1*xr;
-    ftm22                   += gf1*yr;
-    ftm23                   += gf1*zr;
-
-#ifdef APPLY_SCALE
-    float gf2                = -ck*bn1 + sc4*bn2 - sc6*bn3 - offset*(-ck*rr3 + sc4*(3.0f*rr3*rr1*rr1) - sc6*(15.0f*rr3*rr3*rr1));
-#else
-    float gf2                = -ck*bn1 + sc4*bn2 - sc6*bn3;
-#endif
-    ftm21                   += gf2*di1; // gf2 weights atomI's dipole
-    ftm22                   += gf2*di2;
-    ftm23                   += gf2*di3;
-
-#ifdef APPLY_SCALE
-    float gf5                = 2.0f*(-ck*bn2+sc4*bn3-sc6*bn4 - offset*(-ck*(3.0f*rr3*rr1*rr1)+sc4*(15.0f*rr3*rr3*rr1)-sc6*(105.0f*rr3*rr3*rr3)));
-#else
-    float gf5                = 2.0f*(-ck*bn2+sc4*bn3-sc6*bn4);
-#endif
-    ftm21                   += gf5*qir1; // gf5 weights atomI's quadrupole applied to the separation
-    ftm22                   += gf5*qir2;
-    ftm23                   += gf5*qir3;
-
-#ifdef APPLY_SCALE
-    float gf3                = ci*bn1 + sc3*bn2 + sc5*bn3 - offset*(ci*rr3 + sc3*(3.0f*rr3*rr1*rr1) + sc5*(15.0f*rr3*rr3*rr1));
-#else
-    float gf3                = ci*bn1 + sc3*bn2 + sc5*bn3;
-#endif
-    ftm21                   += gf3*dk1; // gf3 weights atomJ's dipole
-    ftm22                   += gf3*dk2;
-    ftm23                   += gf3*dk3;
-
-#ifdef APPLY_SCALE
-    float gf6                = 2.0f*(-ci*bn2-sc3*bn3-sc5*bn4 - offset*(-ci*(3.0f*rr3*rr1*rr1)-sc3*(15.0f*rr3*rr3*rr1)-sc5*(105.0f*rr3*rr3*rr3)));
-#else
-    float gf6                = 2.0f*(-ci*bn2-sc3*bn3-sc5*bn4);
-#endif
-
-    ftm21                   += gf6*qkr1; // gf6 weights atomJ's quadrupole applied to the separation
-    ftm22                   += gf6*qkr2;
-    ftm23                   += gf6*qkr3;
-
-    force[0]                 = ftm21; // plain assignment: any previous content of force[] is discarded
-    force[1]                 = ftm22;
-    force[2]                 = ftm23;
-/* disabled: earlier variant that accumulated straight into the particle force fields
-    if( forceFactor == 1.0f ){
-        atomJ.force[0]      -= ftm21;
-        atomJ.force[1]      -= ftm22;
-        atomJ.force[2]      -= ftm23;
-    }
-    atomI.force[0]      += ftm21;
-    atomI.force[1]      += ftm22;
-    atomI.force[2]      += ftm23;
-*/
-    return;
-
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostaticF2P.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostaticF2P.h
-
-static __device__ void SUB_METHOD_NAME( calculatePmeDirectElectrostaticPairIxnF2, _kernel )( 
-                                        PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
-                                        float4 delta, float4 bn, float forceFactor,
-#ifdef APPLY_SCALE
-                                        const float* scalingFactors,
-#endif
-                                        float force[3], float* energy ){
-    /* Induced-dipole pair term: couples inducedDipole / inducedDipoleP of one atom
-       with the permanent charge, dipole and quadrupole of the other, first for
-       atomJ's induced dipoles against atomI's multipoles, then (section K) the
-       reverse. When cAmoebaSim.polarizationType == 0, extra terms built from
-       products of both atoms' induced dipoles are added as well.
-       delta.x/y/z are used as xr/yr/zr and delta.w as rr1 (presumably 1/r - confirm
-       with caller). bn.x..bn.w are read as bn1..bn4.
-       With APPLY_SCALE, scalingFactors is indexed by PScaleIndex, DScaleIndex and
-       UScaleIndex to form separate p/d/u variants of the damped radial factors.
-       force[0..2] is ACCUMULATED into (+=), so it must hold valid values on entry
-       (presumably the result of the F1 routine - confirm against the caller).
-       *energy is accumulated, each contribution multiplied by forceFactor;
-       forceFactor does not scale the force. */
-    float xr                    = delta.x;
-    float yr                    = delta.y;
-    float zr                    = delta.z;
-    float rr1                   = delta.w;
-
-    // load atomI's permanent multipole values (atomJ's are loaded in section K below)
-
-    float ci                    = atomI.q;
-
-    float di1                   = atomI.labFrameDipole[0];
-    float di2                   = atomI.labFrameDipole[1];
-    float di3                   = atomI.labFrameDipole[2];
-
-    float qi1                   = atomI.labFrameQuadrupole[0];
-    float qi2                   = atomI.labFrameQuadrupole[1];
-    float qi3                   = atomI.labFrameQuadrupole[2];
-    float qi5                   = atomI.labFrameQuadrupole[3];
-    float qi6                   = atomI.labFrameQuadrupole[4];
-//    float qi9                   = atomI.labFrameQuadrupole[5];
-    float qi9                   = -(atomI.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[3]); // rebuilt from entries [0] and [3] instead of reading [5]; presumably relies on a traceless quadrupole - TODO confirm
-
-    float bn1                   = bn.x;
-    float bn2                   = bn.y;
-    float bn3                   = bn.z;
-    float bn4                   = bn.w;
-
-    float damp                  = atomI.damp*atomJ.damp; // starts as the product of the per-atom damp fields; repurposed below as the exponent passed to expf
-    if( damp != 0.0f ){
-        float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole; // smaller of the two thole values
-        float ratio  = 1.0f/(rr1*damp);
-        damp         = -pgamma*ratio*ratio*ratio;
-        damp         = damp < -50.0f ? 0.0f : damp; // exponents below -50 are reset to 0, which selects the undamped scale factors (1.0f) below
-    }
-
-    float scale5                = (damp == 0.0f) ? 1.0f : (1.0f - (1.0f-damp)*expf(damp)); // NOTE(review): expf(damp) is evaluated again for expdamp further down
-    float rr5                   = rr1*rr1;
-          rr5                   = 3.0f*rr1*rr5*rr5; // rr5 == 3*rr1^5
-#ifdef APPLY_SCALE
-    float psc5                  = rr5*(1.0f - scale5*scalingFactors[PScaleIndex]);
-    float dsc5                  = rr5*(1.0f - scale5*scalingFactors[DScaleIndex]);
-    float usc5                  = rr5*(1.0f - scale5*scalingFactors[UScaleIndex]);
-#else
-    float psc5                  = rr5*(1.0f - scale5); // without APPLY_SCALE only the p variant exists and is used for every term
-#endif
-
-    float qiuk1                 = qi1*atomJ.inducedDipole[0]  + qi2*atomJ.inducedDipole[1]  + qi3*atomJ.inducedDipole[2]; // atomI quadrupole matrix (symmetric, rows qi1/qi2/qi3, qi2/qi5/qi6, qi3/qi6/qi9) times atomJ induced dipole
-    float qiukp1                = qi1*atomJ.inducedDipoleP[0] + qi2*atomJ.inducedDipoleP[1] + qi3*atomJ.inducedDipoleP[2];
-    float ftm21                 = -bn2*(qiuk1+qiukp1); // ftm21/22/23 accumulate the three force components of this routine
-#ifdef APPLY_SCALE
-          ftm21                += qiuk1*psc5 + qiukp1*dsc5;
-#else
-          ftm21                += (qiuk1 + qiukp1)*psc5;
-#endif
-
-    float qiuk2                 = qi2*atomJ.inducedDipole[0]  + qi5*atomJ.inducedDipole[1]  + qi6*atomJ.inducedDipole[2];
-    float qiukp2                = qi2*atomJ.inducedDipoleP[0] + qi5*atomJ.inducedDipoleP[1] + qi6*atomJ.inducedDipoleP[2];
-    float ftm22                 = -bn2*(qiuk2+qiukp2);
-#ifdef APPLY_SCALE
-          ftm22                += ((qiuk2)*psc5 + (qiukp2)*dsc5);
-#else
-          ftm22                += (qiuk2 + qiukp2)*psc5;
-#endif
-
-    float qiuk3                 = qi3*atomJ.inducedDipole[0]  + qi6*atomJ.inducedDipole[1]  + qi9*atomJ.inducedDipole[2];
-    float qiukp3                = qi3*atomJ.inducedDipoleP[0] + qi6*atomJ.inducedDipoleP[1] + qi9*atomJ.inducedDipoleP[2];
-    float ftm23                 = -bn2*(qiuk3+qiukp3);
-#ifdef APPLY_SCALE
-          ftm23                += ((qiuk3)*psc5 + (qiukp3)*dsc5);
-#else
-          ftm23                += (qiuk3 + qiukp3)*psc5;
-#endif
-
-    float expdamp               = expf(damp);
-    float scale3                = (damp == 0.0f) ? 1.0f : (1.0f - expdamp);
-    float rr3                   = rr1*rr1*rr1;
-
-#ifdef APPLY_SCALE
-    float psc3                  = rr3*(1.0f - scale3*scalingFactors[PScaleIndex]);
-    float dsc3                  = rr3*(1.0f - scale3*scalingFactors[DScaleIndex]);
-    float usc3                  = rr3*(1.0f - scale3*scalingFactors[UScaleIndex]);
-#else
-    float psc3                  = rr3*(1.0f - scale3);
-#endif
-
-    float scale7                = (damp == 0.0f) ? 1.0f : (1.0f - (1.0f-damp+0.6f*damp*damp)*expdamp);
-
-#ifdef APPLY_SCALE
-    float psc7                  = (15.0f*rr3*rr3*rr1)*(1.0f - scale7*scalingFactors[PScaleIndex]); // 15*rr1^7 prefactor; no u variant is formed for the 7th-order factor
-    float dsc7                  = (15.0f*rr3*rr3*rr1)*(1.0f - scale7*scalingFactors[DScaleIndex]);
-#else
-    float psc7                  = (15.0f*rr3*rr3*rr1)*(1.0f - scale7);
-#endif
-
-    float qir1                  = qi1*xr + qi2*yr + qi3*zr;
-    float qir2                  = qi2*xr + qi5*yr + qi6*zr;
-    float qir3                  = qi3*xr + qi6*yr + qi9*zr;
-
-    float sc3                   = di1*xr  + di2*yr  + di3*zr;
-    float sc5                   = qir1*xr + qir2*yr + qir3*zr;
-    float gfi3                  = ci*bn1  + sc3*bn2 + sc5*bn3;
-
-    float prefactor1; // scratch coefficient, reassigned before each group of three force updates
-    prefactor1                  = 0.5f*(ci*psc3 + sc3*psc5 + sc5*psc7 - gfi3);
-    ftm21                      -= prefactor1*atomJ.inducedDipole[0];
-    ftm22                      -= prefactor1*atomJ.inducedDipole[1];
-    ftm23                      -= prefactor1*atomJ.inducedDipole[2];
-
-#ifdef APPLY_SCALE
-    prefactor1                  = 0.5f*(ci*dsc3 + sc3*dsc5 + sc5*dsc7 - gfi3);
-#endif
-    ftm21                      -= prefactor1*atomJ.inducedDipoleP[0]; // without APPLY_SCALE this reuses the p-variant prefactor1 computed above
-    ftm22                      -= prefactor1*atomJ.inducedDipoleP[1];
-    ftm23                      -= prefactor1*atomJ.inducedDipoleP[2];
-
-    float sci4                  = atomJ.inducedDipole[0]*xr  + atomJ.inducedDipole[1]*yr  + atomJ.inducedDipole[2]*zr;
-    //forceTorqueEnergy->w       += 0.5f*((psc3-bn1)*(ci*sci4) + (psc5-bn2)*(sc3*sci4) + (psc7-bn3)*(sci4*sc5));
-    *energy                    += forceFactor*0.5f*sci4*((psc3-bn1)*ci + (psc5-bn2)*sc3 + (psc7-bn3)*sc5); // energy uses only inducedDipole (never inducedDipoleP) and only the p-variant factors
-
-    float scip4                 = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
-    if( cAmoebaSim.polarizationType == 0 ){
-
-#ifdef APPLY_SCALE
-        prefactor1              = 0.5f*( bn2 - usc5 );
-#else
-        prefactor1              = 0.5f*( bn2 - psc5 );
-#endif
-        ftm21                  += prefactor1*( (sci4*atomI.inducedDipoleP[0] + scip4*atomI.inducedDipole[0]) ); // induced-induced cross term, only for polarizationType 0
-        ftm22                  += prefactor1*( (sci4*atomI.inducedDipoleP[1] + scip4*atomI.inducedDipole[1]) );
-        ftm23                  += prefactor1*( (sci4*atomI.inducedDipoleP[2] + scip4*atomI.inducedDipole[2]) );
-   }
-
-#ifdef APPLY_SCALE
-    prefactor1                  = 0.5f*( bn2*(sci4+scip4) - (sci4*psc5+scip4*dsc5) ); 
-#else
-    sci4                       += scip4; // from here on (non-APPLY_SCALE build) sci4 holds the sum sci4+scip4
-    prefactor1                  = 0.5f*sci4*( bn2 - psc5 ); 
-#endif
-
-    ftm21                      += prefactor1*di1;
-    ftm22                      += prefactor1*di2;
-    ftm23                      += prefactor1*di3;
-
-#ifdef APPLY_SCALE
-    float gfi5                  = bn3*(sci4+scip4) - (sci4*psc7+scip4*dsc7);
-#else
-    float gfi5                  = sci4*(bn3 - psc7);
-#endif
-    ftm21                      += gfi5*qir1;
-    ftm22                      += gfi5*qir2;
-    ftm23                      += gfi5*qir3;
-
-    float sci7                  = qir1*atomJ.inducedDipole[0]  + qir2*atomJ.inducedDipole[1]  + qir3*atomJ.inducedDipole[2];
-    //forceTorqueEnergy->w       += (bn2-psc5)*sci7;
-    *energy                    += forceFactor*(bn2-psc5)*sci7;
-    float scip7                 = qir1*atomJ.inducedDipoleP[0] + qir2*atomJ.inducedDipoleP[1] + qir3*atomJ.inducedDipoleP[2];
-
-#ifdef APPLY_SCALE
-    float gli1                  = -ci*sci4;
-    float gli2                  = -sc3*sci4 + 2.0f*sci7;
-    float gli3                  = -sci4*sc5;
-    float glip1                 = -ci*scip4;
-    float glip2                 = -sc3*scip4 + 2.0f*scip7;
-    float glip3                 = -scip4*sc5;
-#else
-    float gli1                  = -ci*sci4; // sci4 already includes scip4 here, so gli* cover both dipole sets and no glip* variables are declared
-    float gli2                  = -sc3*sci4 + 2.0f*(sci7 + scip7);
-    float gli3                  = -sci4*sc5;
-#endif
-
-#ifdef APPLY_SCALE
-    float gfi1                  = (bn2*(gli1+glip1) + bn3*(gli2+glip2) + bn4*(gli3+glip3));
-    gfi1                       -= (rr1*rr1)*( 3.0f*(gli1*psc3 + glip1*dsc3) + 5.0f*(gli2*psc5 + glip2*dsc5 ) + 7.0f*(gli3*psc7+glip3*dsc7) );
-#else
-    float gfi1                  = bn2*gli1 + bn3*gli2 + bn4*gli3;
-    gfi1                       -= (rr1*rr1)*( 3.0f*gli1*psc3 + 5.0f*gli2*psc5 + 7.0f*gli3*psc7);
-#endif
-    gfi1                       *= 0.5f;
-    ftm21                      += gfi1*xr; // gfi1 is the coefficient of the (xr,yr,zr) direction for the first half
-    ftm22                      += gfi1*yr;
-    ftm23                      += gfi1*zr;
-
-    if( damp != 0.0f ){ // extra force terms that exist only when damping is active
-
-        float expdamp = expf(damp); // NOTE(review): shadows the outer expdamp, which already holds the same value
-        float temp3   = -1.5f*damp*expdamp*rr1*rr1;
-        float temp5   = -damp;
-        float temp7   = -0.2f - 0.6f*damp;
-
-        float ddsc31  = temp3*xr;
-        float ddsc32  = temp3*yr;
-        float ddsc33  = temp3*zr;
-
-        float ddsc51  = temp5*ddsc31; // each higher-order ddsc vector is the previous one times a scalar
-        float ddsc52  = temp5*ddsc32;
-        float ddsc53  = temp5*ddsc33;
-
-        float ddsc71  = temp7*ddsc51;
-        float ddsc72  = temp7*ddsc52;
-        float ddsc73  = temp7*ddsc53;
-
-        float rr3     = rr1*rr1*rr1; // NOTE(review): shadows the outer rr3 with an identical value
-#ifdef APPLY_SCALE
-        temp3         = (gli1*scalingFactors[PScaleIndex] + glip1*scalingFactors[DScaleIndex]); // temp3/5/7 are reused here as the weights of the ddsc vectors
-        temp5         = (3.0f*rr1*rr1)*(gli2*scalingFactors[PScaleIndex] + glip2*scalingFactors[DScaleIndex]);
-        temp7         = (15.0f*rr3*rr1)*(gli3*scalingFactors[PScaleIndex] + glip3*scalingFactors[DScaleIndex]);
-#else
-        temp3         = gli1;
-        temp5         = (3.0f*rr1*rr1)*gli2;
-        temp7         = (15.0f*rr3*rr1)*gli3;
-#endif
-        ftm21        -= rr3*(temp3*ddsc31 + temp5*ddsc51 + temp7*ddsc71);
-        ftm22        -= rr3*(temp3*ddsc32 + temp5*ddsc52 + temp7*ddsc72);
-        ftm23        -= rr3*(temp3*ddsc33 + temp5*ddsc53 + temp7*ddsc73);
-    }
-
-//K: second half - atomJ's permanent multipoles (ck, dk*, qk*) against atomI's induced dipoles
-    float qk1                   = atomJ.labFrameQuadrupole[0];
-    float qk2                   = atomJ.labFrameQuadrupole[1];
-    float qk3                   = atomJ.labFrameQuadrupole[2];
-    float qk5                   = atomJ.labFrameQuadrupole[3];
-    float qk6                   = atomJ.labFrameQuadrupole[4];
-    //float qk9                   = atomJ.labFrameQuadrupole[5];
-    float qk9                   = -(qk1 + qk5); // same reconstruction as qi9
-
-    float qkui1                 = qk1*atomI.inducedDipole[0]  + qk2*atomI.inducedDipole[1]  + qk3*atomI.inducedDipole[2];
-    float qkuip1                = qk1*atomI.inducedDipoleP[0] + qk2*atomI.inducedDipoleP[1] + qk3*atomI.inducedDipoleP[2];
-          ftm21                += bn2*(qkui1+qkuip1); // opposite signs to the matching qiuk terms of the first half
-#ifdef APPLY_SCALE
-          ftm21                -= (qkui1*psc5 + qkuip1*dsc5);
-#else
-          ftm21                -= (qkui1 + qkuip1)*psc5;
-#endif
-
-    float qkui2                 = qk2*atomI.inducedDipole[0]  + qk5*atomI.inducedDipole[1]  + qk6*atomI.inducedDipole[2];
-    float qkuip2                = qk2*atomI.inducedDipoleP[0] + qk5*atomI.inducedDipoleP[1] + qk6*atomI.inducedDipoleP[2];
-          ftm22                += bn2*(qkui2+qkuip2);
-#ifdef APPLY_SCALE
-          ftm22                -= ((qkui2)*psc5 + (qkuip2)*dsc5);
-#else
-          ftm22                -= (qkui2 + qkuip2)*psc5;
-#endif
-
-    float qkui3                 = qk3*atomI.inducedDipole[0]  + qk6*atomI.inducedDipole[1]  + qk9*atomI.inducedDipole[2];
-    float qkuip3                = qk3*atomI.inducedDipoleP[0] + qk6*atomI.inducedDipoleP[1] + qk9*atomI.inducedDipoleP[2];
-          ftm23                += bn2*(qkui3+qkuip3);
-#ifdef APPLY_SCALE
-          ftm23                -= ((qkui3)*psc5 + (qkuip3)*dsc5);
-#else
-          ftm23                -= (qkui3 + qkuip3)*psc5;
-#endif
-
-
-    float qkr1                  = qk1*xr + qk2*yr + qk3*zr;
-    float qkr2                  = qk2*xr + qk5*yr + qk6*zr;
-    float qkr3                  = qk3*xr + qk6*yr + qk9*zr;
-
-    float dk1                   = atomJ.labFrameDipole[0];
-    float dk2                   = atomJ.labFrameDipole[1];
-    float dk3                   = atomJ.labFrameDipole[2];
-
-    float sc4                   =  dk1*xr  +  dk2*yr +  dk3*zr;
-    float sc6                   = qkr1*xr  + qkr2*yr + qkr3*zr;
-
-    float ck                    = atomJ.q;
-    float gfi2                  = (-ck*bn1 + sc4*bn2 - sc6*bn3);
-
-    prefactor1                  = 0.5f*(ck*psc3 - sc4*psc5 + sc6*psc7 + gfi2);
-    ftm21                      += prefactor1*atomI.inducedDipole[0];
-    ftm22                      += prefactor1*atomI.inducedDipole[1];
-    ftm23                      += prefactor1*atomI.inducedDipole[2];
-
-#ifdef APPLY_SCALE
-    prefactor1                  = 0.5f*(ck*dsc3 - sc4*dsc5 + sc6*dsc7 + gfi2);
-#endif
-    ftm21                      += prefactor1*atomI.inducedDipoleP[0]; // without APPLY_SCALE this reuses the p-variant prefactor1 computed above
-    ftm22                      += prefactor1*atomI.inducedDipoleP[1];
-    ftm23                      += prefactor1*atomI.inducedDipoleP[2];
-
-    float sci3                  = atomI.inducedDipole[0]*xr  + atomI.inducedDipole[1]*yr  + atomI.inducedDipole[2]*zr;
-    *energy                    += forceFactor*0.5f*sci3*( ck*(bn1-psc3) - sc4*(bn2-psc5) + sc6*(bn3-psc7) );
-    float scip3                 = atomI.inducedDipoleP[0]*xr + atomI.inducedDipoleP[1]*yr + atomI.inducedDipoleP[2]*zr;
-
-    if( cAmoebaSim.polarizationType == 0 ){
-#ifdef APPLY_SCALE
-        prefactor1              = 0.5f*( bn2 - usc5 );
-#else
-        prefactor1              = 0.5f*( bn2 - psc5 );
-#endif
-
-        ftm21                  += prefactor1*( sci3*atomJ.inducedDipoleP[0] + scip3*atomJ.inducedDipole[0] );
-        ftm22                  += prefactor1*( sci3*atomJ.inducedDipoleP[1] + scip3*atomJ.inducedDipole[1] );
-        ftm23                  += prefactor1*( sci3*atomJ.inducedDipoleP[2] + scip3*atomJ.inducedDipole[2] );
-    }
-
-    float sci34; // assigned only when polarizationType == 0; its only reads are inside polarizationType == 0 branches
-    if( cAmoebaSim.polarizationType == 0 ){
-        float sci4              = atomJ.inducedDipole[0]*xr  + atomJ.inducedDipole[1]*yr  + atomJ.inducedDipole[2]*zr; // recomputed in a local that shadows the outer sci4, which may have had scip4 added to it above
-        float scip4             = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
-        sci34                   = (sci3*scip4+scip3*sci4);
-   
-#ifdef APPLY_SCALE
-        gfi1                    = sci34*(usc5*(5.0f*rr1*rr1) -bn3 ); // gfi1 is restarted here for the second half (the first-half value was already applied)
-#else
-        gfi1                    = sci34*(psc5*(5.0f*rr1*rr1) -bn3 );
-#endif
-
-    } else {
-        gfi1                    = 0.0f; // no induced-induced contribution; gfi1 restarts from zero
-    }
-
-#ifdef APPLY_SCALE
-    prefactor1                  = 0.5f*( bn2*(sci3+scip3) - (sci3*psc5+scip3*dsc5) );
-#else
-    sci3                       += scip3; // from here on (non-APPLY_SCALE build) sci3 holds the sum sci3+scip3
-    prefactor1                  = 0.5f*sci3*( bn2 - psc5 );
-#endif
-    ftm21                      += prefactor1*dk1;
-    ftm22                      += prefactor1*dk2;
-    ftm23                      += prefactor1*dk3;
-
-#ifdef APPLY_SCALE
-    float gfi6                  = -bn3*(sci3+scip3) + (sci3*psc7+scip3*dsc7);
-#else
-    float gfi6                  = sci3*( psc7 - bn3);
-#endif
-    ftm21                      += gfi6*qkr1;
-    ftm22                      += gfi6*qkr2;
-    ftm23                      += gfi6*qkr3;
-
-    float sci1                  = atomI.inducedDipole[0]*dk1 + atomI.inducedDipole[1]*dk2 + atomI.inducedDipole[2]*dk3 + di1*atomJ.inducedDipole[0] + di2*atomJ.inducedDipole[1] + di3*atomJ.inducedDipole[2];
-    //forceTorqueEnergy->w       += 0.5f*( sci1*(bn1-psc3) );
-    *energy                    += forceFactor*0.5f*( sci1*(bn1-psc3) );
-
-    float sci8                  = qkr1*atomI.inducedDipole[0] + qkr2*atomI.inducedDipole[1] + qkr3*atomI.inducedDipole[2];
-    //forceTorqueEnergy->w       += sci8*(bn2-psc5);
-    *energy                    -= forceFactor*sci8*(bn2-psc5); // subtracted, unlike the matching sci7 energy term of the first half
-    float scip1                 = atomI.inducedDipoleP[0]*dk1 + atomI.inducedDipoleP[1]*dk2 + atomI.inducedDipoleP[2]*dk3 + di1*atomJ.inducedDipoleP[0] + di2*atomJ.inducedDipoleP[1] + di3*atomJ.inducedDipoleP[2];
-#ifndef APPLY_SCALE
-        sci1                   += scip1; // folded in only after the energy above has been taken from the unmodified sci1
-#endif
-
-    float scip2                 = atomI.inducedDipole[0]*atomJ.inducedDipoleP[0] +
-                                  atomI.inducedDipole[1]*atomJ.inducedDipoleP[1] +
-                                  atomI.inducedDipole[2]*atomJ.inducedDipoleP[2] +
-                                  atomJ.inducedDipole[0]*atomI.inducedDipoleP[0] +
-                                  atomJ.inducedDipole[1]*atomI.inducedDipoleP[1] +
-                                  atomJ.inducedDipole[2]*atomI.inducedDipoleP[2];
-
-    float scip8                 = qkr1*atomI.inducedDipoleP[0] + qkr2*atomI.inducedDipoleP[1] + qkr3*atomI.inducedDipoleP[2];
-#ifndef APPLY_SCALE
-          sci8                 += scip8; // folded in only after the energy above has been taken from the unmodified sci8
-#endif
-
-           gli1                 = ck*sci3 + sci1; // gli1..3 (and glip1..3) are reused for the second half
-           gli2                 = -(sci3*sc4 + 2.0f*sci8);
-           gli3                 = sci3*sc6;
-#ifdef APPLY_SCALE
-          glip1                 = ck*scip3 + scip1;
-          glip2                 = -(scip3*sc4 + 2.0f*scip8);
-          glip3                 = scip3*sc6;
-#endif
-
-
-#ifdef APPLY_SCALE
-    gfi1                       += (bn2*(gli1+glip1) + bn3*(gli2+glip2) + bn4*(gli3+glip3));
-    gfi1                       -= (rr1*rr1)*( 3.0f*(gli1*psc3 + glip1*dsc3) + 5.0f*(gli2*psc5 + glip2*dsc5 ) + 7.0f*(gli3*psc7+glip3*dsc7) );
-#else
-    gfi1                       += (bn2*gli1 + bn3*gli2 + bn4*gli3);
-    gfi1                       -= (rr1*rr1)*( 3.0f*gli1*psc3 + 5.0f*gli2*psc5 + 7.0f*gli3*psc7 );
-#endif
-    
-    if( cAmoebaSim.polarizationType == 0 ){
-#ifdef APPLY_SCALE
-        gfi1                       += scip2*(bn2 - (3.0f*rr1*rr1)*usc3);
-#else
-        gfi1                       += scip2*(bn2 - (3.0f*rr1*rr1)*psc3);
-#endif
-
-    }
-    gfi1                       *= 0.5f;
-
-    ftm21                       += gfi1*xr;
-    ftm22                       += gfi1*yr;
-    ftm23                       += gfi1*zr;
-
-    if( damp != 0.0f ){ // damping-only force terms for the second half; same structure as the first-half block
-
-        float expdamp = expf(damp); // NOTE(review): shadows the outer expdamp, which already holds the same value
-        float temp3   = -1.5f*damp*expdamp*rr1*rr1;
-        float temp5   = -damp;
-        float temp7   = -0.2f - 0.6f*damp;
-
-        float ddsc31  = temp3*xr;
-        float ddsc32  = temp3*yr;
-        float ddsc33  = temp3*zr;
-
-        float ddsc51  = temp5*ddsc31;
-        float ddsc52  = temp5*ddsc32;
-        float ddsc53  = temp5*ddsc33;
-
-        float ddsc71  = temp7*ddsc51;
-        float ddsc72  = temp7*ddsc52;
-        float ddsc73  = temp7*ddsc53;
-
-        float rr3     = rr1*rr1*rr1; // NOTE(review): shadows the outer rr3 with an identical value
-
-#ifdef APPLY_SCALE
-        temp3         =                  gli1*scalingFactors[PScaleIndex] + glip1*scalingFactors[DScaleIndex];
-        temp5         = (3.0f*rr1*rr1)*( gli2*scalingFactors[PScaleIndex] + glip2*scalingFactors[DScaleIndex]);
-        temp7         = (15.0f*rr3*rr1)*(gli3*scalingFactors[PScaleIndex] + glip3*scalingFactors[DScaleIndex]);
-#else
-        temp3         = gli1;
-        temp5         = (3.0f*rr1*rr1)*gli2;
-        temp7         = (15.0f*rr3*rr1)*(gli3);
-#endif
-
-        ftm21        -= rr3*(temp3*ddsc31 + temp5*ddsc51 + temp7*ddsc71);
-        ftm22        -= rr3*(temp3*ddsc32 + temp5*ddsc52 + temp7*ddsc72);
-        ftm23        -= rr3*(temp3*ddsc33 + temp5*ddsc53 + temp7*ddsc73);
-
-        if( cAmoebaSim.polarizationType == 0 ){ // induced-induced part of the damping terms; uses only the 3rd- and 5th-order ddsc vectors
-#ifdef APPLY_SCALE
-            temp3   =  scalingFactors[UScaleIndex]*scip2;
-            temp5   = -(3.0f*rr1*rr1)*scalingFactors[UScaleIndex]*sci34;
-#else
-            temp3   =  scip2;
-            temp5   = -(3.0f*rr1*rr1)*sci34;
-#endif
-            ftm21  -= rr3*(temp3*ddsc31 + temp5*ddsc51);
-            ftm22  -= rr3*(temp3*ddsc32 + temp5*ddsc52);
-            ftm23  -= rr3*(temp3*ddsc33 + temp5*ddsc53);
-        }
-    }
-
-    force[0] += ftm21; // accumulated on top of whatever the caller already stored in force[]
-    force[1] += ftm22;
-    force[2] += ftm23;
-/*
-    if( forceFactor == 1.0f ){
-        atomJ.force[0]  -= ftm21;
-        atomJ.force[1]  -= ftm22;
-        atomJ.force[2]  -= ftm23;
-    }   
-    atomI.force[0]      += ftm21;
-    atomI.force[1]      += ftm22;
-    atomI.force[2]      += ftm23;
-*/
-/*
-    forceTorqueEnergy->x += ftm21;
-    forceTorqueEnergy->y += ftm22;
-    forceTorqueEnergy->z += ftm23;
-*/
-
-    return;
-
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostaticT1.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostaticT1.h
-
-static __device__ void SUB_METHOD_NAME( calculatePmeDirectElectrostaticPairIxnT1, _kernel )( 
-                                        PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
-                                        const float4 delta, const float4 bn
-#ifdef APPLY_SCALE
-                                        , const float* scalingFactors
-#endif
-                                        ){
-
-    float xr                    = delta.x;
-    float yr                    = delta.y;
-    float zr                    = delta.z;
-#ifdef APPLY_SCALE
-    float rr1                   = delta.w;
-#endif
-
-    // set the permanent multipole and induced dipole values;
-
-    float di1                   = atomI.labFrameDipole[0];
-    float di2                   = atomI.labFrameDipole[1];
-    float di3                   = atomI.labFrameDipole[2];
-
-    float qi1                   = atomI.labFrameQuadrupole[0];
-    float qi2                   = atomI.labFrameQuadrupole[1];
-    float qi3                   = atomI.labFrameQuadrupole[2];
-    float qi5                   = atomI.labFrameQuadrupole[3];
-    float qi6                   = atomI.labFrameQuadrupole[4];
-    //float qi9                   = atomI.labFrameQuadrupole[5];
-    float qi9                   = -(atomI.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[3]);
-
-    float ck                    = atomJ.q;
-
-    float dk1                   = atomJ.labFrameDipole[0];
-    float dk2                   = atomJ.labFrameDipole[1];
-    float dk3                   = atomJ.labFrameDipole[2];
-
-    float qk1                   = atomJ.labFrameQuadrupole[0];
-    float qk2                   = atomJ.labFrameQuadrupole[1];
-    float qk3                   = atomJ.labFrameQuadrupole[2];
-    float qk5                   = atomJ.labFrameQuadrupole[3];
-    float qk6                   = atomJ.labFrameQuadrupole[4];
-    //float qk9                   = atomJ.labFrameQuadrupole[5];
-    float qk9                   = -(atomJ.labFrameQuadrupole[0] + atomJ.labFrameQuadrupole[3]);
-
-    float bn1                   = bn.x;
-    float bn2                   = bn.y;
-    float bn3                   = bn.z;
-    float bn4                   = bn.w;
-
-    // apply Thole polarization damping to scale factors
-
-#ifdef APPLY_SCALE
-    float rr2                   = rr1*rr1;
-    float rr3                   = rr1*rr2;
-    float rr5                   = 3.0f*rr3*rr2;
-    float rr7                   = 5.0f*rr5*rr2;
-    float rr9                   = 7.0f*rr7*rr2;
-
-    float scale                 = 1.0f-scalingFactors[MScaleIndex];
-    float prefactor             = scale*rr3 - bn1;
-#else
-    float prefactor             = -bn1;
-#endif
-    float dixdk1                = di2*dk3 - di3*dk2;
-    float ttm21                 = prefactor*dixdk1;
-
-    float dixdk2                = di3*dk1 - di1*dk3;
-    float ttm22                 = prefactor*dixdk2;
-
-    float dixdk3                = di1*dk2 - di2*dk1;
-    float ttm23                 = prefactor*dixdk3;
-
-    float qir1                  = qi1*xr + qi2*yr + qi3*zr;
-    float qir2                  = qi2*xr + qi5*yr + qi6*zr;
-    float qir3                  = qi3*xr + qi6*yr + qi9*zr;
-
-    float qkr1                  = qk1*xr + qk2*yr + qk3*zr;
-    float qkr2                  = qk2*xr + qk5*yr + qk6*zr;
-    float qkr3                  = qk3*xr + qk6*yr + qk9*zr;
-
-    float qiqkr1                = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
-    float qiqkr2                = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
-    float qiqkr3                = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;
-
-    float rxqikr1               = yr*qiqkr3 - zr*qiqkr2;
-    float qkrxqir1              = qkr2*qir3 - qkr3*qir2;
-#ifdef APPLY_SCALE
-    prefactor                   = 4.0f*(bn3 - scale*rr7);
-#else
-    prefactor                   = 4.0f*bn3;
-#endif
-    ttm21                      -= prefactor*(rxqikr1+qkrxqir1);
-
-    float rxqikr2               = zr*qiqkr1 - xr*qiqkr3;
-    float qkrxqir2              = qkr3*qir1 - qkr1*qir3;
-    ttm22                      -= prefactor*(rxqikr2+qkrxqir2);
-
-    float rxqikr3               = xr*qiqkr2 - yr*qiqkr1;
-    float qkrxqir3              = qkr1*qir2 - qkr2*qir1;
-    ttm23                      -= prefactor*(rxqikr3+qkrxqir3);
-
-    float qidk1                 = qi1*dk1 + qi2*dk2 + qi3*dk3;
-    float qidk2                 = qi2*dk1 + qi5*dk2 + qi6*dk3;
-    float qidk3                 = qi3*dk1 + qi6*dk2 + qi9*dk3;
-
-    float dixqkr1               = di2*qkr3 - di3*qkr2;
-    float dkxqir1               = dk2*qir3 - dk3*qir2;
-    float rxqidk1               = yr*qidk3 - zr*qidk2;
-    float qixqk1                = qi2*qk3 + qi5*qk6 + qi6*qk9 - qi3*qk2 - qi6*qk5 - qi9*qk6;
-#ifdef APPLY_SCALE
-    prefactor                   = 2.0f*(bn2 - scale*rr5);
-#else
-    prefactor                   = 2.0f*bn2;
-#endif
-    ttm21                      += prefactor*(dixqkr1+dkxqir1+rxqidk1-2.0f*qixqk1);
- 
-    float dixqkr2               = di3*qkr1 - di1*qkr3;
-    float dkxqir2               = dk3*qir1 - dk1*qir3;
-    float rxqidk2               = zr*qidk1 - xr*qidk3;
-    float qixqk2                = qi3*qk1 + qi6*qk2 + qi9*qk3 - qi1*qk3 - qi2*qk6 - qi3*qk9;
-    ttm22                      += prefactor*(dixqkr2+dkxqir2+rxqidk2-2.0f*qixqk2);
-
-    float dixqkr3               = di1*qkr2 - di2*qkr1;
-    float dkxqir3               = dk1*qir2 - dk2*qir1;
-    float rxqidk3               = xr*qidk2 - yr*qidk1;
-    float qixqk3                = qi1*qk2 + qi2*qk5 + qi3*qk6 - qi2*qk1 - qi5*qk2 - qi6*qk3;
-    ttm23                      += prefactor*(dixqkr3+dkxqir3+rxqidk3-2.0f*qixqk3);
-
-    float sc4                   = dk1*xr + dk2*yr + dk3*zr;
-    float sc6                   = qkr1*xr + qkr2*yr + qkr3*zr;
-
-    float gf2                   = -ck*bn1 + sc4*bn2 - sc6*bn3;
-#ifdef APPLY_SCALE
-    float gfr2                  = -ck*rr3 + sc4*rr5 - sc6*rr7;
-    prefactor                   = (gf2 - scale*gfr2);
-#else
-    prefactor                   = gf2;
-#endif
-    ttm21                      += prefactor*(di2*zr - di3*yr);
-    ttm22                      += prefactor*(di3*xr - di1*zr);
-    ttm23                      += prefactor*(di1*yr - di2*xr);
-
-    float gf5                   = (-ck*bn2+sc4*bn3-sc6*bn4);
-#ifdef APPLY_SCALE
-    float gfr5                  = (-ck*rr5+sc4*rr7-sc6*rr9); 
-    prefactor                   = 2.0f*(gf5 - scale*gfr5);
-#else
-    prefactor                   = 2.0f*gf5;
-#endif
-
-    float rxqir1                = yr*qir3 - zr*qir2;
-    float rxqir2                = zr*qir1 - xr*qir3;
-    float rxqir3                = xr*qir2 - yr*qir1;
-    ttm21                      -= prefactor*rxqir1; 
-    ttm22                      -= prefactor*rxqir2;
-    ttm23                      -= prefactor*rxqir3;
-
-    atomI.torque[0]            += ttm21;
-    atomI.torque[1]            += ttm22;
-    atomI.torque[2]            += ttm23;
-/*
-
-    torque[0]             = ttm21;
-    torque[1]             = ttm22;
-    torque[2]             = ttm23;
-*/
-
-    return;
-
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostaticT2.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostaticT2.h
-
-static __device__ void SUB_METHOD_NAME( calculatePmeDirectElectrostaticPairIxnT2, _kernel)( 
-                                        PmeDirectElectrostaticParticle& atomI, PmeDirectElectrostaticParticle& atomJ,
-                                        const float4 delta, const float4 bn
-#ifdef APPLY_SCALE
-                                        , const float* scalingFactors
-#endif
-                                        ){
-    // Adds to atomI.torque a contribution computed from atomI's lab-frame dipole/quadrupole and atomJ's inducedDipole / inducedDipoleP; atomJ is only read.
-    float xr                    = delta.x;
-    float yr                    = delta.y;
-    float zr                    = delta.z;
-    float rr1                   = delta.w;   // used below as an inverse distance (rr3 = rr1^3); presumably 1/r - confirm against caller
-
-    // load atomI's permanent lab-frame dipole and quadrupole components (atomJ's induced dipoles are read directly where used)
-
-    float di1                   = atomI.labFrameDipole[0];
-    float di2                   = atomI.labFrameDipole[1];
-    float di3                   = atomI.labFrameDipole[2];
-
-    float qi1                   = atomI.labFrameQuadrupole[0];
-    float qi2                   = atomI.labFrameQuadrupole[1];
-    float qi3                   = atomI.labFrameQuadrupole[2];
-    float qi5                   = atomI.labFrameQuadrupole[3];
-    float qi6                   = atomI.labFrameQuadrupole[4];
-    //float qi9                   = atomI.labFrameQuadrupole[5];
-    float qi9                   = -(atomI.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[3]);   // last diagonal entry rebuilt from entries 0 and 3; assumes a traceless quadrupole - TODO confirm
-
-    float bn1                   = bn.x;
-    float bn2                   = bn.y;
-    float bn3                   = bn.z;
-    // bn.w is not used in this routine
-    // apply Thole polarization damping to scale factors
-    // scale3/5/7 stay at 1 when the damp product is zero or the exponent is <= -50
-    float scale3                = 1.0f;
-    float scale5                = 1.0f;
-    float scale7                = 1.0f;
-
-    float damp                  = atomI.damp*atomJ.damp;
-    if( damp != 0.0f ){
-        float pgamma  = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;   // smaller of the two thole values
-        float ratio   = 1.0f/(rr1*damp);
-            damp      = -pgamma*ratio*ratio*ratio;   // damp is reused: from here on it holds the (negative) exponent
-        if( damp > -50.0f ){
-            float expdamp    = expf(damp);
-            scale3           = 1.0f - expdamp;
-            scale5           = 1.0f - (1.0f-damp)*expdamp;
-            scale7           = 1.0f - (1.0f-damp+0.6f*damp*damp)*expdamp;
-        }
-    }
-
-    // With APPLY_SCALE, dsc* use scalingFactors[DScaleIndex] and psc* use scalingFactors[PScaleIndex]; without it only psc* are defined.
-    float rr3                   = rr1*rr1*rr1;
-#ifdef APPLY_SCALE
-    float dsc3                  = rr3*(1.0f - scale3*scalingFactors[DScaleIndex]);
-    float dsc5                  = (3.0f*rr3*rr1*rr1)* (1.0f - scale5*scalingFactors[DScaleIndex]);
-    float dsc7                  = (15.0f*rr3*rr3*rr1)*(1.0f - scale7*scalingFactors[DScaleIndex]);
-
-    float psc3                  = rr3*(1.0f - scale3*scalingFactors[PScaleIndex]);
-    float psc5                  = (3.0f*rr3*rr1*rr1)*(1.0f - scale5*scalingFactors[PScaleIndex]);
-    float psc7                  = (15.0f*rr3*rr3*rr1)*(1.0f - scale7*scalingFactors[PScaleIndex]);
-#else
-    float psc3                  = rr3*(1.0f - scale3);
-    float psc5                  = (3.0f*rr3*rr1*rr1)*(1.0f - scale5);
-    float psc7                  = (15.0f*rr3*rr3*rr1)*(1.0f - scale7);
-#endif
-    // prefactor1 (psc-based) weights the inducedDipole terms; prefactor2 (dsc-based, APPLY_SCALE only) weights the inducedDipoleP terms
-    float prefactor1            = 0.5f*(psc3 - bn1);
-#ifdef APPLY_SCALE
-    float prefactor2            = 0.5f*(dsc3 - bn1);
-#endif
-    // dixuk* / dixukp*: components of the cross product of atomI's dipole with atomJ's inducedDipole / inducedDipoleP
-    float dixuk1                = di2*atomJ.inducedDipole[2]  - di3*atomJ.inducedDipole[1];
-    float dixukp1               = di2*atomJ.inducedDipoleP[2] - di3*atomJ.inducedDipoleP[1];
-
-#ifdef APPLY_SCALE
-    float ttm2i1                = prefactor1*dixuk1 + prefactor2*dixukp1;
-#else
-    float ttm2i1                = prefactor1*(dixuk1 + dixukp1);
-#endif
-
-    float dixuk2                = di3*atomJ.inducedDipole[0]  - di1*atomJ.inducedDipole[2];
-    float dixukp2               = di3*atomJ.inducedDipoleP[0] - di1*atomJ.inducedDipoleP[2];
-
-#ifdef APPLY_SCALE
-    float ttm2i2                = prefactor1*dixuk2 + prefactor2*dixukp2;
-#else
-    float ttm2i2                = prefactor1*(dixuk2 + dixukp2);
-#endif
-
-    float dixuk3                = di1*atomJ.inducedDipole[1]  - di2*atomJ.inducedDipole[0];
-    float dixukp3               = di1*atomJ.inducedDipoleP[1] - di2*atomJ.inducedDipoleP[0];
-#ifdef APPLY_SCALE
-    float ttm2i3                = prefactor1*dixuk3 + prefactor2*dixukp3;
-#else
-    float ttm2i3                = prefactor1*(dixuk3 + dixukp3);
-#endif
-    // sci4 / scip4: dot products of atomJ's inducedDipole / inducedDipoleP with (xr, yr, zr)
-    float sci4                  = atomJ.inducedDipole[0]*xr  + atomJ.inducedDipole[1]*yr  + atomJ.inducedDipole[2]*zr;
-    float scip4                 = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
-    float gti2                  = bn2*(sci4+scip4);
-#ifdef APPLY_SCALE
-    float gtri2                 = (sci4*psc5+scip4*dsc5);
-#else
-    float gtri2                 = psc5*(sci4+scip4);
-#endif
-    prefactor1                  = 0.5f*(gti2 - gtri2);
-    // the bracketed terms below are the components of the cross product (di1,di2,di3) x (xr,yr,zr)
-    ttm2i1                     += prefactor1*( di2*zr - di3*yr );
-    ttm2i2                     += prefactor1*( di3*xr - di1*zr );
-    ttm2i3                     += prefactor1*( di1*yr - di2*xr );
-    // qir*: atomI's symmetric quadrupole matrix applied to (xr, yr, zr)
-    float qir1                  = qi1*xr + qi2*yr + qi3*zr;
-    float qir2                  = qi2*xr + qi5*yr + qi6*zr;
-    float qir3                  = qi3*xr + qi6*yr + qi9*zr;
-
-#ifdef APPLY_SCALE
-    prefactor1                  = sci4*psc7 + scip4*dsc7 - bn3*(sci4+scip4);
-#else
-    prefactor1                  = psc7*(sci4+scip4) - bn3*(sci4+scip4);
-#endif
-    ttm2i1                     += prefactor1*( yr*qir3 - zr*qir2 );
-    ttm2i2                     += prefactor1*( zr*qir1 - xr*qir3 );
-    ttm2i3                     += prefactor1*( xr*qir2 - yr*qir1 );
-    // qiuk* / qiukp*: atomI's quadrupole matrix applied to atomJ's inducedDipole / inducedDipoleP
-    float qiuk1                 = qi1*atomJ.inducedDipole[0]  + qi2*atomJ.inducedDipole[1]  + qi3*atomJ.inducedDipole[2];
-    float qiuk2                 = qi2*atomJ.inducedDipole[0]  + qi5*atomJ.inducedDipole[1]  + qi6*atomJ.inducedDipole[2];
-    float qiuk3                 = qi3*atomJ.inducedDipole[0]  + qi6*atomJ.inducedDipole[1]  + qi9*atomJ.inducedDipole[2];
-
-    float qiukp1                = qi1*atomJ.inducedDipoleP[0] + qi2*atomJ.inducedDipoleP[1] + qi3*atomJ.inducedDipoleP[2];
-    float qiukp2                = qi2*atomJ.inducedDipoleP[0] + qi5*atomJ.inducedDipoleP[1] + qi6*atomJ.inducedDipoleP[2];
-    float qiukp3                = qi3*atomJ.inducedDipoleP[0] + qi6*atomJ.inducedDipoleP[1] + qi9*atomJ.inducedDipoleP[2];
-    // prefactor1/prefactor2 are reused here for the quadrupole / induced-dipole cross terms
-    prefactor1                  = (bn2 - psc5);
-#ifdef APPLY_SCALE
-    prefactor2                  = (bn2 - dsc5);
-#endif
-    float ukxqir1               = atomJ.inducedDipole[1]*qir3  - atomJ.inducedDipole[2]*qir2;
-    float ukxqirp1              = atomJ.inducedDipoleP[1]*qir3 - atomJ.inducedDipoleP[2]*qir2;
-    float rxqiuk1               = yr*qiuk3  - zr*qiuk2;
-    float rxqiukp1              = yr*qiukp3 - zr*qiukp2;
-
-#ifdef APPLY_SCALE
-    ttm2i1                     += prefactor1*(ukxqir1 + rxqiuk1) + prefactor2*(ukxqirp1 + rxqiukp1);
-#else
-    ttm2i1                     += prefactor1*( ukxqir1 + rxqiuk1 + ukxqirp1 + rxqiukp1 );
-#endif
-
-    float ukxqir2               = atomJ.inducedDipole[2]*qir1  - atomJ.inducedDipole[0]*qir3;
-    float ukxqirp2              = atomJ.inducedDipoleP[2]*qir1 - atomJ.inducedDipoleP[0]*qir3;
-    float rxqiuk2               = zr*qiuk1  - xr*qiuk3;
-    float rxqiukp2              = zr*qiukp1 - xr*qiukp3;
-#ifdef APPLY_SCALE
-    ttm2i2                     += prefactor1*(ukxqir2 + rxqiuk2) + prefactor2*(ukxqirp2 + rxqiukp2);
-#else
-    ttm2i2                     += prefactor1*( ukxqir2 + rxqiuk2 + ukxqirp2 + rxqiukp2 );
-#endif
-
-    float ukxqir3               = atomJ.inducedDipole[0]*qir2  - atomJ.inducedDipole[1]*qir1;
-    float ukxqirp3              = atomJ.inducedDipoleP[0]*qir2 - atomJ.inducedDipoleP[1]*qir1;
-    float rxqiuk3               = xr*qiuk2  - yr*qiuk1;
-    float rxqiukp3              = xr*qiukp2 - yr*qiukp1;
-#ifdef APPLY_SCALE
-    ttm2i3                     += prefactor1*(ukxqir3 + rxqiuk3) + prefactor2*(ukxqirp3 + rxqiukp3);
-#else
-    ttm2i3                     += prefactor1*(ukxqir3 + rxqiuk3 + ukxqirp3 + rxqiukp3 );
-#endif
-    // accumulate into atomI, which is passed by non-const reference so the caller sees the update
-    atomI.torque[0]            += ttm2i1;
-    atomI.torque[1]            += ttm2i2;
-    atomI.torque[2]            += ttm2i3;
-
-/*
-    torque[0]            += ttm2i1;
-    torque[1]            += ttm2i2;
-    torque[2]            += ttm2i3;
-*/
-
-    return;
-
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
-
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "cudaKernels.h"
-#include "amoebaCudaKernels.h"
-#include "kCalculateAmoebaCudaUtilities.h"
-
-static __constant__ cudaGmxSimulation cSim;   // file-local __constant__ copy of gpu->sim; written by SetCalculateAmoebaCudaPmeFixedEFieldSim
-static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;   // file-local __constant__ copy of amoebaGpu->amoebaSim; written by the same setter
-
-void SetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu)
-{   // Copies the host-side simulation structs into this file's __constant__ symbols cSim and cAmoebaSim.
-    cudaError_t status;
-    gpuContext gpu = amoebaGpu->gpuContext;
-    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));    // host gpu->sim -> device cSim
-    RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");   // RTERROR is defined elsewhere; presumably reports/aborts on a non-success status - confirm
-    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    // host amoebaSim -> device cAmoebaSim
-    RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
-}
-
-void GetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu)
-{   // Reverse of the Set routine: copies this file's __constant__ symbols cSim and cAmoebaSim back into the host-side structs.
-    cudaError_t status;
-    gpuContext gpu = amoebaGpu->gpuContext;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    // device cSim -> host gpu->sim
-    RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");   // NOTE(review): message text says "SetSim" although this is the Get path
-    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));         // device cAmoebaSim -> host amoebaSim
-    RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
-}
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* EFieldReciprocal,  float* fieldIn, float* fieldOut )
-{   // For each component pos: fieldOut[pos] = EFieldReciprocal[pos] + self-term + sum over the outputBuffers slabs of fieldIn.
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    // grid-stride loop below: each thread handles pos, pos + gridDim.x*blockDim.x, ... so any launch size covers all components
-    // Reduce field: sum the per-buffer partial values for every field component
-    // term scales the lab-frame dipole component to form the self-term: (4/3) * alphaEwald^3 / sqrtPi
-    const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
-    //const float term = 0.0f;
-    while (pos < fieldComponents)
-    {   
-
-        // self-term included here
-
-        float totalField = EFieldReciprocal[pos] + term*cAmoebaSim.pLabFrameDipole[pos];
-        // fieldIn is indexed as consecutive slabs of fieldComponents floats, one slab per output buffer
-        float* pFt       = fieldIn + pos;
-        unsigned int i   = outputBuffers;
-        while (i >= 4)   // consume four slabs per iteration
-        {   
-            totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents];
-            pFt        += fieldComponents*4;
-            i          -= 4;
-        }   
-        // at most three slabs remain: handle a pair, then a possible single
-        if (i >= 2)
-        {   
-            totalField += pFt[0] + pFt[fieldComponents];
-            pFt        += fieldComponents*2;
-            i          -= 2;
-        }   
-
-        if (i > 0)
-        {   
-            totalField += pFt[0];
-        }   
-        // plain assignment: any previous content of fieldOut[pos] is overwritten
-        fieldOut[pos]   = totalField;
-        pos            += gridDim.x * blockDim.x;
-    }   
-}
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int outputBuffers,  float* fieldIn, float* fieldOut )
-{   // For each component pos: fieldOut[pos] += self-term + sum over the outputBuffers slabs of fieldIn (accumulates into the existing fieldOut value).
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    // grid-stride loop below: each thread handles pos, pos + gridDim.x*blockDim.x, ... so any launch size covers all components
-    // Reduce field: sum the per-buffer partial values for every field component
-    // term scales the lab-frame dipole component to form the self-term: (4/3) * alphaEwald^3 / sqrtPi
-    const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
-    //const float term = 0.0;
-    while (pos < fieldComponents)
-    {   
-
-        // self-term included here
-
-        float totalField = term*cAmoebaSim.pLabFrameDipole[pos];
-        // fieldIn is indexed as consecutive slabs of fieldComponents floats, one slab per output buffer
-        float* pFt       = fieldIn + pos;
-        unsigned int i   = outputBuffers;
-        while (i >= 4)   // consume four slabs per iteration
-        {   
-            totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents];
-            pFt        += fieldComponents*4;
-            i          -= 4;
-        }   
-        // at most three slabs remain: handle a pair, then a possible single
-        if (i >= 2)
-        {   
-            totalField += pFt[0] + pFt[fieldComponents];
-            pFt        += fieldComponents*2;
-            i          -= 2;
-        }   
-
-        if (i > 0)
-        {   
-            totalField += pFt[0];
-        }   
-        // += (not =): the value already stored in fieldOut[pos] is kept and added to
-        fieldOut[pos]  += totalField;
-        pos            += gridDim.x * blockDim.x;
-    }   
-}
-
-// reduce psWorkArray_3_1 -> EField
-// reduce psWorkArray_3_2 -> EFieldPolar
-
-static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
-{
-    // Launches the two reduction kernels; both cover paddedNumberOfAtoms*3 components and gpu->sim.outputBuffers slabs.
-    gpuContext gpu = amoebaGpu->gpuContext;
-    // Order matters: the first launch reads psE_Field as its reciprocal-space input, and the second launch then accumulates into psE_Field in place.
-    // E_FieldPolar = E_Field (reciprocal) + E_FieldPolar (direct) + self
-    // NOTE(review): block size is bsf_reduce_threads_per_block while the kernels declare __launch_bounds__ with *_THREADS_PER_BLOCK - presumably the former never exceeds the latter; confirm
-    kReducePmeEFieldPolar_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-                                   gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                                   amoebaGpu->psE_Field->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
-    LAUNCHERROR("kReducePmeE_Fields1");
-
-    // E_Field = E_Field (reciprocal) + E_Field (direct) + self
-    // psWorkArray_3_1 is the input; psE_Field is updated with += inside the kernel
-    kReducePmeEField_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-                              gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                              amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
-    LAUNCHERROR("kReducePmeE_Fields2");
-}
-
-// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
-
-#undef GK
-#undef INCLUDE_FIXED_FIELD_BUFFERS
-#define INCLUDE_FIXED_FIELD_BUFFERS
-#include "kCalculateAmoebaCudaFixedFieldParticle.h"
-#undef INCLUDE_FIXED_FIELD_BUFFERS
-__device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& atomJ ){   // adds atomJ's tempBuffer and tempBufferP (3 components each) into atomI's; atomJ is not modified
-    atomI.tempBuffer[0]  += atomJ.tempBuffer[0];
-    atomI.tempBuffer[1]  += atomJ.tempBuffer[1];
-    atomI.tempBuffer[2]  += atomJ.tempBuffer[2];
-    // same component-wise sum for the second ("P") buffer
-    atomI.tempBufferP[0] += atomJ.tempBufferP[0];
-    atomI.tempBufferP[1] += atomJ.tempBufferP[1];
-    atomI.tempBufferP[2] += atomJ.tempBufferP[2];
-}
-
-__device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
-                                                            float dscale, float pscale, float4 fields[3]){
-    // Fills fields[0..2] (x, y, z components) for one I/J pair: .x/.z are built from atomJ's multipoles and .y/.w from atomI's; .x/.y use dscale and .z/.w use pscale. Every entry is set to 0 beyond the cutoff.
-    // compute the real space portion of the Ewald summation
-  
-    float xr          = atomJ.x - atomI.x;
-    float yr          = atomJ.y - atomI.y;
-    float zr          = atomJ.z - atomI.z;
-
-    // periodic boundary conditions (per-axis wrap of the separation using the box size and its inverse)
-
-    xr               -= floorf(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-    yr               -= floorf(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-    zr               -= floorf(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-
-    float r2          = xr*xr + yr*yr + zr*zr;
-    if( r2 <= cSim.nonbondedCutoffSqr ){
-        // NOTE(review): r2 == 0 would divide by zero below (bn0 = erfcf/r, bn1 = .../r2); presumably the caller never passes coincident atoms - confirm
-        float r           = sqrtf(r2);
-
-        // calculate the error function damping terms
-
-        float ralpha      = cSim.alphaEwald*r;
-        // bn0..bn3 follow the recurrence bn(k) = ((2k-1)*bn(k-1) + alsq2n*exp2a)/r2, with alsq2n multiplied by alsq2 before each step
-        float bn0         = erfcf(ralpha)/r;
-        float alsq2       = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
-        float alsq2n      = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
-        float exp2a       = expf(-(ralpha*ralpha));
-        alsq2n           *= alsq2;
-        float bn1         = (bn0+alsq2n*exp2a)/r2;
-
-        alsq2n           *= alsq2;
-        float bn2         = (3.0f*bn1+alsq2n*exp2a)/r2;
-
-        alsq2n           *= alsq2;
-        float bn3         = (5.0f*bn2+alsq2n*exp2a)/r2;
-
-        // Thole damping factors scale3/5/7 (left at 1 when the damp product is zero or the exponent is <= -50), then their dscale/pscale-weighted forms
-
-        float scale3      = 1.0f;
-        float scale5      = 1.0f;
-        float scale7      = 1.0f;
-        float damp        = atomI.damp*atomJ.damp;
-        if( damp != 0.0f ){
-
-            float ratio  = (r/damp);
-                  ratio  = ratio*ratio*ratio;
-
-            float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;   // smaller of the two thole values
-
-                  damp   = -pgamma*ratio;   // damp is reused: from here on it holds the (negative) exponent
-
-            if( damp > -50.0f) {
-                float expdamp = expf(damp);
-                scale3        = 1.0f - expdamp;
-                scale5        = 1.0f - expdamp*(1.0f-damp);
-                scale7        = 1.0f - expdamp*(1.0f-damp+(0.6f*damp*damp));
-            }
-        }
-        float dsc3        = dscale*scale3;
-        float dsc5        = dscale*scale5;
-        float dsc7        = dscale*scale7;
-
-        float psc3        = pscale*scale3;
-        float psc5        = pscale*scale5;
-        float psc7        = pscale*scale7;
-        // drr*/prr* = (1 - scaled damping)/r^n, with the factors 3 and 15 folded into the r^5 and r^7 terms
-        float r3          = (r*r2);
-        float r5          = (r3*r2);
-        float r7          = (r5*r2);
-        float drr3        = (1.0f-dsc3)/r3;
-        float drr5        = 3.0f * (1.0f-dsc5)/r5;
-        float drr7        = 15.0f * (1.0f-dsc7)/r7;
-
-        float prr3        = (1.0f-psc3) / r3;
-        float prr5        = 3.0f *(1.0f-psc5)/r5;
-        float prr7        = 15.0f*(1.0f-psc7)/r7;
-        // dir, qi{x,y,z}, qir: atomI's dipole and quadrupole contracted with (xr, yr, zr); dkr, qk{x,y,z}, qkr: the same for atomJ
-        float dir         = atomI.labFrameDipole_X*xr      + atomI.labFrameDipole_Y*yr      + atomI.labFrameDipole_Z*zr;
-
-        float qix         = atomI.labFrameQuadrupole_XX*xr + atomI.labFrameQuadrupole_XY*yr + atomI.labFrameQuadrupole_XZ*zr;
-        float qiy         = atomI.labFrameQuadrupole_XY*xr + atomI.labFrameQuadrupole_YY*yr + atomI.labFrameQuadrupole_YZ*zr;
-        float qiz         = atomI.labFrameQuadrupole_XZ*xr + atomI.labFrameQuadrupole_YZ*yr + atomI.labFrameQuadrupole_ZZ*zr;
-
-        float qir         = qix*xr + qiy*yr + qiz*zr;
-
-        float dkr         = atomJ.labFrameDipole_X*xr      + atomJ.labFrameDipole_Y*yr      + atomJ.labFrameDipole_Z*zr;
-
-        float qkx         = atomJ.labFrameQuadrupole_XX*xr + atomJ.labFrameQuadrupole_XY*yr + atomJ.labFrameQuadrupole_XZ*zr;
-        float qky         = atomJ.labFrameQuadrupole_XY*xr + atomJ.labFrameQuadrupole_YY*yr + atomJ.labFrameQuadrupole_YZ*zr;
-        float qkz         = atomJ.labFrameQuadrupole_XZ*xr + atomJ.labFrameQuadrupole_YZ*yr + atomJ.labFrameQuadrupole_ZZ*zr;
-
-        float qkr         = qkx*xr + qky*yr + qkz*zr;
-        // fim*/fid*/fip* use atomJ's charge, dipole and quadrupole with the bn / drr / prr coefficients; fkm*/fkd*/fkp* use atomI's
-        float fim0        = -xr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)    - bn1*atomJ.labFrameDipole_X  + 2.0f*bn2*qkx;
-        float fim1        = -yr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)    - bn1*atomJ.labFrameDipole_Y  + 2.0f*bn2*qky;
-        float fim2        = -zr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)    - bn1*atomJ.labFrameDipole_Z  + 2.0f*bn2*qkz;
-
-        float fkm0        = xr*(bn1*atomI.q+bn2*dir+bn3*qir)     - bn1*atomI.labFrameDipole_X  - 2.0f*bn2*qix;
-        float fkm1        = yr*(bn1*atomI.q+bn2*dir+bn3*qir)     - bn1*atomI.labFrameDipole_Y  - 2.0f*bn2*qiy;
-        float fkm2        = zr*(bn1*atomI.q+bn2*dir+bn3*qir)     - bn1*atomI.labFrameDipole_Z  - 2.0f*bn2*qiz;
-
-        float fid0        = -xr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_X + 2.0f*drr5*qkx;
-        float fid1        = -yr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_Y + 2.0f*drr5*qky;
-        float fid2        = -zr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_Z + 2.0f*drr5*qkz;
-
-        float fkd0        = xr*(drr3*atomI.q+drr5*dir+drr7*qir)  - drr3*atomI.labFrameDipole_X - 2.0f*drr5*qix;
-        float fkd1        = yr*(drr3*atomI.q+drr5*dir+drr7*qir)  - drr3*atomI.labFrameDipole_Y - 2.0f*drr5*qiy;
-        float fkd2        = zr*(drr3*atomI.q+drr5*dir+drr7*qir)  - drr3*atomI.labFrameDipole_Z - 2.0f*drr5*qiz;
-
-        float fip0        = -xr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_X + 2.0f*prr5*qkx;
-        float fip1        = -yr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_Y + 2.0f*prr5*qky;
-        float fip2        = -zr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_Z + 2.0f*prr5*qkz;
-
-        float fkp0        = xr*(prr3*atomI.q+prr5*dir+prr7*qir)  - prr3*atomI.labFrameDipole_X - 2.0f*prr5*qix;
-        float fkp1        = yr*(prr3*atomI.q+prr5*dir+prr7*qir)  - prr3*atomI.labFrameDipole_Y - 2.0f*prr5*qiy;
-        float fkp2        = zr*(prr3*atomI.q+prr5*dir+prr7*qir)  - prr3*atomI.labFrameDipole_Z - 2.0f*prr5*qiz;
-
-        // store the field contributions for this interaction (plain assignment, not +=; any accumulation has to happen in the caller)
-
-        fields[0].x       = fim0 - fid0;
-        fields[1].x       = fim1 - fid1;
-        fields[2].x       = fim2 - fid2;
-
-        fields[0].y       = fkm0 - fkd0;
-        fields[1].y       = fkm1 - fkd1;
-        fields[2].y       = fkm2 - fkd2;
-
-        fields[0].z       = fim0 - fip0;
-        fields[1].z       = fim1 - fip1;
-        fields[2].z       = fim2 - fip2;
-
-        fields[0].w       = fkm0 - fkp0;
-        fields[1].w       = fkm1 - fkp1;
-        fields[2].w       = fkm2 - fkp2;
- 
-    } else {
-        // pair is beyond the cutoff: zero every output component so the caller can use fields unconditionally
-        fields[0].x       = 0.0f;
-        fields[0].y       = 0.0f;
-        fields[0].z       = 0.0f;
-        fields[0].w       = 0.0f;
-    
-        fields[1].x       = 0.0f;
-        fields[1].y       = 0.0f;
-        fields[1].z       = 0.0f;
-        fields[1].w       = 0.0f;
-    
-        fields[2].x       = 0.0f;
-        fields[2].y       = 0.0f;
-        fields[2].z       = 0.0f;
-        fields[2].w       = 0.0f;
-    }
-
-}
-
-// Include the kernel body twice: first named with the "Cutoff" infix, then again with USE_OUTPUT_BUFFER_PER_WARP defined and the "CutoffByWarp" infix.
-// METHOD_NAME(a, b) token-pastes the infix between a and b, so each include produces a distinctly named kernel.
-#define METHOD_NAME(a, b) a##Cutoff##b
-#include "kCalculateAmoebaCudaPmeFixedEField.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##CutoffByWarp##b
-#include "kCalculateAmoebaCudaPmeFixedEField.h"
-
-/**---------------------------------------------------------------------------------------
-
-   Report whether a number is a nan or infinity
-
-   @param number               number to test
-   @return 1 if number is  nan or infinity; else return 0
-
-   --------------------------------------------------------------------------------------- */
-
-/**---------------------------------------------------------------------------------------
-
-   Compute fixed electric field using PME
-
-   @param amoebaGpu        amoebaGpu context
-
-   --------------------------------------------------------------------------------------- */
-
-static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
-{
-    // Clears the work fields, launches the direct-space fixed-field kernel (per-warp or regular variant), then reduces the partial results.
-    static unsigned int threadsPerBlock  = 0;   // function-static: computed on the first call and reused by every later call, whatever context is passed
-    gpuContext gpu                       = amoebaGpu->gpuContext;
-
-    kClearFields_3( amoebaGpu, 2 );   // defined elsewhere; presumably zeroes the work arrays written below - confirm
-
-    // on first pass, set threads/block
-    // the caps 384 / 192 / 64 mirror the __launch_bounds__ values declared for the kernel in kCalculateAmoebaCudaPmeFixedEField.h
-    if( threadsPerBlock == 0 ){ 
-        unsigned int maxThreads;
-        if (gpu->sm_version >= SM_20)
-            maxThreads = 384; 
-        else if (gpu->sm_version >= SM_12)
-            maxThreads = 192;
-        else
-            maxThreads = 64;
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
-    }    
-    // dynamic shared memory request: one FixedFieldParticle per thread in the block
-    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
-                                                                           gpu->sim.pInteractingWorkUnit,
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
-    } else {
-        kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
-                                                                           gpu->sim.pInteractingWorkUnit,
-                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
-    }
-    LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");
-    // fold psWorkArray_3_1 / psWorkArray_3_2 into psE_Field / psE_FieldPolar
-    kReducePmeDirectE_Fields( amoebaGpu );
-
-}
-
-void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
-{
-    // Public entry point: runs kCalculateAmoebaPMEFixedMultipoles (defined elsewhere; presumably the reciprocal-space part - confirm) and then the direct-space pass.
-    kCalculateAmoebaPMEFixedMultipoles( amoebaGpu );
-    cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );   // must come second: its reduction reads psE_Field as the reciprocal-space input
-
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "amoebaScaleFactors.h"
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(384, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(192, 1)
-#else
-__launch_bounds__(64, 1)
-#endif
-void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
-                            unsigned int* workUnit,          // encoded cells; each entry is unpacked with decodeCell()
-                            float* outputEField,             // receives fieldSum and the shared eField accumulators
-                            float* outputEFieldPolar){       // receives fieldPolarSum and the shared eFieldP accumulators
-    // Each GRID-sized group of threads ("warp" below) processes a contiguous slice [pos, end) of the work units.
-    extern __shared__ FixedFieldParticle sA[];
-    // sA is indexed by threadIdx.x, so the launch must supply one FixedFieldParticle of dynamic shared memory per thread.
-    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;                  // number of GRID-sized groups in the launch
-    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;   // index of this thread's group
-    unsigned int numWorkUnits    = cSim.pInteractionCount[0];
-    unsigned int pos             = warp*numWorkUnits/totalWarps;               // first work unit of this group
-    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;           // one past the last work unit of this group
-    unsigned int lasty           = 0xFFFFFFFF;                                 // y of the previous x != y cell; initial value matches no cell
-
-    while (pos < end)
-    {
-
-        unsigned int x;
-        unsigned int y;
-        bool bExclusionFlag;
-        float dScaleValue;
-        float pScaleValue;
-        int  dScaleMask;
-        int2 pScaleMask;
-
-        // extract cell coordinates
-
-        decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
-
-        unsigned int tgx           = threadIdx.x & (GRID - 1);   // index of this thread within its group (assumes GRID is a power of two)
-        unsigned int tbx           = threadIdx.x - tgx;          // thread index of the group's first thread
-        unsigned int tj            = tgx;                        // rotating j index used in the x != y branch
-
-        FixedFieldParticle* psA    = &sA[tbx];                   // this group's slice of shared memory
-        unsigned int atomI         = x + tgx;
-        FixedFieldParticle localParticle;
-        loadFixedFieldShared( &localParticle, atomI );
-
-        float fieldSum[3];
-        float fieldPolarSum[3];
-
-        fieldSum[0]                = 0.0f;
-        fieldSum[1]                = 0.0f;
-        fieldSum[2]                = 0.0f;
-
-        fieldPolarSum[0]           = 0.0f;
-        fieldPolarSum[1]           = 0.0f;
-        fieldPolarSum[2]           = 0.0f;
-
-        if (x == y)
-        {
-
-            // load coordinates, charge, ... (same atom as localParticle, since x == y)
-
-            loadFixedFieldShared( &(sA[threadIdx.x]), atomI );
-
-            if( bExclusionFlag ){
-                unsigned int xi       = x >> GRIDBITS;
-                unsigned int cell     = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
-                dScaleMask            = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                pScaleMask            = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-            } else {
-                dScaleValue = pScaleValue = 1.0f;
-                // no exclusion flag on this cell: both scale factors stay 1 for every j
-            }
-
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-
-                if( bExclusionFlag ){
-                    getMaskedDScaleFactor( j, dScaleMask, &dScaleValue );
-                    getMaskedPScaleFactor( j, pScaleMask, &pScaleValue );
-                }
-
-                float4 ijField[3];
-                calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField);
-
-                // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
-                // by setting match flag; indices >= cSim.atoms are excluded the same way
-
-                unsigned int match      = ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 1 : 0;
-
-                // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-
-                fieldSum[0]            += match ? 0.0f : ijField[0].x;
-                fieldSum[1]            += match ? 0.0f : ijField[1].x;
-                fieldSum[2]            += match ? 0.0f : ijField[2].x;
-
-                fieldPolarSum[0]       += match ? 0.0f : ijField[0].z;
-                fieldPolarSum[1]       += match ? 0.0f : ijField[1].z;
-                fieldPolarSum[2]       += match ? 0.0f : ijField[2].z;
-
-            }
-
-            // Write results
-
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
-            load3dArrayBufferPerWarp( offset, fieldSum,       outputEField );
-            load3dArrayBufferPerWarp( offset, fieldPolarSum,  outputEFieldPolar );
-#else
-            unsigned int offset                 = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
-            load3dArray( offset, fieldSum,       outputEField );
-            load3dArray( offset, fieldPolarSum,  outputEFieldPolar );
-#endif
-
-        } else {
-
-            if (lasty != y ) {
-                // NOTE(review): the x == y branch also overwrites sA but never updates lasty - confirm the work-unit order makes this reload test sufficient
-                // load coordinates, charge, ...
-    
-                loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx) );
-    
-            }
-
-            unsigned int flags = cSim.pInteractionFlag[pos];
-            if (flags == 0) {
-                // No interactions in this block.
-            } else {
-
-                // zero shared fields
-
-                zeroFixedFieldParticleSharedField( &(sA[threadIdx.x]) );
-
-                if( bExclusionFlag ) {
-                    unsigned int xi   = x >> GRIDBITS;
-                    unsigned int yi   = y >> GRIDBITS;
-                    unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
-                    dScaleMask        = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                    pScaleMask        = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                } else {
-                    dScaleValue = pScaleValue  = 1.0f;
-                }
-
-                for (unsigned int j = 0; j < GRID; j++){
-
-                    if ((flags&(1<<j)) != 0) {   // bit j of flags decides whether step j is evaluated
-                        unsigned int jIdx = (flags == 0xFFFFFFFF) ? tj : j;   // all bits set: each thread uses its own rotating tj; otherwise every thread uses the same j
-                        if( bExclusionFlag ){
-                            getMaskedDScaleFactor( jIdx, dScaleMask, &dScaleValue );
-                            getMaskedPScaleFactor( jIdx, pScaleMask, &pScaleValue );
-                        }
-
-                        float4 ijField[3];
-                        calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField);
-
-                        unsigned int outOfBounds     = ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 1 : 0;
-
-                        // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-
-                        fieldSum[0]                 += outOfBounds ? 0.0f : ijField[0].x;
-                        fieldSum[1]                 += outOfBounds ? 0.0f : ijField[1].x;
-                        fieldSum[2]                 += outOfBounds ? 0.0f : ijField[2].x;
-
-                        fieldPolarSum[0]            += outOfBounds ? 0.0f : ijField[0].z;
-                        fieldPolarSum[1]            += outOfBounds ? 0.0f : ijField[1].z;
-                        fieldPolarSum[2]            += outOfBounds ? 0.0f : ijField[2].z;
-
-                        if( flags == 0xFFFFFFFF ){
-
-                            // add to field at atomJ the field due atomI's charge/dipole/quadrupole
-
-                            psA[jIdx].eField[0]        += outOfBounds ? 0.0f : ijField[0].y;
-                            psA[jIdx].eField[1]        += outOfBounds ? 0.0f : ijField[1].y;
-                            psA[jIdx].eField[2]        += outOfBounds ? 0.0f : ijField[2].y;
-
-                            psA[jIdx].eFieldP[0]       += outOfBounds ? 0.0f : ijField[0].w;
-                            psA[jIdx].eFieldP[1]       += outOfBounds ? 0.0f : ijField[1].w;
-                            psA[jIdx].eFieldP[2]       += outOfBounds ? 0.0f : ijField[2].w;
-
-                        } else {
-                            // partial flags: every thread targets the same psA[jIdx], so the per-thread terms are first summed across the group
-                            sA[threadIdx.x].tempBuffer[0]  = outOfBounds ? 0.0f : ijField[0].y;
-                            sA[threadIdx.x].tempBuffer[1]  = outOfBounds ? 0.0f : ijField[1].y;
-                            sA[threadIdx.x].tempBuffer[2]  = outOfBounds ? 0.0f : ijField[2].y;
-
-                            sA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[0].w;
-                            sA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[1].w;
-                            sA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[2].w;
-                            // NOTE(review): no barrier separates these shared-memory writes from the reads below; presumably relies on the group running in lockstep - confirm
-                            if( tgx % 2 == 0 ){
-                                sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
-                            }
-                            if( tgx % 4 == 0 ){
-                                sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
-                            }
-                            if( tgx % 8 == 0 ){
-                                sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
-                            }
-                            if( tgx % 16 == 0 ){
-                                sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
-                            }
-                            // threads 0 and 16 of the group now hold the sums of their 16-thread halves (assumes GRID == 32 - TODO confirm)
-                            if (tgx == 0)
-                            {
-                                psA[jIdx].eField[0]  += sA[threadIdx.x].tempBuffer[0]  + sA[threadIdx.x+16].tempBuffer[0];
-                                psA[jIdx].eField[1]  += sA[threadIdx.x].tempBuffer[1]  + sA[threadIdx.x+16].tempBuffer[1];
-                                psA[jIdx].eField[2]  += sA[threadIdx.x].tempBuffer[2]  + sA[threadIdx.x+16].tempBuffer[2];
-
-                                psA[jIdx].eFieldP[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0];
-                                psA[jIdx].eFieldP[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
-                                psA[jIdx].eFieldP[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2];
-                            }
-                        }
-
-                    }
-                    tj = (tj + 1) & (GRID - 1);   // advance the rotating index, wrapping within the group
-
-                } // j-loop block
-    
-                // Write results: fieldSum goes to the x atoms, the shared accumulators go to the y atoms
-    
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-                unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
-                load3dArrayBufferPerWarp( offset, fieldSum,       outputEField );
-                load3dArrayBufferPerWarp( offset, fieldPolarSum,  outputEFieldPolar );
-    
-                offset                              = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
-                load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField,  outputEField );
-                load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
-    
-#else
-                unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
-                load3dArray( offset, fieldSum,       outputEField );
-                load3dArray( offset, fieldPolarSum,  outputEFieldPolar );
-    
-                offset                              = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
-                load3dArray( offset, sA[threadIdx.x].eField,  outputEField );
-                load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
-     
-#endif
-            } // end of pInteractionFlag block 
-            lasty = y;
-        } // x == y block
-
-        pos++;
-    }
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "amoebaGpuTypes.h"
-#include "amoebaCudaKernels.h"
-#include "kCalculateAmoebaCudaUtilities.h"
-#include "openmm/OpenMMException.h"
-
-#include <stdio.h>
-
-using namespace std;
-
-static __constant__ cudaGmxSimulation cSim;
-static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
-
-/**
- * Uploads the host-side simulation structs into this file's __constant__
- * symbols: gpu->sim into cSim, then amoebaGpu->amoebaSim into cAmoebaSim.
- * The status of each copy is handed to RTERROR.
- *
- * @param amoebaGpu  AMOEBA GPU context that owns both host-side structs
- */
-void SetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
-{
-    gpuContext gpu = amoebaGpu->gpuContext;
-
-    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "SetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
-
-    status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
-    RTERROR(status, "SetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
-}
-
-/**
- * Reads this file's __constant__ symbols back to the host: cSim into
- * gpu->sim, then cAmoebaSim into amoebaGpu->amoebaSim.
- * The status of each copy is handed to RTERROR.
- *
- * @param amoebaGpu  AMOEBA GPU context whose host-side structs are overwritten
- */
-void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
-{
-    gpuContext gpu = amoebaGpu->gpuContext;
-
-    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-
-    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
-    RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
-}
-
-#undef INCLUDE_MI_FIELD_BUFFERS
-#define INCLUDE_MI_FIELD_BUFFERS 
-#include "kCalculateAmoebaCudaMutualInducedParticle.h"
-#ifdef INCLUDE_MI_FIELD_BUFFERS
-// Adds atomJ's tempBuffer and tempBufferP (three components each) into atomI's.
-__device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){
-
-    // first the three tempBuffer components ...
-    for( int k = 0; k < 3; k++ ){
-        atomI.tempBuffer[k]  += atomJ.tempBuffer[k];
-    }
-
-    // ... then the three tempBufferP components
-    for( int k = 0; k < 3; k++ ){
-        atomI.tempBufferP[k] += atomJ.tempBufferP[k];
-    }
-}
-#endif
-
-// Computes the box-wrapped separation of atomI and atomJ plus the two radial prefactors consumed by the mutual induced field pair routines.
-// Outputs: delta->x/y/z = wrapped separation, delta->w = rr3 - bn1, *preFactor2 = bn2 - rr5; both prefactors are 0 when r2 exceeds the cutoff.
-__device__ void setupMutualInducedFieldPairIxn_kernel( const MutualInducedParticle& atomI, const MutualInducedParticle& atomJ,
-                                                       const float uscale, float4* delta, float* preFactor2 ) {
-
-    // compute the real space portion of the Ewald summation
-  
-    delta->x                = atomJ.x - atomI.x;
-    delta->y                = atomJ.y - atomI.y;
-    delta->z                = atomJ.z - atomI.z;
-
-    // periodic boundary conditions
-
-    delta->x               -= floorf(delta->x*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-    delta->y               -= floorf(delta->y*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-    delta->z               -= floorf(delta->z*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-
-    float r2                = (delta->x*delta->x) + (delta->y*delta->y) + (delta->z*delta->z); 
-    if( r2 <= cSim.nonbondedCutoffSqr ){
-        float r           = sqrtf(r2);
-
-        // calculate the error function damping terms
-
-        float ralpha      = cSim.alphaEwald*r;
-
-        float bn0         = erfcf(ralpha)/r;
-        float alsq2       = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
-        float alsq2n      = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
-        float exp2a       = expf(-(ralpha*ralpha));
-        alsq2n           *= alsq2;
-        float bn1         = (bn0+alsq2n*exp2a)/r2;
-
-        alsq2n           *= alsq2;
-        float bn2         = (3.0f*bn1+alsq2n*exp2a)/r2;
-
-        // compute the error function scaled and unscaled terms
-
-        float scale3      = 1.0f;
-        float scale5      = 1.0f;
-        float damp        = atomI.damp*atomJ.damp;
-        if( damp != 0.0f ){   // a zero damp product leaves scale3 = scale5 = 1
-
-            float ratio  = (r/damp);
-                  ratio  = ratio*ratio*ratio;
-            float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;   // smaller of the two thole values
-                  damp   = -pgamma*ratio;
-
-            if( damp > -50.0f) {   // at or below -50 the scales are left at 1 and expf() is skipped
-                float expdamp = expf(damp);
-                scale3        = 1.0f - expdamp;
-                scale5        = 1.0f - expdamp*(1.0f-damp);
-            }
-        }
-        float dsc3        = uscale*scale3;
-        float dsc5        = uscale*scale5;
-
-        float r3          = (r*r2);
-        float r5          = (r3*r2);
-        float rr3         = (1.0f-dsc3)/r3;
-        float rr5         = 3.0f*(1.0f-dsc5)/r5;
-
-        delta->w          = rr3 - bn1;
-        *preFactor2       = bn2 - rr5;
-    } else {
-        delta->w = *preFactor2 = 0.0f;   // beyond the cutoff: both prefactors vanish; delta->x/y/z keep the wrapped separation
-    }
-}
-
-// Accumulates into fieldSum the pair term
-//     preFactor2*(inducedDipole . delta)*delta + delta.w*inducedDipole
-// where the dot product uses only delta.x/y/z and delta.w is a scalar weight.
-__device__ void calculateMutualInducedFieldPairIxn_kernel( const float inducedDipole[3], const float4 delta, const float preFactor2, float fieldSum[3] ) {
-
-    const float dipoleDotDelta = inducedDipole[0]*delta.x   + inducedDipole[1]*delta.y  + inducedDipole[2]*delta.z;
-    const float radialWeight   = preFactor2*dipoleDotDelta;
-
-    fieldSum[0] += radialWeight*delta.x + delta.w*inducedDipole[0];
-    fieldSum[1] += radialWeight*delta.y + delta.w*inducedDipole[1];
-    fieldSum[2] += radialWeight*delta.z + delta.w*inducedDipole[2];
-}
-
-// Overwrites (does not accumulate) fieldSum with the pair term
-//     preFactor2*(inducedDipole . delta)*delta + delta.w*inducedDipole
-// where the dot product uses only delta.x/y/z and delta.w is a scalar weight.
-__device__ void calculateMutualInducedFieldPairIxnNoAdd_kernel( const float inducedDipole[3], const float4 delta, const float preFactor2, float fieldSum[3] ) {
-
-    const float dipoleDotDelta = inducedDipole[0]*delta.x   + inducedDipole[1]*delta.y  + inducedDipole[2]*delta.z;
-    const float radialWeight   = preFactor2*dipoleDotDelta;
-
-    fieldSum[0] = radialWeight*delta.x + delta.w*inducedDipole[0];
-    fieldSum[1] = radialWeight*delta.y + delta.w*inducedDipole[1];
-    fieldSum[2] = radialWeight*delta.z + delta.w*inducedDipole[2];
-}
-
-// Evaluates the four induced-dipole pair terms for (atomI, atomJ) when r2 is within the cutoff; every component of fields[] is zeroed otherwise.
-// fields[k].x/.y/.z/.w (k = 0,1,2 for x,y,z) are built from atomJ.inducedDipole, atomI.inducedDipole, atomJ.inducedDipolePolar and atomI.inducedDipolePolar respectively.
-__device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
-                                                                    float uscale, float4 fields[3] ){
-
-    // compute the real space portion of the Ewald summation
-  
-    float xr          = atomJ.x - atomI.x;
-    float yr          = atomJ.y - atomI.y;
-    float zr          = atomJ.z - atomI.z;
-
-    // periodic boundary conditions
-
-    xr               -= floorf(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-    yr               -= floorf(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-    zr               -= floorf(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-
-    float r2          = xr*xr + yr* yr + zr*zr;
-    if( r2 <= cSim.nonbondedCutoffSqr ){
-        float r           = sqrtf(r2);
-
-        // calculate the error function damping terms
-
-        float ralpha      = cSim.alphaEwald*r;
-
-        float bn0         = erfcf(ralpha)/r;
-        float alsq2       = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
-        float alsq2n      = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
-        float exp2a       = expf(-(ralpha*ralpha));
-        alsq2n           *= alsq2;
-        float bn1         = (bn0+alsq2n*exp2a)/r2;
-
-        alsq2n           *= alsq2;
-        float bn2         = (3.0f*bn1+alsq2n*exp2a)/r2;
-
-        // compute the error function scaled and unscaled terms
-
-        float scale3      = 1.0f;
-        float scale5      = 1.0f;
-        float damp        = atomI.damp*atomJ.damp;
-        if( damp != 0.0f ){   // a zero damp product leaves scale3 = scale5 = 1
-
-            float ratio  = (r/damp);
-                  ratio  = ratio*ratio*ratio;
-            float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;   // smaller of the two thole values
-                  damp   = -pgamma*ratio;
-
-            if( damp > -50.0f) {   // at or below -50 the scales are left at 1 and expf() is skipped
-                float expdamp = expf(damp);
-                scale3        = 1.0f - expdamp;
-                scale5        = 1.0f - expdamp*(1.0f-damp);
-            }
-        }
-        float dsc3        = uscale*scale3;
-        float dsc5        = uscale*scale5;
-
-        float r3          = (r*r2);
-        float r5          = (r3*r2);
-        float rr3         = (1.0f-dsc3)/r3;
-        float rr5         = 3.0f*(1.0f-dsc5)/r5;
-
-        float preFactor1  = rr3 - bn1;
-        float preFactor2  = bn2 - rr5;
-        // .x components: preFactor2*(mu . r)*r + preFactor1*mu with mu = atomJ.inducedDipole
-        float dukr        = atomJ.inducedDipole[0]*xr      + atomJ.inducedDipole[1]*yr      + atomJ.inducedDipole[2]*zr;
-        float preFactor3  = preFactor2*dukr;
-
-        fields[0].x       = preFactor3*xr + preFactor1*atomJ.inducedDipole[0];
-        fields[1].x       = preFactor3*yr + preFactor1*atomJ.inducedDipole[1];
-        fields[2].x       = preFactor3*zr + preFactor1*atomJ.inducedDipole[2];
-
-        // .y components: same expression with mu = atomI.inducedDipole
-        float duir        = atomI.inducedDipole[0]*xr      + atomI.inducedDipole[1]*yr      + atomI.inducedDipole[2]*zr;
-        preFactor3        = preFactor2*duir;
-
-        fields[0].y       = preFactor3*xr + preFactor1*atomI.inducedDipole[0];
-        fields[1].y       = preFactor3*yr + preFactor1*atomI.inducedDipole[1];
-        fields[2].y       = preFactor3*zr + preFactor1*atomI.inducedDipole[2];
-
-        // .z components: same expression with mu = atomJ.inducedDipolePolar
-        float pukr        = atomJ.inducedDipolePolar[0]*xr + atomJ.inducedDipolePolar[1]*yr + atomJ.inducedDipolePolar[2]*zr;
-        preFactor3        = preFactor2*pukr;
-
-        fields[0].z       = preFactor3*xr + preFactor1*atomJ.inducedDipolePolar[0];
-        fields[1].z       = preFactor3*yr + preFactor1*atomJ.inducedDipolePolar[1];
-        fields[2].z       = preFactor3*zr + preFactor1*atomJ.inducedDipolePolar[2];
-
-        // .w components: same expression with mu = atomI.inducedDipolePolar
-        float puir        = atomI.inducedDipolePolar[0]*xr + atomI.inducedDipolePolar[1]*yr + atomI.inducedDipolePolar[2]*zr;
-        preFactor3        = preFactor2*puir;
-        fields[0].w       = preFactor3*xr + preFactor1*atomI.inducedDipolePolar[0];
-        fields[1].w       = preFactor3*yr + preFactor1*atomI.inducedDipolePolar[1];
-        fields[2].w       = preFactor3*zr + preFactor1*atomI.inducedDipolePolar[2];
-
-    } else {
-        // beyond the cutoff: all twelve output components are zero
-        fields[0].x       = 0.0f;
-        fields[0].y       = 0.0f;
-        fields[0].z       = 0.0f;
-        fields[0].w       = 0.0f;
-    
-        fields[1].x       = 0.0f;
-        fields[1].y       = 0.0f;
-        fields[1].z       = 0.0f;
-        fields[1].w       = 0.0f;
-    
-        fields[2].x       = 0.0f;
-        fields[2].y       = 0.0f;
-        fields[2].z       = 0.0f;
-        fields[2].w       = 0.0f;
-    }
-}
-
-// Include the cutoff versions of the kernel: first the default variant, then (with USE_OUTPUT_BUFFER_PER_WARP defined) the ByWarp variant.
-
-#define METHOD_NAME(a, b) a##Cutoff##b
-#include "kCalculateAmoebaCudaPmeMutualInducedField.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##CutoffByWarp##b
-#include "kCalculateAmoebaCudaPmeMutualInducedField.h"
-
-// Scales fixedEField and fixedEFieldPolar in place by polarizability, entry by entry,
-// over the first 3*cSim.atoms entries. The numberOfAtoms argument is not read; the
-// bound comes from cSim.atoms.
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-static void kInitializeMutualInducedField_kernel(
-                   int numberOfAtoms,
-                   float* fixedEField,
-                   float* fixedEFieldPolar,
-                   float* polarizability )
-{
-    // each thread starts at its global index and advances by the total thread count
-    for( int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < 3*cSim.atoms; idx += blockDim.x*gridDim.x ){
-        fixedEField[idx]      *= polarizability[idx];
-        fixedEFieldPolar[idx] *= polarizability[idx];
-    }
-}
-
-/**
-
-   Sums arrayOfDeltas1 and arrayOfDeltas2 over numberOfEntries entries within one
-   block and writes a single value to epsilon[0]:
-
-       epsilon[0] = 48.033324f*sqrtf( max(sum1, sum2)/(numberOfEntries/3) )
-
-   Requires blockDim.x*sizeof(float2) bytes of dynamic shared memory.
-
-   Every block reduces the complete input on its own, so the value written is the
-   same for any grid size. The load loop used to step by blockDim.x*gridDim.x from
-   a block-local start index, which skipped entries whenever gridDim.x > 1; it now
-   steps by blockDim.x (identical for a one-block launch).
-
-   @param numberOfEntries  number of entries read from each input array
-   @param arrayOfDeltas1   first input array
-   @param arrayOfDeltas2   second input array
-   @param epsilon          output; only epsilon[0] is written
-
-*/
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-static void kReduceMutualInducedFieldDelta_kernel(int numberOfEntries, float* arrayOfDeltas1, float* arrayOfDeltas2, float* epsilon )
-{
-    extern __shared__ float2 delta[];
-
-    delta[threadIdx.x].x    = 0.0f;
-    delta[threadIdx.x].y    = 0.0f;
-
-    unsigned int pos = threadIdx.x;
-
-    // load deltas: the start index is block-local, so the stride must be block-local too
-
-    while( pos < numberOfEntries )
-    {   
-        delta[threadIdx.x].x  += arrayOfDeltas1[pos];
-        delta[threadIdx.x].y  += arrayOfDeltas2[pos];
-        pos                   += blockDim.x;
-    }   
-    __syncthreads();
-
-    // sum the deltas: pairwise tree reduction in shared memory, result ends up in delta[0]
-
-    for (int offset = 1; offset < blockDim.x; offset *= 2 )
-    {   
-        if (threadIdx.x + offset < blockDim.x && (threadIdx.x & (2*offset-1)) == 0)
-        {
-            delta[threadIdx.x].x   += delta[threadIdx.x+offset].x;
-            delta[threadIdx.x].y   += delta[threadIdx.x+offset].y;
-        }
-        __syncthreads();
-    }   
-
-    // set epsilons: the larger of the two sums, divided by numberOfEntries/3 (integer division)
-
-    if (threadIdx.x == 0)
-    {   
-        epsilon[0]  = delta[0].x > delta[0].y ? delta[0].x : delta[0].y;
-        epsilon[0]  = 48.033324f*sqrtf( epsilon[0]/( (float) (numberOfEntries/3)) );
-    }   
-}
-
-/**
-
-   One relaxation step for both induced-dipole arrays, over the first 3*cSim.atoms entries.
-
-   For each entry: the matrix product plus a self term (proportional to the current
-   dipole) is scaled by the polarizability and added to the fixed field; the dipole
-   then moves a fraction 0.55 of the way from its old value toward that result.
-
-   matrixProduct/matrixProductP are overwritten with the squared change of the
-   corresponding dipole entry (the "epsilon**2" of the original note).
-
-*/
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-static void kSorUpdateMutualInducedField_kernel(
-                   float* polarizability,
-                   float* inducedDipole, float* inducedDipoleP,
-                   float* fixedEField,   float* fixedEFieldP,
-                   float* matrixProduct, float* matrixProductP )
-{
-    const float selfTerm   = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
-    const float relaxation = 0.55f;
-
-    for( int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < 3*cSim.atoms; idx += blockDim.x*gridDim.x ){
-
-        const float oldDipole   = inducedDipole[idx];
-        const float oldDipoleP  = inducedDipoleP[idx];
-
-        // matrix product with the self term added
-        const float field       = matrixProduct[idx]  + selfTerm*oldDipole;
-        const float fieldP      = matrixProductP[idx] + selfTerm*oldDipoleP;
-
-        const float target      = fixedEField[idx]    + polarizability[idx]*field;
-        const float targetP     = fixedEFieldP[idx]   + polarizability[idx]*fieldP;
-
-        inducedDipole[idx]      = oldDipole   + relaxation*( target   - oldDipole  );
-        inducedDipoleP[idx]     = oldDipoleP  + relaxation*( targetP  - oldDipoleP );
-
-        // squared change, taken from the values just stored
-        const float change      = inducedDipole[idx]  - oldDipole;
-        matrixProduct[idx]      = change*change;
-
-        const float changeP     = inducedDipoleP[idx] - oldDipoleP;
-        matrixProductP[idx]     = changeP*changeP;
-    }
-}
-
-// Launches kReduceFields_kernel twice: amoebaGpu->psWorkArray_3_1 -> outputArray, then
-// amoebaGpu->psWorkArray_3_2 -> outputPolarArray; LAUNCHERROR is invoked after each launch.
-
-static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
-{
-    gpuContext gpu = amoebaGpu->gpuContext;
-    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
-    LAUNCHERROR("kReducePmeMI_Fields1");
-    // second launch: identical configuration and leading arguments, different source/destination pair
-    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData, 0 );
-    LAUNCHERROR("kReducePmeMI_Fields2");
-}
-
-/**---------------------------------------------------------------------------------------
-
-   One pass of the mutual induced field calculation: clears the work fields with
-   kClearFields_3, launches the Cutoff or CutoffByWarp kernel (chosen by
-   gpu->bOutputBufferPerWarp) writing into psWorkArray_3_1/psWorkArray_3_2, then
-   calls kReduceMutualInducedFields to produce the two output arrays.
-
-   @param amoebaGpu        amoebaGpu context
-   @param outputArray      destination passed to kReduceMutualInducedFields
-   @param outputPolarArray polar destination passed to kReduceMutualInducedFields
-
-   --------------------------------------------------------------------------------------- */
-
-static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuContext amoebaGpu,
-                                                                  CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
-{
-
-    // chosen on the first call and reused afterwards; 0 means "not chosen yet"
-    static unsigned int threadsPerBlock  = 0;
-    gpuContext gpu                       = amoebaGpu->gpuContext;
-
-    kClearFields_3( amoebaGpu, 2 );
-
-    if( threadsPerBlock == 0 ){
-        // per-architecture upper limit on the block size
-        unsigned int maxThreads = (gpu->sm_version >= SM_20) ? 384 :
-                                  (gpu->sm_version >= SM_12) ? 128 : 64;
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
-    }
-
-    // one MutualInducedParticle of dynamic shared memory per thread
-    const size_t sharedBytes = sizeof(MutualInducedParticle)*threadsPerBlock;
-
-    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sharedBytes>>>(
-                                                                 gpu->sim.pInteractingWorkUnit,
-                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                                 amoebaGpu->psWorkArray_3_2->_pDevData );
-    } else {
-        kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sharedBytes>>>(
-                                                                 gpu->sim.pInteractingWorkUnit,
-                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                                 amoebaGpu->psWorkArray_3_2->_pDevData );
-    }
-    LAUNCHERROR("kCalculateAmoebaPmeMutualInducedField");
-
-    kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray );
-
-}
-
-/**---------------------------------------------------------------------------------------
-
-   Compute the PME mutual induced field by successive over-relaxation (SOR).
-
-   On return amoebaGpu->mutualInducedDone is 1 and
-   amoebaGpu->mutualInducedConverged is 1 only if the last computed epsilon was
-   below amoebaGpu->mutualInducedTargetEpsilon (or the early-return branch on
-   amoebaSim.polarizationType was taken).
-
-   @param amoebaGpu        amoebaGpu context
-
-   @throws OpenMM::OpenMMException if a device-to-device copy fails or the
-                                   current epsilon is NaN
-
-   --------------------------------------------------------------------------------------- */
-
-static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu )
-{
-
-   // ---------------------------------------------------------------------------------------
-
-    int done;
-    int converged;
-    int iteration;
-
-    gpuContext gpu    = amoebaGpu->gpuContext;
-
-    // ---------------------------------------------------------------------------------------
-
-    // set [ E_Field & E_FieldPolar ] to [ E_Field & E_FieldPolar ]*Polarizability
-    // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar ]*Polarizability
-
-    kInitializeMutualInducedField_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block >>>(
-         gpu->natoms,
-         amoebaGpu->psE_Field->_pDevData,
-         amoebaGpu->psE_FieldPolar->_pDevData,
-         amoebaGpu->psPolarizability->_pDevData );
-    LAUNCHERROR("AmoebaPmeMutualInducedFieldSetup");
-
-    // seed the induced dipoles from the scaled fields; a failed copy would leave
-    // the dipoles undefined, so report it instead of ignoring the return code
-
-    if( cudaMemcpy( amoebaGpu->psInducedDipole->_pDevData,        amoebaGpu->psE_Field->_pDevData,       3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice ) != cudaSuccess ){
-        throw OpenMM::OpenMMException("PME induced dipole calculation: copy of E_Field to InducedDipole failed." );
-    }
-    if( cudaMemcpy( amoebaGpu->psInducedDipolePolar->_pDevData,   amoebaGpu->psE_FieldPolar->_pDevData,  3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice ) != cudaSuccess ){
-        throw OpenMM::OpenMMException("PME induced dipole calculation: copy of E_FieldPolar to InducedDipolePolar failed." );
-    }
-
-    // if polarization type is direct, set flags signalling done and return
-
-    if( amoebaGpu->amoebaSim.polarizationType )
-    {
-        amoebaGpu->mutualInducedDone          = 1;
-        amoebaGpu->mutualInducedConverged     = 1;
-        kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
-        return;
-    }
-
-    // ---------------------------------------------------------------------------------------
-
-    done      = 0;
-    converged = 0;
-    iteration = 1;
-
-    while( !done ){
-
-        //  apply SOR
-
-        cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0],  amoebaGpu->psWorkVector[1] );
-        kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
-
-        // post matrix multiply
-
-        kSorUpdateMutualInducedField_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block >>>(
-           amoebaGpu->psPolarizability->_pDevData,
-           amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
-           amoebaGpu->psE_Field->_pDevData,       amoebaGpu->psE_FieldPolar->_pDevData,
-           amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData );
-        LAUNCHERROR("kSorUpdatePmeMutualInducedField");
-
-        // get total epsilon -- performing sums on gpu
-
-        kReduceMutualInducedFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 2*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>(
-           3*gpu->natoms, amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData,
-           amoebaGpu->psCurrentEpsilon->_pDevData );
-        LAUNCHERROR("kReducePmeMutualInducedFieldDelta");
-
-        // Debye=48.033324f
-        amoebaGpu->psCurrentEpsilon->Download();
-        float currentEpsilon                     = amoebaGpu->psCurrentEpsilon->_pSysData[0];
-        amoebaGpu->mutualInducedCurrentEpsilon   = currentEpsilon;
-
-        // throw exception if nan detected (x != x holds only for NaN)
-
-        if( amoebaGpu->mutualInducedCurrentEpsilon != amoebaGpu->mutualInducedCurrentEpsilon ){
-            throw OpenMM::OpenMMException("PME induced dipole calculation detected nans." );
-        }
-
-        // Record convergence from the epsilon test itself. Deriving it after the
-        // loop from the already-incremented iteration counter misreports a solve
-        // that converges on the final permitted pass as not converged.
-
-        converged = ( amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ) ? 1 : 0;
-
-        // stop on convergence or once the pass counter exceeds the maximum
-        // (the counter starts at 1 and is tested after the pass has run)
-
-        if( converged || iteration > amoebaGpu->mutualInducedMaxIterations ){
-            done = 1;
-        }
-
-        iteration++;
-    }
-
-    amoebaGpu->mutualInducedDone             = done;
-    amoebaGpu->mutualInducedConverged        = converged;
-
-}
-
-void cudaComputeAmoebaPmeMutualInducedField( amoebaGpuContext amoebaGpu )
-{
-    // Only iterative method 0 is dispatched (to the SOR solver); any other
-    // value of mutualInducedIterativeMethod makes this call a no-op.
-
-    if( amoebaGpu->mutualInducedIterativeMethod != 0 ){
-        return;
-    }
-
-    cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpu );
-}