Merge branch 'master' into nucleic

fd473eea · Peter Eastman · 0a751b5b · 6a985cfd · fd473eea · fd473eea
Commit fd473eea authored Oct 29, 2015 by Peter Eastman
2 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
 #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)

 typedef struct {
-    real4 posq;
-    real3 force, dipole, inducedDipole, inducedDipolePolar;
+    real3 pos, force, torque, inducedDipole, inducedDipolePolar, sphericalDipole;
+    real q;
+    float thole, damp;
 #ifdef INCLUDE_QUADRUPOLES
-    real quadrupoleXX, quadrupoleXY, quadrupoleXZ;
-    real quadrupoleYY, quadrupoleYZ;
+    real sphericalQuadrupole[5];
 #endif
-    float thole, damp;
 } AtomData;

-__device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real& energy, real3& outputForce);
-__device__ void computeOneInteractionT1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce);
-__device__ void computeOneInteractionT3(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce);
-
-inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ labFrameDipole,
-        const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
-    data.posq = posq[atom];
-    data.dipole.x = labFrameDipole[atom*3];
-    data.dipole.y = labFrameDipole[atom*3+1];
-    data.dipole.z = labFrameDipole[atom*3+2];
+inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ sphericalDipole,
+        const real* __restrict__ sphericalQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
+    real4 atomPosq = posq[atom];
+    data.pos = make_real3(atomPosq.x, atomPosq.y, atomPosq.z);
+    data.q = atomPosq.w;
+    data.sphericalDipole.x = sphericalDipole[atom*3];
+    data.sphericalDipole.y = sphericalDipole[atom*3+1];
+    data.sphericalDipole.z = sphericalDipole[atom*3+2];
 #ifdef INCLUDE_QUADRUPOLES
-    data.quadrupoleXX = labFrameQuadrupole[atom*5];
-    data.quadrupoleXY = labFrameQuadrupole[atom*5+1];
-    data.quadrupoleXZ = labFrameQuadrupole[atom*5+2];
-    data.quadrupoleYY = labFrameQuadrupole[atom*5+3];
-    data.quadrupoleYZ = labFrameQuadrupole[atom*5+4];
+    data.sphericalQuadrupole[0] = sphericalQuadrupole[atom*5];
+    data.sphericalQuadrupole[1] = sphericalQuadrupole[atom*5+1];
+    data.sphericalQuadrupole[2] = sphericalQuadrupole[atom*5+2];
+    data.sphericalQuadrupole[3] = sphericalQuadrupole[atom*5+3];
+    data.sphericalQuadrupole[4] = sphericalQuadrupole[atom*5+4];
 #endif
    data.inducedDipole.x = inducedDipole[atom*3];
    data.inducedDipole.y = inducedDipole[atom*3+1];
@@ -57,11 +54,327 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
    return (x && y ? 0.0f : (x && p ? 0.5f : 1.0f));
 }

+/**
+ * Compute the multipole electrostatic interaction between a single pair of atoms and
+ * accumulate the resulting energy, forces and torques.
+ *
+ * The displacement is taken from atom1 to atom2.  The spherical dipoles/quadrupoles and the
+ * induced dipoles of both atoms are rotated into the quasi-internal (QI) frame built by
+ * buildQIRotationMatrix(), the interaction is evaluated there, and the resulting force and
+ * torques are rotated back before being accumulated.
+ *
+ * @param atom1          first atom; its force and torque fields are always accumulated into
+ * @param atom2          second atom; its force and torque fields are accumulated into only
+ *                       when forceFactor == 1
+ * @param hasExclusions  not read by this function
+ * @param dScale         scale factor applied to the terms involving inducedDipolePolar
+ * @param pScale         scale factor applied to the terms involving inducedDipole
+ * @param mScale         scale factor applied to the permanent multipole terms
+ * @param forceFactor    multiplier applied to the energy contribution; also selects whether
+ *                       atom2 is updated (see above)
+ * @param energy         accumulator the pair energy is added to
+ *
+ * The force and torque accumulated here are not multiplied by forceFactor or by any unit
+ * conversion factor.
+ */
+__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool hasExclusions, float dScale, float pScale, float mScale, float forceFactor, mixed& energy) {
+    // Compute the displacement.
+
+    real3 delta;
+    delta.x = atom2.pos.x - atom1.pos.x;
+    delta.y = atom2.pos.y - atom1.pos.y;
+    delta.z = atom2.pos.z - atom1.pos.z;
+    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+    real rInv = RSQRT(r2);
+    real r = r2*rInv;
+
+    // Rotate the various dipoles and quadrupoles.
+
+    real qiRotationMatrix[3][3];
+    buildQIRotationMatrix(delta, rInv, qiRotationMatrix);
+
+    // Induced dipoles rotated into the QI frame.  Each one is scaled by 0.5 here.
+    real3 qiUindI = 0.5f*make_real3(qiRotationMatrix[0][1]*atom1.inducedDipole.x + qiRotationMatrix[0][2]*atom1.inducedDipole.y + qiRotationMatrix[0][0]*atom1.inducedDipole.z,
+                                    qiRotationMatrix[1][1]*atom1.inducedDipole.x + qiRotationMatrix[1][2]*atom1.inducedDipole.y + qiRotationMatrix[1][0]*atom1.inducedDipole.z,
+                                    qiRotationMatrix[2][1]*atom1.inducedDipole.x + qiRotationMatrix[2][2]*atom1.inducedDipole.y + qiRotationMatrix[2][0]*atom1.inducedDipole.z);
+    real3 qiUindJ = 0.5f*make_real3(qiRotationMatrix[0][1]*atom2.inducedDipole.x + qiRotationMatrix[0][2]*atom2.inducedDipole.y + qiRotationMatrix[0][0]*atom2.inducedDipole.z,
+                                    qiRotationMatrix[1][1]*atom2.inducedDipole.x + qiRotationMatrix[1][2]*atom2.inducedDipole.y + qiRotationMatrix[1][0]*atom2.inducedDipole.z,
+                                    qiRotationMatrix[2][1]*atom2.inducedDipole.x + qiRotationMatrix[2][2]*atom2.inducedDipole.y + qiRotationMatrix[2][0]*atom2.inducedDipole.z);
+    real3 qiUinpI = 0.5f*make_real3(qiRotationMatrix[0][1]*atom1.inducedDipolePolar.x + qiRotationMatrix[0][2]*atom1.inducedDipolePolar.y + qiRotationMatrix[0][0]*atom1.inducedDipolePolar.z,
+                                    qiRotationMatrix[1][1]*atom1.inducedDipolePolar.x + qiRotationMatrix[1][2]*atom1.inducedDipolePolar.y + qiRotationMatrix[1][0]*atom1.inducedDipolePolar.z,
+                                    qiRotationMatrix[2][1]*atom1.inducedDipolePolar.x + qiRotationMatrix[2][2]*atom1.inducedDipolePolar.y + qiRotationMatrix[2][0]*atom1.inducedDipolePolar.z);
+    real3 qiUinpJ = 0.5f*make_real3(qiRotationMatrix[0][1]*atom2.inducedDipolePolar.x + qiRotationMatrix[0][2]*atom2.inducedDipolePolar.y + qiRotationMatrix[0][0]*atom2.inducedDipolePolar.z,
+                                    qiRotationMatrix[1][1]*atom2.inducedDipolePolar.x + qiRotationMatrix[1][2]*atom2.inducedDipolePolar.y + qiRotationMatrix[1][0]*atom2.inducedDipolePolar.z,
+                                    qiRotationMatrix[2][1]*atom2.inducedDipolePolar.x + qiRotationMatrix[2][2]*atom2.inducedDipolePolar.y + qiRotationMatrix[2][0]*atom2.inducedDipolePolar.z);
+
+    real3 rotatedDipole1 = rotateDipole(atom1.sphericalDipole, qiRotationMatrix);
+    real3 rotatedDipole2 = rotateDipole(atom2.sphericalDipole, qiRotationMatrix);
+    // The rotated quadrupoles stay zero unless INCLUDE_QUADRUPOLES is defined, so the
+    // quadrupole terms below then contribute nothing.
+    real rotatedQuadrupole1[] = {0, 0, 0, 0, 0};
+    real rotatedQuadrupole2[] = {0, 0, 0, 0, 0};
+#ifdef INCLUDE_QUADRUPOLES
+    rotateQuadupoles(qiRotationMatrix, atom1.sphericalQuadrupole, atom2.sphericalQuadrupole, rotatedQuadrupole1, rotatedQuadrupole2);
+#endif
+
+    // The field derivatives at I due to permanent and induced moments on J, and vice-versa.
+    // Also, their derivatives w.r.t. R, which are needed for force calculations
+    real Vij[9], Vji[9], VjiR[9], VijR[9];
+    // The field derivatives at I due to only permanent moments on J, and vice-versa.
+    real Vijp[3], Vijd[3], Vjip[3], Vjid[3];
+    real rInvVec[7];
+
+    // The rInvVec array is defined such that the ith element is R^-i, with the
+    // dielectric constant folded in, to avoid conversions later.
+    // Element 0 is never written or read.
+    rInvVec[1] = rInv;
+    for (int i = 2; i < 7; ++i)
+        rInvVec[i] = rInvVec[i-1] * rInv;
+
+    real dmp = atom1.damp*atom2.damp;
+    real a = min(atom1.thole, atom2.thole);
+    // A (near-)zero damping product selects a huge u, which makes expau3 zero below.
+    real u = fabs(dmp) > 1.0e-5f ? r/dmp : 1e10f;
+    real au3 = a*u*u*u;
+    real expau3 = au3 < 50 ? EXP(-au3) : 0;
+    real a2u6 = au3*au3;
+    real a3u9 = a2u6*au3;
+    // Thole damping factors for energies
+    real thole_c  = 1 - expau3;
+    real thole_d0 = 1 - expau3*(1 + 1.5f*au3);
+    real thole_d1 = 1 - expau3;
+    real thole_q0 = 1 - expau3*(1 + au3 + a2u6);
+    real thole_q1 = 1 - expau3*(1 + au3);
+    // Thole damping factors for derivatives
+    real dthole_c  = 1 - expau3*(1 + 1.5f*au3);
+    real dthole_d0 = 1 - expau3*(1 + au3 + 1.5f*a2u6);
+    real dthole_d1 = 1 - expau3*(1 + au3);
+    real dthole_q0 = 1 - expau3*(1 + au3 + 0.25f*a2u6 + 0.75f*a3u9);
+    real dthole_q1 = 1 - expau3*(1 + au3 + 0.75f*a2u6);
+
+    // Now we compute the (attenuated) Coulomb operator and its derivatives, contracted with
+    // permanent moments and induced dipoles.  Note that the coefficient of the permanent force
+    // terms is half of the expected value; this is because we compute the interaction of I with
+    // the sum of induced and permanent moments on J, as well as the interaction of J with I's
+    // permanent and induced moments; doing so double counts the permanent-permanent interaction.
+    real ePermCoef, dPermCoef, eUIndCoef, dUIndCoef, eUInpCoef, dUInpCoef;
+
+    // C-C terms (m=0)
+    ePermCoef = rInvVec[1]*mScale;
+    dPermCoef = -0.5f*mScale*rInvVec[2];
+    Vij[0]  = ePermCoef*atom2.q;
+    Vji[0]  = ePermCoef*atom1.q;
+    VijR[0] = dPermCoef*atom2.q;
+    VjiR[0] = dPermCoef*atom1.q;
+
+    // C-D and C-Uind terms (m=0)
+    ePermCoef = rInvVec[2]*mScale;
+    eUIndCoef = rInvVec[2]*pScale*thole_c;
+    eUInpCoef = rInvVec[2]*dScale*thole_c;
+    dPermCoef = -rInvVec[3]*mScale;
+    dUIndCoef = -2*rInvVec[3]*pScale*dthole_c;
+    dUInpCoef = -2*rInvVec[3]*dScale*dthole_c;
+    Vij[0]  += -(ePermCoef*rotatedDipole2.x + eUIndCoef*qiUindJ.x + eUInpCoef*qiUinpJ.x);
+    Vji[1]   = -(ePermCoef*atom1.q);
+    VijR[0] += -(dPermCoef*rotatedDipole2.x + dUIndCoef*qiUindJ.x + dUInpCoef*qiUinpJ.x);
+    VjiR[1]  = -(dPermCoef*atom1.q);
+    Vjip[0]  = -(eUInpCoef*atom1.q);
+    Vjid[0]  = -(eUIndCoef*atom1.q);
+    // D-C and Uind-C terms (m=0)
+    Vij[1]   = ePermCoef*atom2.q;
+    Vji[0]  += ePermCoef*rotatedDipole1.x + eUIndCoef*qiUindI.x + eUInpCoef*qiUinpI.x;
+    VijR[1]  = dPermCoef*atom2.q;
+    VjiR[0] += dPermCoef*rotatedDipole1.x + dUIndCoef*qiUindI.x + dUInpCoef*qiUinpI.x;
+    Vijp[0]  = eUInpCoef*atom2.q;
+    Vijd[0]  = eUIndCoef*atom2.q;
+
+    // D-D and D-Uind terms (m=0)
+    ePermCoef = -2*rInvVec[3]*mScale;
+    eUIndCoef = -2*rInvVec[3]*pScale*thole_d0;
+    eUInpCoef = -2*rInvVec[3]*dScale*thole_d0;
+    dPermCoef = 3*rInvVec[4]*mScale;
+    dUIndCoef = 6*rInvVec[4]*pScale*dthole_d0;
+    dUInpCoef = 6*rInvVec[4]*dScale*dthole_d0;
+    Vij[1]  += ePermCoef*rotatedDipole2.x + eUIndCoef*qiUindJ.x + eUInpCoef*qiUinpJ.x;
+    Vji[1]  += ePermCoef*rotatedDipole1.x + eUIndCoef*qiUindI.x + eUInpCoef*qiUinpI.x;
+    VijR[1] += dPermCoef*rotatedDipole2.x + dUIndCoef*qiUindJ.x + dUInpCoef*qiUinpJ.x;
+    VjiR[1] += dPermCoef*rotatedDipole1.x + dUIndCoef*qiUindI.x + dUInpCoef*qiUinpI.x;
+    Vijp[0] += eUInpCoef*rotatedDipole2.x;
+    Vijd[0] += eUIndCoef*rotatedDipole2.x;
+    Vjip[0] += eUInpCoef*rotatedDipole1.x;
+    Vjid[0] += eUIndCoef*rotatedDipole1.x;
+    // D-D and D-Uind terms (m=1)
+    ePermCoef = rInvVec[3]*mScale;
+    eUIndCoef = rInvVec[3]*pScale*thole_d1;
+    eUInpCoef = rInvVec[3]*dScale*thole_d1;
+    dPermCoef = -1.5f*rInvVec[4]*mScale;
+    dUIndCoef = -3*rInvVec[4]*pScale*dthole_d1;
+    dUInpCoef = -3*rInvVec[4]*dScale*dthole_d1;
+    Vij[2]  = ePermCoef*rotatedDipole2.y + eUIndCoef*qiUindJ.y + eUInpCoef*qiUinpJ.y;
+    Vji[2]  = ePermCoef*rotatedDipole1.y + eUIndCoef*qiUindI.y + eUInpCoef*qiUinpI.y;
+    VijR[2] = dPermCoef*rotatedDipole2.y + dUIndCoef*qiUindJ.y + dUInpCoef*qiUinpJ.y;
+    VjiR[2] = dPermCoef*rotatedDipole1.y + dUIndCoef*qiUindI.y + dUInpCoef*qiUinpI.y;
+    Vij[3]  = ePermCoef*rotatedDipole2.z + eUIndCoef*qiUindJ.z + eUInpCoef*qiUinpJ.z;
+    Vji[3]  = ePermCoef*rotatedDipole1.z + eUIndCoef*qiUindI.z + eUInpCoef*qiUinpI.z;
+    VijR[3] = dPermCoef*rotatedDipole2.z + dUIndCoef*qiUindJ.z + dUInpCoef*qiUinpJ.z;
+    VjiR[3] = dPermCoef*rotatedDipole1.z + dUIndCoef*qiUindI.z + dUInpCoef*qiUinpI.z;
+    Vijp[1] = eUInpCoef*rotatedDipole2.y;
+    Vijd[1] = eUIndCoef*rotatedDipole2.y;
+    Vjip[1] = eUInpCoef*rotatedDipole1.y;
+    Vjid[1] = eUIndCoef*rotatedDipole1.y;
+    Vijp[2] = eUInpCoef*rotatedDipole2.z;
+    Vijd[2] = eUIndCoef*rotatedDipole2.z;
+    Vjip[2] = eUInpCoef*rotatedDipole1.z;
+    Vjid[2] = eUIndCoef*rotatedDipole1.z;
+
+    // C-Q terms (m=0)
+    ePermCoef = mScale*rInvVec[3];
+    dPermCoef = -1.5f*rInvVec[4]*mScale;
+    Vij[0]  += ePermCoef*rotatedQuadrupole2[0];
+    Vji[4]   = ePermCoef*atom1.q;
+    VijR[0] += dPermCoef*rotatedQuadrupole2[0];
+    VjiR[4]  = dPermCoef*atom1.q;
+    // Q-C terms (m=0)
+    Vij[4]   = ePermCoef*atom2.q;
+    Vji[0]  += ePermCoef*rotatedQuadrupole1[0];
+    VijR[4]  = dPermCoef*atom2.q;
+    VjiR[0] += dPermCoef*rotatedQuadrupole1[0];
+
+    // D-Q and Uind-Q terms (m=0)
+    // Integer literal 3 (not 3.0) so the products are not promoted to double when real is float.
+    ePermCoef = rInvVec[4]*3*mScale;
+    eUIndCoef = rInvVec[4]*3*pScale*thole_q0;
+    eUInpCoef = rInvVec[4]*3*dScale*thole_q0;
+    dPermCoef = -6*rInvVec[5]*mScale;
+    dUIndCoef = -12*rInvVec[5]*pScale*dthole_q0;
+    dUInpCoef = -12*rInvVec[5]*dScale*dthole_q0;
+    Vij[1]  += ePermCoef*rotatedQuadrupole2[0];
+    Vji[4]  += ePermCoef*rotatedDipole1.x + eUIndCoef*qiUindI.x + eUInpCoef*qiUinpI.x;
+    VijR[1] += dPermCoef*rotatedQuadrupole2[0];
+    VjiR[4] += dPermCoef*rotatedDipole1.x + dUIndCoef*qiUindI.x + dUInpCoef*qiUinpI.x;
+    Vijp[0] += eUInpCoef*rotatedQuadrupole2[0];
+    Vijd[0] += eUIndCoef*rotatedQuadrupole2[0];
+    // Q-D and Q-Uind terms (m=0)
+    Vij[4]  += -(ePermCoef*rotatedDipole2.x + eUIndCoef*qiUindJ.x + eUInpCoef*qiUinpJ.x);
+    Vji[1]  += -(ePermCoef*rotatedQuadrupole1[0]);
+    VijR[4] += -(dPermCoef*rotatedDipole2.x + dUIndCoef*qiUindJ.x + dUInpCoef*qiUinpJ.x);
+    VjiR[1] += -(dPermCoef*rotatedQuadrupole1[0]);
+    Vjip[0] += -(eUInpCoef*rotatedQuadrupole1[0]);
+    Vjid[0] += -(eUIndCoef*rotatedQuadrupole1[0]);
+
+    // D-Q and Uind-Q terms (m=1)
+    const real sqrtThree = SQRT((real) 3);
+    ePermCoef = -sqrtThree*rInvVec[4]*mScale;
+    eUIndCoef = -sqrtThree*rInvVec[4]*pScale*thole_q1;
+    eUInpCoef = -sqrtThree*rInvVec[4]*dScale*thole_q1;
+    dPermCoef = 2*sqrtThree*rInvVec[5]*mScale;
+    dUIndCoef = 4*sqrtThree*rInvVec[5]*pScale*dthole_q1;
+    dUInpCoef = 4*sqrtThree*rInvVec[5]*dScale*dthole_q1;
+    Vij[2]  += ePermCoef*rotatedQuadrupole2[1];
+    Vji[5]   = ePermCoef*rotatedDipole1.y + eUIndCoef*qiUindI.y + eUInpCoef*qiUinpI.y;
+    VijR[2] += dPermCoef*rotatedQuadrupole2[1];
+    VjiR[5]  = dPermCoef*rotatedDipole1.y + dUIndCoef*qiUindI.y + dUInpCoef*qiUinpI.y;
+    Vij[3]  += ePermCoef*rotatedQuadrupole2[2];
+    Vji[6]   = ePermCoef*rotatedDipole1.z + eUIndCoef*qiUindI.z + eUInpCoef*qiUinpI.z;
+    VijR[3] += dPermCoef*rotatedQuadrupole2[2];
+    VjiR[6]  = dPermCoef*rotatedDipole1.z + dUIndCoef*qiUindI.z + dUInpCoef*qiUinpI.z;
+    Vijp[1] += eUInpCoef*rotatedQuadrupole2[1];
+    Vijd[1] += eUIndCoef*rotatedQuadrupole2[1];
+    Vijp[2] += eUInpCoef*rotatedQuadrupole2[2];
+    Vijd[2] += eUIndCoef*rotatedQuadrupole2[2];
+    // Q-D and Q-Uind terms (m=1)
+    Vij[5]   = -(ePermCoef*rotatedDipole2.y + eUIndCoef*qiUindJ.y + eUInpCoef*qiUinpJ.y);
+    Vji[2]  += -(ePermCoef*rotatedQuadrupole1[1]);
+    VijR[5]  = -(dPermCoef*rotatedDipole2.y + dUIndCoef*qiUindJ.y + dUInpCoef*qiUinpJ.y);
+    VjiR[2] += -(dPermCoef*rotatedQuadrupole1[1]);
+    Vij[6]   = -(ePermCoef*rotatedDipole2.z + eUIndCoef*qiUindJ.z + eUInpCoef*qiUinpJ.z);
+    Vji[3]  += -(ePermCoef*rotatedQuadrupole1[2]);
+    VijR[6]  = -(dPermCoef*rotatedDipole2.z + dUIndCoef*qiUindJ.z + dUInpCoef*qiUinpJ.z);
+    VjiR[3] += -(dPermCoef*rotatedQuadrupole1[2]);
+    Vjip[1] += -(eUInpCoef*rotatedQuadrupole1[1]);
+    Vjid[1] += -(eUIndCoef*rotatedQuadrupole1[1]);
+    Vjip[2] += -(eUInpCoef*rotatedQuadrupole1[2]);
+    Vjid[2] += -(eUIndCoef*rotatedQuadrupole1[2]);
+
+    // Q-Q terms (m=0)
+    ePermCoef = 6*rInvVec[5]*mScale;
+    dPermCoef = -15*rInvVec[6]*mScale;
+    Vij[4]  += ePermCoef*rotatedQuadrupole2[0];
+    Vji[4]  += ePermCoef*rotatedQuadrupole1[0];
+    VijR[4] += dPermCoef*rotatedQuadrupole2[0];
+    VjiR[4] += dPermCoef*rotatedQuadrupole1[0];
+    // Q-Q terms (m=1)
+    ePermCoef = -4*rInvVec[5]*mScale;
+    dPermCoef = 10*rInvVec[6]*mScale;
+    Vij[5]  += ePermCoef*rotatedQuadrupole2[1];
+    Vji[5]  += ePermCoef*rotatedQuadrupole1[1];
+    VijR[5] += dPermCoef*rotatedQuadrupole2[1];
+    VjiR[5] += dPermCoef*rotatedQuadrupole1[1];
+    Vij[6]  += ePermCoef*rotatedQuadrupole2[2];
+    Vji[6]  += ePermCoef*rotatedQuadrupole1[2];
+    VijR[6] += dPermCoef*rotatedQuadrupole2[2];
+    VjiR[6] += dPermCoef*rotatedQuadrupole1[2];
+    // Q-Q terms (m=2)
+    ePermCoef = rInvVec[5]*mScale;
+    dPermCoef = -2.5f*rInvVec[6]*mScale;
+    Vij[7]  = ePermCoef*rotatedQuadrupole2[3];
+    Vji[7]  = ePermCoef*rotatedQuadrupole1[3];
+    VijR[7] = dPermCoef*rotatedQuadrupole2[3];
+    VjiR[7] = dPermCoef*rotatedQuadrupole1[3];
+    Vij[8]  = ePermCoef*rotatedQuadrupole2[4];
+    Vji[8]  = ePermCoef*rotatedQuadrupole1[4];
+    VijR[8] = dPermCoef*rotatedQuadrupole2[4];
+    VjiR[8] = dPermCoef*rotatedQuadrupole1[4];
+
+    // Evaluate the energies, forces and torques due to permanent+induced moments
+    // interacting with just the permanent moments.
+    energy += forceFactor*0.5f*(
+        atom1.q*Vij[0] + rotatedDipole1.x*Vij[1] + rotatedDipole1.y*Vij[2] + rotatedDipole1.z*Vij[3] + rotatedQuadrupole1[0]*Vij[4] + rotatedQuadrupole1[1]*Vij[5] + rotatedQuadrupole1[2]*Vij[6] + rotatedQuadrupole1[3]*Vij[7] + rotatedQuadrupole1[4]*Vij[8] +
+        atom2.q*Vji[0] + rotatedDipole2.x*Vji[1] + rotatedDipole2.y*Vji[2] + rotatedDipole2.z*Vji[3] + rotatedQuadrupole2[0]*Vji[4] + rotatedQuadrupole2[1]*Vji[5] + rotatedQuadrupole2[2]*Vji[6] + rotatedQuadrupole2[3]*Vji[7] + rotatedQuadrupole2[4]*Vji[8]);
+    real fIZ = atom1.q*VijR[0] + rotatedDipole1.x*VijR[1] + rotatedDipole1.y*VijR[2] + rotatedDipole1.z*VijR[3] + rotatedQuadrupole1[0]*VijR[4] + rotatedQuadrupole1[1]*VijR[5] + rotatedQuadrupole1[2]*VijR[6] + rotatedQuadrupole1[3]*VijR[7] + rotatedQuadrupole1[4]*VijR[8];
+    real fJZ = atom2.q*VjiR[0] + rotatedDipole2.x*VjiR[1] + rotatedDipole2.y*VjiR[2] + rotatedDipole2.z*VjiR[3] + rotatedQuadrupole2[0]*VjiR[4] + rotatedQuadrupole2[1]*VjiR[5] + rotatedQuadrupole2[2]*VjiR[6] + rotatedQuadrupole2[3]*VjiR[7] + rotatedQuadrupole2[4]*VjiR[8];
+    real EIX = rotatedDipole1.z*Vij[1] - rotatedDipole1.x*Vij[3] + sqrtThree*rotatedQuadrupole1[2]*Vij[4] + rotatedQuadrupole1[4]*Vij[5] - (sqrtThree*rotatedQuadrupole1[0]+rotatedQuadrupole1[3])*Vij[6] + rotatedQuadrupole1[2]*Vij[7] - rotatedQuadrupole1[1]*Vij[8];
+    real EIY = -rotatedDipole1.y*Vij[1] + rotatedDipole1.x*Vij[2] - sqrtThree*rotatedQuadrupole1[1]*Vij[4] + (sqrtThree*rotatedQuadrupole1[0]-rotatedQuadrupole1[3])*Vij[5] - rotatedQuadrupole1[4]*Vij[6] + rotatedQuadrupole1[1]*Vij[7] + rotatedQuadrupole1[2]*Vij[8];
+    real EIZ = -rotatedDipole1.z*Vij[2] + rotatedDipole1.y*Vij[3] - rotatedQuadrupole1[2]*Vij[5] + rotatedQuadrupole1[1]*Vij[6] - 2*rotatedQuadrupole1[4]*Vij[7] + 2*rotatedQuadrupole1[3]*Vij[8];
+    real EJX = rotatedDipole2.z*Vji[1] - rotatedDipole2.x*Vji[3] + sqrtThree*rotatedQuadrupole2[2]*Vji[4] + rotatedQuadrupole2[4]*Vji[5] - (sqrtThree*rotatedQuadrupole2[0]+rotatedQuadrupole2[3])*Vji[6] + rotatedQuadrupole2[2]*Vji[7] - rotatedQuadrupole2[1]*Vji[8];
+    real EJY = -rotatedDipole2.y*Vji[1] + rotatedDipole2.x*Vji[2] - sqrtThree*rotatedQuadrupole2[1]*Vji[4] + (sqrtThree*rotatedQuadrupole2[0]-rotatedQuadrupole2[3])*Vji[5] - rotatedQuadrupole2[4]*Vji[6] + rotatedQuadrupole2[1]*Vji[7] + rotatedQuadrupole2[2]*Vji[8];
+    real EJZ = -rotatedDipole2.z*Vji[2] + rotatedDipole2.y*Vji[3] - rotatedQuadrupole2[2]*Vji[5] + rotatedQuadrupole2[1]*Vji[6] - 2*rotatedQuadrupole2[4]*Vji[7] + 2*rotatedQuadrupole2[3]*Vji[8];
+
+    // Define the torque intermediates for the induced dipoles. These are simply the induced dipole torque
+    // intermediates dotted with the field due to permanent moments only, at each center. We inline the
+    // induced dipole torque intermediates here, for simplicity. N.B. There are no torques on the dipoles
+    // themselves, so we accumulate the torque intermediates into separate variables to allow them to be
+    // used only in the force calculation.
+    //
+    // The torque about the x axis (needed to obtain the y force on the induced dipoles, below)
+    //    qiUindIx[0] = qiQUindI[2];    qiUindIx[1] = 0;    qiUindIx[2] = -qiQUindI[0]
+    real iEIX = qiUinpI.z*Vijp[0] + qiUindI.z*Vijd[0] - qiUinpI.x*Vijp[2] - qiUindI.x*Vijd[2];
+    real iEJX = qiUinpJ.z*Vjip[0] + qiUindJ.z*Vjid[0] - qiUinpJ.x*Vjip[2] - qiUindJ.x*Vjid[2];
+    // The torque about the y axis (needed to obtain the x force on the induced dipoles, below)
+    //    qiUindIy[0] = -qiQUindI[1];   qiUindIy[1] = qiQUindI[0];    qiUindIy[2] = 0
+    real iEIY = qiUinpI.x*Vijp[1] + qiUindI.x*Vijd[1] - qiUinpI.y*Vijp[0] - qiUindI.y*Vijd[0];
+    real iEJY = qiUinpJ.x*Vjip[1] + qiUindJ.x*Vjid[1] - qiUinpJ.y*Vjip[0] - qiUindJ.y*Vjid[0];
+
+#ifdef USE_MUTUAL_POLARIZATION
+    // Uind-Uind terms (m=0)
+    real eCoef = -4*rInvVec[3]*thole_d0;
+    real dCoef = 6*rInvVec[4]*dthole_d0;
+    iEIX += eCoef*(qiUinpI.z*qiUindJ.x + qiUindI.z*qiUinpJ.x);
+    iEJX += eCoef*(qiUinpJ.z*qiUindI.x + qiUindJ.z*qiUinpI.x);
+    iEIY -= eCoef*(qiUinpI.y*qiUindJ.x + qiUindI.y*qiUinpJ.x);
+    iEJY -= eCoef*(qiUinpJ.y*qiUindI.x + qiUindJ.y*qiUinpI.x);
+    // Both contributions go into fIZ; only the sum fIZ+fJZ is used below.
+    fIZ += dCoef*(qiUinpI.x*qiUindJ.x + qiUindI.x*qiUinpJ.x);
+    fIZ += dCoef*(qiUinpJ.x*qiUindI.x + qiUindJ.x*qiUinpI.x);
+    // Uind-Uind terms (m=1)
+    eCoef = 2*rInvVec[3]*thole_d1;
+    dCoef = -3*rInvVec[4]*dthole_d1;
+    iEIX -= eCoef*(qiUinpI.x*qiUindJ.z + qiUindI.x*qiUinpJ.z);
+    iEJX -= eCoef*(qiUinpJ.x*qiUindI.z + qiUindJ.x*qiUinpI.z);
+    iEIY += eCoef*(qiUinpI.x*qiUindJ.y + qiUindI.x*qiUinpJ.y);
+    iEJY += eCoef*(qiUinpJ.x*qiUindI.y + qiUindJ.x*qiUinpI.y);
+    fIZ += dCoef*(qiUinpI.y*qiUindJ.y + qiUindI.y*qiUinpJ.y + qiUinpI.z*qiUindJ.z + qiUindI.z*qiUinpJ.z);
+    fIZ += dCoef*(qiUinpJ.y*qiUindI.y + qiUindJ.y*qiUinpI.y + qiUinpJ.z*qiUindI.z + qiUindJ.z*qiUinpI.z);
+#endif
+
+    // The quasi-internal frame forces and torques.  Note that the induced torque intermediates are
+    // used in the force expression, but not in the torques; the induced dipoles are isotropic.
+    real qiForce[3] = {rInv*(EIY+EJY+iEIY+iEJY), -rInv*(EIX+EJX+iEIX+iEJX), -(fJZ+fIZ)};
+    real qiTorqueI[3] = {-EIX, -EIY, -EIZ};
+    real qiTorqueJ[3] = {-EJX, -EJY, -EJZ};
+
+    // Rotate the force and torques back out of the QI frame and accumulate them.
+    real3 force = make_real3(qiRotationMatrix[1][1]*qiForce[0] + qiRotationMatrix[2][1]*qiForce[1] + qiRotationMatrix[0][1]*qiForce[2],
+                             qiRotationMatrix[1][2]*qiForce[0] + qiRotationMatrix[2][2]*qiForce[1] + qiRotationMatrix[0][2]*qiForce[2],
+                             qiRotationMatrix[1][0]*qiForce[0] + qiRotationMatrix[2][0]*qiForce[1] + qiRotationMatrix[0][0]*qiForce[2]);
+    atom1.force += force;
+    atom1.torque += make_real3(qiRotationMatrix[1][1]*qiTorqueI[0] + qiRotationMatrix[2][1]*qiTorqueI[1] + qiRotationMatrix[0][1]*qiTorqueI[2],
+                               qiRotationMatrix[1][2]*qiTorqueI[0] + qiRotationMatrix[2][2]*qiTorqueI[1] + qiRotationMatrix[0][2]*qiTorqueI[2],
+                               qiRotationMatrix[1][0]*qiTorqueI[0] + qiRotationMatrix[2][0]*qiTorqueI[1] + qiRotationMatrix[0][0]*qiTorqueI[2]);
+    // atom2 receives the opposite force and its own torque only when forceFactor is exactly 1.
+    if (forceFactor == 1) {
+        atom2.force -= force;
+        atom2.torque += make_real3(qiRotationMatrix[1][1]*qiTorqueJ[0] + qiRotationMatrix[2][1]*qiTorqueJ[1] + qiRotationMatrix[0][1]*qiTorqueJ[2],
+                                   qiRotationMatrix[1][2]*qiTorqueJ[0] + qiRotationMatrix[2][2]*qiTorqueJ[1] + qiRotationMatrix[0][2]*qiTorqueJ[2],
+                                   qiRotationMatrix[1][0]*qiTorqueJ[0] + qiRotationMatrix[2][0]*qiTorqueJ[1] + qiRotationMatrix[0][0]*qiTorqueJ[2]);
+    }
+}
+
 /**
 * Compute electrostatic interactions.
 */
 extern "C" __global__ void computeElectrostatics(
-        unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer,
+        unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, mixed* __restrict__ energyBuffer,
        const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
        const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices,
 #ifdef USE_CUTOFF
@@ -69,16 +382,15 @@ extern "C" __global__ void computeElectrostatics(
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
        const unsigned int* __restrict__ interactingAtoms,
 #endif
-        const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
+        const real* __restrict__ sphericalDipole, const real* __restrict__ sphericalQuadrupole, const real* __restrict__ inducedDipole,
        const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
    const unsigned int tbx = threadIdx.x - tgx;
-    real energy = 0;
+    mixed energy = 0;
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];

-
    // First loop: process tiles that contain exclusions.
    
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
@@ -89,21 +401,23 @@ extern "C" __global__ void computeElectrostatics(
        const unsigned int y = tileIndices.y;
        AtomData data;
        unsigned int atom1 = x*TILE_SIZE + tgx;
-        loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+        loadAtomData(data, atom1, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
        data.force = make_real3(0);
+        data.torque = make_real3(0);
        uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx];
        unsigned int polarizationGroup = polarizationGroupFlags[pos*TILE_SIZE+tgx];
        if (x == y) {
            // This tile is on the diagonal.

-            localData[threadIdx.x].posq = data.posq;
-            localData[threadIdx.x].dipole = data.dipole;
+            localData[threadIdx.x].pos = data.pos;
+            localData[threadIdx.x].q = data.q;
+            localData[threadIdx.x].sphericalDipole = data.sphericalDipole;
 #ifdef INCLUDE_QUADRUPOLES
-            localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
-            localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
-            localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
-            localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
-            localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
+            localData[threadIdx.x].sphericalQuadrupole[0] = data.sphericalQuadrupole[0];
+            localData[threadIdx.x].sphericalQuadrupole[1] = data.sphericalQuadrupole[1];
+            localData[threadIdx.x].sphericalQuadrupole[2] = data.sphericalQuadrupole[2];
+            localData[threadIdx.x].sphericalQuadrupole[3] = data.sphericalQuadrupole[3];
+            localData[threadIdx.x].sphericalQuadrupole[4] = data.sphericalQuadrupole[4];
 #endif
            localData[threadIdx.x].inducedDipole = data.inducedDipole;
            localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
@@ -115,101 +429,57 @@ extern "C" __global__ void computeElectrostatics(
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = y*TILE_SIZE+j;
                if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                    real3 tempForce;
-                    real tempEnergy;
                    float d = computeDScaleFactor(polarizationGroup, j);
                    float p = computePScaleFactor(covalent, polarizationGroup, j);
                    float m = computeMScaleFactor(covalent, j);
-                    computeOneInteractionF1(data, localData[tbx+j], d, p, m, tempEnergy, tempForce);
-                    data.force += tempForce;
-                    energy += 0.5f*tempEnergy;
+                    computeOneInteraction(data, localData[tbx+j], true, d, p, m, 0.5f, energy);
                }
            }
-            data.force *= ENERGY_SCALE_FACTOR;
+            data.force *= -ENERGY_SCALE_FACTOR;
+            data.torque *= ENERGY_SCALE_FACTOR;
            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-
-            // Compute torques.
-
-            data.force = make_real3(0);
-            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                int atom2 = y*TILE_SIZE+j;
-                if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                    real3 tempForce;
-                    float d = computeDScaleFactor(polarizationGroup, j);
-                    float p = computePScaleFactor(covalent, polarizationGroup, j);
-                    float m = computeMScaleFactor(covalent, j);
-                    computeOneInteractionT1(data, localData[tbx+j], d, p, m, tempForce);
-                    data.force += tempForce;
-                }
-            }
-            data.force *= ENERGY_SCALE_FACTOR;
-            atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-            atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-            atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
+            atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
        }
        else {
            // This is an off-diagonal tile.

            unsigned int j = y*TILE_SIZE + tgx;
-            loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            loadAtomData(localData[threadIdx.x], j, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
            localData[threadIdx.x].force = make_real3(0);
+            localData[threadIdx.x].torque = make_real3(0);
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = y*TILE_SIZE+tj;
                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                    real3 tempForce;
-                    real tempEnergy;
                    float d = computeDScaleFactor(polarizationGroup, tj);
                    float p = computePScaleFactor(covalent, polarizationGroup, tj);
                    float m = computeMScaleFactor(covalent, tj);
-                    computeOneInteractionF1(data, localData[tbx+tj], d, p, m, tempEnergy, tempForce);
-                    data.force += tempForce;
-                    localData[tbx+tj].force -= tempForce;
-                    energy += tempEnergy;
+                    computeOneInteraction(data, localData[tbx+tj], true, d, p, m, 1, energy);
                }
                tj = (tj + 1) & (TILE_SIZE - 1);
            }
-            data.force *= ENERGY_SCALE_FACTOR;
-            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
+            data.force *= -ENERGY_SCALE_FACTOR;
+            data.torque *= ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].torque *= ENERGY_SCALE_FACTOR;
            unsigned int offset = x*TILE_SIZE + tgx;
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
            offset = y*TILE_SIZE + tgx;
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
-
-            // Compute torques.
-
-            data.force = make_real3(0);
-            localData[threadIdx.x].force = make_real3(0);
-            for (j = 0; j < TILE_SIZE; j++) {
-                int atom2 = y*TILE_SIZE+tj;
-                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                    real3 tempForce;
-                    float d = computeDScaleFactor(polarizationGroup, tj);
-                    float p = computePScaleFactor(covalent, polarizationGroup, tj);
-                    float m = computeMScaleFactor(covalent, tj);
-                    computeOneInteractionT1(data, localData[tbx+tj], d, p, m, tempForce);
-                    data.force += tempForce;
-                    computeOneInteractionT3(data, localData[tbx+tj], d, p, m, tempForce);
-                    localData[tbx+tj].force += tempForce;
-                }
-                tj = (tj + 1) & (TILE_SIZE - 1);
-            }
-            data.force *= ENERGY_SCALE_FACTOR;
-            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
-            offset = x*TILE_SIZE + tgx;
-            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-            offset = y*TILE_SIZE + tgx;
-            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
        }
    }

@@ -272,16 +542,18 @@ extern "C" __global__ void computeElectrostatics(
            // Load atom data for this tile.

            AtomData data;
-            loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            loadAtomData(data, atom1, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
            data.force = make_real3(0);
+            data.torque = make_real3(0);
 #ifdef USE_CUTOFF
            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif
            atomIndices[threadIdx.x] = j;
-            loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            loadAtomData(localData[threadIdx.x], j, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
            localData[threadIdx.x].force = make_real3(0);
+            localData[threadIdx.x].torque = make_real3(0);

            // Compute forces.

@@ -289,21 +561,24 @@ extern "C" __global__ void computeElectrostatics(
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = atomIndices[tbx+tj];
                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                    real3 tempForce;
-                    real tempEnergy;
-                    computeOneInteractionF1(data, localData[tbx+tj], 1, 1, 1, tempEnergy, tempForce);
-                    data.force += tempForce;
-                    localData[tbx+tj].force -= tempForce;
-                    energy += tempEnergy;
+                    computeOneInteraction(data, localData[tbx+tj], false, 1, 1, 1, 1, energy);
                }
                tj = (tj + 1) & (TILE_SIZE - 1);
            }
-            data.force *= ENERGY_SCALE_FACTOR;
-            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
+            data.force *= -ENERGY_SCALE_FACTOR;
+            data.torque *= ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].torque *= ENERGY_SCALE_FACTOR;
+
+            // Write results.
+
            unsigned int offset = x*TILE_SIZE + tgx;
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
 #ifdef USE_CUTOFF
            offset = atomIndices[threadIdx.x];
 #else
@@ -312,36 +587,9 @@ extern "C" __global__ void computeElectrostatics(
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
-
-            // Compute torques.
-
-            data.force = make_real3(0);
-            localData[threadIdx.x].force = make_real3(0);
-            for (j = 0; j < TILE_SIZE; j++) {
-                int atom2 = y*TILE_SIZE+tj;
-                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                    real3 tempForce;
-                    computeOneInteractionT1(data, localData[tbx+tj], 1, 1, 1, tempForce);
-                    data.force += tempForce;
-                    computeOneInteractionT3(data, localData[tbx+tj], 1, 1, 1, tempForce);
-                    localData[tbx+tj].force += tempForce;
-                }
-                tj = (tj + 1) & (TILE_SIZE - 1);
-            }
-            data.force *= ENERGY_SCALE_FACTOR;
-            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
-            offset = x*TILE_SIZE + tgx;
-            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-#ifdef USE_CUTOFF
-            offset = atomIndices[threadIdx.x];
-#else
-            offset = y*TILE_SIZE + tgx;
-#endif
-            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
        }
        pos++;
    }

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleFixedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleFixedField.cu
@@ -545,6 +545,9 @@ extern "C" __global__ void computeFixedField(
                    localData[tbx+tj].field += fields[2];
                    localData[tbx+tj].fieldPolar += fields[3];
 #ifdef USE_GK
+                }
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    real3 fields[2];
                    computeOneGkInteraction(data, localData[tbx+tj], delta, fields);
                    data.gkField += fields[0];
                    localData[tbx+tj].gkField += fields[1];
@@ -684,6 +687,9 @@ extern "C" __global__ void computeFixedField(
                    localData[tbx+tj].field += fields[2];
                    localData[tbx+tj].fieldPolar += fields[3];
 #ifdef USE_GK
+                }
+                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                    real3 fields[2];
                    computeOneGkInteraction(data, localData[tbx+tj], delta, fields);
                    data.gkField += fields[0];
                    localData[tbx+tj].gkField += fields[1];