Continuing CUDA implementation of spherical harmonics for multipoles

72ab8864 · peastman · fa21870f · 72ab8864 · 72ab8864 · 72ab8864
Commit 72ab8864 authored Aug 19, 2015 by peastman
5 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -1150,6 +1150,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    if (maxInducedIterations > 0) {
        defines["THREAD_BLOCK_SIZE"] = cu.intToString(inducedFieldThreads);
        defines["MAX_PREV_DIIS_DIPOLES"] = cu.intToString(MaxPrevDIISDipoles);
+        defines["USE_MUTUAL_POLARIZATION"] = "1";
        module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleInducedField, defines);
        computeInducedFieldKernel = cu.getKernel(module, "computeInducedField");
        updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldByDIIS");
@@ -1159,10 +1160,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    stringstream electrostaticsSource;
    if (usePME) {
        electrostaticsSource << CudaKernelSources::vectorOps;
+        electrostaticsSource << CudaAmoebaKernelSources::sphericalMultipoles;
        electrostaticsSource << CudaAmoebaKernelSources::pmeMultipoleElectrostatics;
-        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::pmeElectrostaticPairForce : CudaAmoebaKernelSources::pmeElectrostaticPairForceNoQuadrupoles);
-        electrostaticsSource << "#define APPLY_SCALE\n";
-        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::pmeElectrostaticPairForce : CudaAmoebaKernelSources::pmeElectrostaticPairForceNoQuadrupoles);
        electrostaticsThreadMemory = 24*elementSize+3*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
        if (!useShuffle)
            electrostaticsThreadMemory += 3*elementSize;
@@ -1659,8 +1658,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(),
            cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
            &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
-            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
-            &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
+            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &sphericalDipoles->getDevicePointer(), &sphericalQuadrupoles->getDevicePointer(),
+            &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
        cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
        void* pmeTransformInducedPotentialArgs[] = {&pmePhidp->getDevicePointer(), &pmeCphi->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeTransformPotentialKernel, pmeTransformInducedPotentialArgs, cu.getNumAtoms());

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoles.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoles.cu
@@ -10,10 +10,10 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
        sphericalDipoles[offset+2] = molecularDipoles[offset+1]; // y -> Q_11s
        offset = 5*atom;
        sphericalQuadrupoles[offset+0] = -3.0f*(molecularQuadrupoles[offset+0]+molecularQuadrupoles[offset+3]); // zz -> Q_20
-        sphericalQuadrupoles[offset+1] = (2*SQRT(3))*molecularQuadrupoles[offset+2]; // xz -> Q_21c
-        sphericalQuadrupoles[offset+2] = (2*SQRT(3))*molecularQuadrupoles[offset+4]; // yz -> Q_21s
-        sphericalQuadrupoles[offset+3] = SQRT(3)*(molecularQuadrupoles[offset+0]-molecularQuadrupoles[offset+3]); // xx-yy -> Q_22c
-        sphericalQuadrupoles[offset+4] = (2*SQRT(3))*molecularQuadrupoles[offset+1]; // xy -> Q_22s
+        sphericalQuadrupoles[offset+1] = (2*SQRT((real) 3))*molecularQuadrupoles[offset+2]; // xz -> Q_21c
+        sphericalQuadrupoles[offset+2] = (2*SQRT((real) 3))*molecularQuadrupoles[offset+4]; // yz -> Q_21s
+        sphericalQuadrupoles[offset+3] = SQRT((real) 3)*(molecularQuadrupoles[offset+0]-molecularQuadrupoles[offset+3]); // xx-yy -> Q_22c
+        sphericalQuadrupoles[offset+4] = (2*SQRT((real) 3))*molecularQuadrupoles[offset+1]; // xy -> Q_22s
        
        // get coordinates of this atom and the z & x axis atoms
        // compute the vector between the atoms and 1/sqrt(d2), d2 is distance between
@@ -236,32 +236,32 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
                sphericalQuadrupole[4] *= -1;
            }
            real rotatedQuadrupole[5] = {0, 0, 0, 0, 0};
-            real sqrtThree = SQRT(3);
-            rotatedQuadrupole[0] += sphericalQuadrupole[0]*0.5f*(3.0f*vectorZ.z*vectorZ.z - 1.0f);
-            rotatedQuadrupole[1] += sphericalQuadrupole[0]*sqrtThree*vectorZ.z*vectorZ.x;
-            rotatedQuadrupole[2] += sphericalQuadrupole[0]*sqrtThree*vectorZ.z*vectorZ.y;
-            rotatedQuadrupole[3] += sphericalQuadrupole[0]*0.5f*sqrtThree*(vectorZ.x*vectorZ.x - vectorZ.y*vectorZ.y);
-            rotatedQuadrupole[4] += sphericalQuadrupole[0]*sqrtThree*vectorZ.x*vectorZ.y;
-            rotatedQuadrupole[0] += sphericalQuadrupole[1]*sqrtThree*vectorZ.z*vectorX.z;
-            rotatedQuadrupole[1] += sphericalQuadrupole[1]*(vectorZ.x*vectorX.z + vectorZ.z*vectorX.x);
-            rotatedQuadrupole[2] += sphericalQuadrupole[1]*(vectorZ.y*vectorX.z + vectorZ.z*vectorX.y);
-            rotatedQuadrupole[3] += sphericalQuadrupole[1]*(vectorZ.x*vectorX.x - vectorZ.y*vectorX.y);
-            rotatedQuadrupole[4] += sphericalQuadrupole[1]*(vectorZ.y*vectorX.x + vectorZ.x*vectorX.y);
-            rotatedQuadrupole[0] += sphericalQuadrupole[2]*sqrtThree*vectorZ.z*vectorY.z;
-            rotatedQuadrupole[1] += sphericalQuadrupole[2]*(vectorZ.x*vectorY.z + vectorZ.z*vectorY.x);
-            rotatedQuadrupole[2] += sphericalQuadrupole[2]*(vectorZ.y*vectorY.z + vectorZ.z*vectorY.y);
-            rotatedQuadrupole[3] += sphericalQuadrupole[2]*(vectorZ.x*vectorY.x - vectorZ.y*vectorY.y);
-            rotatedQuadrupole[4] += sphericalQuadrupole[2]*(vectorZ.y*vectorY.x + vectorZ.x*vectorY.y);
-            rotatedQuadrupole[0] += sphericalQuadrupole[3]*0.5f*sqrtThree*(vectorX.z*vectorX.z - vectorY.z*vectorY.z);
-            rotatedQuadrupole[1] += sphericalQuadrupole[3]*(vectorX.z*vectorX.x - vectorY.z*vectorY.x);
-            rotatedQuadrupole[2] += sphericalQuadrupole[3]*(vectorX.z*vectorX.y - vectorY.z*vectorY.y);
-            rotatedQuadrupole[3] += sphericalQuadrupole[3]*0.5f*(vectorX.x*vectorX.x - vectorX.y*vectorX.y - vectorY.x*vectorY.x + vectorY.y*vectorY.y);
-            rotatedQuadrupole[4] += sphericalQuadrupole[3]*(vectorX.x*vectorX.y - vectorY.x*vectorY.y);
-            rotatedQuadrupole[0] += sphericalQuadrupole[4]*sqrtThree*vectorX.z*vectorY.z;
-            rotatedQuadrupole[1] += sphericalQuadrupole[4]*(vectorX.x*vectorY.z + vectorX.z*vectorY.x);
-            rotatedQuadrupole[2] += sphericalQuadrupole[4]*(vectorX.y*vectorY.z + vectorX.z*vectorY.y);
-            rotatedQuadrupole[3] += sphericalQuadrupole[4]*(vectorX.x*vectorY.x - vectorX.y*vectorY.y);
-            rotatedQuadrupole[4] += sphericalQuadrupole[4]*(vectorX.y*vectorY.x + vectorX.x*vectorY.y);
+            real sqrtThree = SQRT((real) 3);
+            rotatedQuadrupole[0] += sphericalQuadrupole[0]*0.5f*(3.0f*vectorZ.z*vectorZ.z - 1.0f) +
+                                    sphericalQuadrupole[1]*sqrtThree*vectorZ.z*vectorX.z +
+                                    sphericalQuadrupole[2]*sqrtThree*vectorZ.z*vectorY.z +
+                                    sphericalQuadrupole[3]*0.5f*sqrtThree*(vectorX.z*vectorX.z - vectorY.z*vectorY.z) +
+                                    sphericalQuadrupole[4]*sqrtThree*vectorX.z*vectorY.z;
+            rotatedQuadrupole[1] += sphericalQuadrupole[0]*sqrtThree*vectorZ.z*vectorZ.x +
+                                    sphericalQuadrupole[1]*(vectorZ.x*vectorX.z + vectorZ.z*vectorX.x) +
+                                    sphericalQuadrupole[2]*(vectorZ.x*vectorY.z + vectorZ.z*vectorY.x) +
+                                    sphericalQuadrupole[3]*(vectorX.z*vectorX.x - vectorY.z*vectorY.x) +
+                                    sphericalQuadrupole[4]*(vectorX.x*vectorY.z + vectorX.z*vectorY.x);
+            rotatedQuadrupole[2] += sphericalQuadrupole[0]*sqrtThree*vectorZ.z*vectorZ.y +
+                                    sphericalQuadrupole[1]*(vectorZ.y*vectorX.z + vectorZ.z*vectorX.y) +
+                                    sphericalQuadrupole[2]*(vectorZ.y*vectorY.z + vectorZ.z*vectorY.y) +
+                                    sphericalQuadrupole[3]*(vectorX.z*vectorX.y - vectorY.z*vectorY.y) +
+                                    sphericalQuadrupole[4]*(vectorX.y*vectorY.z + vectorX.z*vectorY.y);
+            rotatedQuadrupole[3] += sphericalQuadrupole[0]*0.5f*sqrtThree*(vectorZ.x*vectorZ.x - vectorZ.y*vectorZ.y) +
+                                    sphericalQuadrupole[1]*(vectorZ.x*vectorX.x - vectorZ.y*vectorX.y) +
+                                    sphericalQuadrupole[2]*(vectorZ.x*vectorY.x - vectorZ.y*vectorY.y) +
+                                    sphericalQuadrupole[3]*0.5f*(vectorX.x*vectorX.x - vectorX.y*vectorX.y - vectorY.x*vectorY.x + vectorY.y*vectorY.y) +
+                                    sphericalQuadrupole[4]*(vectorX.x*vectorY.x - vectorX.y*vectorY.y);
+            rotatedQuadrupole[4] += sphericalQuadrupole[0]*sqrtThree*vectorZ.x*vectorZ.y +
+                                    sphericalQuadrupole[1]*(vectorZ.y*vectorX.x + vectorZ.x*vectorX.y) +
+                                    sphericalQuadrupole[2]*(vectorZ.y*vectorY.x + vectorZ.x*vectorY.y) +
+                                    sphericalQuadrupole[3]*(vectorX.x*vectorX.y - vectorY.x*vectorY.y) +
+                                    sphericalQuadrupole[4]*(vectorX.y*vectorY.x + vectorX.x*vectorY.y);
            sphericalQuadrupoles[offset] = rotatedQuadrupole[0];
            sphericalQuadrupoles[offset+1] = rotatedQuadrupole[1];
            sphericalQuadrupoles[offset+2] = rotatedQuadrupole[2];

--- a/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
 #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)

 typedef struct {
-    real3 pos, force, torque, dipole, inducedDipole, inducedDipolePolar;
+    real3 pos, force, torque, inducedDipole, inducedDipolePolar, sphericalDipole;
    real q;
    float thole, damp;
 #ifdef INCLUDE_QUADRUPOLES
-    real quadrupoleXX, quadrupoleXY, quadrupoleXZ, quadrupoleYY, quadrupoleYZ;
-    float padding;
+    real sphericalQuadrupole[5];
 #endif
 } AtomData;

-__device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, real bn5, float forceFactor, float dScale, float pScale, float mScale, real3& force, real& energy);
-__device__ void computeOneInteractionF2(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, float forceFactor, float dScale, float pScale, float mScale, real3& force, real& energy);
-__device__ void computeOneInteractionT1(AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn, float dScale, float pScale, float mScale);
-__device__ void computeOneInteractionT2(AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn, float dScale, float pScale, float mScale);
-__device__ void computeOneInteractionF1NoScale(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, real bn5, float forceFactor, real3& force, real& energy);
-__device__ void computeOneInteractionF2NoScale(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, float forceFactor, real3& force, real& energy);
-__device__ void computeOneInteractionT1NoScale(AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn);
-__device__ void computeOneInteractionT2NoScale(AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn);
-
-inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ labFrameDipole,
-        const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
+inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ sphericalDipole,
+            const real* __restrict__ sphericalQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar,
+            const float2* __restrict__ dampingAndThole) {
    real4 atomPosq = posq[atom];
    data.pos = make_real3(atomPosq.x, atomPosq.y, atomPosq.z);
    data.q = atomPosq.w;
-    data.dipole.x = labFrameDipole[atom*3];
-    data.dipole.y = labFrameDipole[atom*3+1];
-    data.dipole.z = labFrameDipole[atom*3+2];
+    data.sphericalDipole.x = sphericalDipole[atom*3];
+    data.sphericalDipole.y = sphericalDipole[atom*3+1];
+    data.sphericalDipole.z = sphericalDipole[atom*3+2];
 #ifdef INCLUDE_QUADRUPOLES
-    data.quadrupoleXX = labFrameQuadrupole[atom*5];
-    data.quadrupoleXY = labFrameQuadrupole[atom*5+1];
-    data.quadrupoleXZ = labFrameQuadrupole[atom*5+2];
-    data.quadrupoleYY = labFrameQuadrupole[atom*5+3];
-    data.quadrupoleYZ = labFrameQuadrupole[atom*5+4];
+    data.sphericalQuadrupole[0] = sphericalQuadrupole[atom*5];
+    data.sphericalQuadrupole[1] = sphericalQuadrupole[atom*5+1];
+    data.sphericalQuadrupole[2] = sphericalQuadrupole[atom*5+2];
+    data.sphericalQuadrupole[3] = sphericalQuadrupole[atom*5+3];
+    data.sphericalQuadrupole[4] = sphericalQuadrupole[atom*5+4];
 #endif
    data.inducedDipole.x = inducedDipole[atom*3];
    data.inducedDipole.y = inducedDipole[atom*3+1];
@@ -66,7 +57,7 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr

 __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool hasExclusions, float dScale, float pScale, float mScale, float forceFactor,
                                      real& energy, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
-    real4 delta;
+    real3 delta;
    delta.x = atom2.pos.x - atom1.pos.x;
    delta.y = atom2.pos.y - atom1.pos.y;
    delta.z = atom2.pos.z - atom1.pos.z;
@@ -74,113 +65,405 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
    // periodic box

    APPLY_PERIODIC_TO_DELTA(delta)
-    delta.w = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-    if (delta.w > CUTOFF_SQUARED)
+    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+    if (r2 > CUTOFF_SQUARED)
        return;

-    real r = SQRT(delta.w);
-    real ralpha = EWALD_ALPHA*r;
+    real rInv = RSQRT(r2);
+    real r = r2*rInv;
+
+
+    real qiRotationMatrix[3][3];
+    buildQIRotationMatrix(delta, rInv, qiRotationMatrix);
+
+    
+    real3 qiUindI = 0.5f*make_real3(qiRotationMatrix[0][1]*atom1.inducedDipole.x + qiRotationMatrix[0][2]*atom1.inducedDipole.y + qiRotationMatrix[0][0]*atom1.inducedDipole.z,
+                                    qiRotationMatrix[1][1]*atom1.inducedDipole.x + qiRotationMatrix[1][2]*atom1.inducedDipole.y + qiRotationMatrix[1][0]*atom1.inducedDipole.z,
+                                    qiRotationMatrix[2][1]*atom1.inducedDipole.x + qiRotationMatrix[2][2]*atom1.inducedDipole.y + qiRotationMatrix[2][0]*atom1.inducedDipole.z);
+    real3 qiUindJ = 0.5f*make_real3(qiRotationMatrix[0][1]*atom2.inducedDipole.x + qiRotationMatrix[0][2]*atom2.inducedDipole.y + qiRotationMatrix[0][0]*atom2.inducedDipole.z,
+                                    qiRotationMatrix[1][1]*atom2.inducedDipole.x + qiRotationMatrix[1][2]*atom2.inducedDipole.y + qiRotationMatrix[1][0]*atom2.inducedDipole.z,
+                                    qiRotationMatrix[2][1]*atom2.inducedDipole.x + qiRotationMatrix[2][2]*atom2.inducedDipole.y + qiRotationMatrix[2][0]*atom2.inducedDipole.z);
+    real3 qiUinpI = 0.5f*make_real3(qiRotationMatrix[0][1]*atom1.inducedDipolePolar.x + qiRotationMatrix[0][2]*atom1.inducedDipolePolar.y + qiRotationMatrix[0][0]*atom1.inducedDipolePolar.z,
+                                    qiRotationMatrix[1][1]*atom1.inducedDipolePolar.x + qiRotationMatrix[1][2]*atom1.inducedDipolePolar.y + qiRotationMatrix[1][0]*atom1.inducedDipolePolar.z,
+                                    qiRotationMatrix[2][1]*atom1.inducedDipolePolar.x + qiRotationMatrix[2][2]*atom1.inducedDipolePolar.y + qiRotationMatrix[2][0]*atom1.inducedDipolePolar.z);
+    real3 qiUinpJ = 0.5f*make_real3(qiRotationMatrix[0][1]*atom2.inducedDipolePolar.x + qiRotationMatrix[0][2]*atom2.inducedDipolePolar.y + qiRotationMatrix[0][0]*atom2.inducedDipolePolar.z,
+                                    qiRotationMatrix[1][1]*atom2.inducedDipolePolar.x + qiRotationMatrix[1][2]*atom2.inducedDipolePolar.y + qiRotationMatrix[1][0]*atom2.inducedDipolePolar.z,
+                                    qiRotationMatrix[2][1]*atom2.inducedDipolePolar.x + qiRotationMatrix[2][2]*atom2.inducedDipolePolar.y + qiRotationMatrix[2][0]*atom2.inducedDipolePolar.z);
+    
+    real3 rotatedDipole1 = rotateDipole(atom1.sphericalDipole, qiRotationMatrix);
+    real3 rotatedDipole2 = rotateDipole(atom2.sphericalDipole, qiRotationMatrix);
+    real rotatedQuadrupole1[] = {0, 0, 0, 0, 0};
+    real rotatedQuadrupole2[] = {0, 0, 0, 0, 0};
+#ifdef INCLUDE_QUADRUPOLES
+    rotateQuadupoles(qiRotationMatrix, atom1.sphericalQuadrupole, atom2.sphericalQuadrupole, rotatedQuadrupole1, rotatedQuadrupole2);
+#endif    
+    
+
+
+    real qiQI[9] = {atom1.q, rotatedDipole1.x, rotatedDipole1.y, rotatedDipole1.z, rotatedQuadrupole1[0], rotatedQuadrupole1[1], rotatedQuadrupole1[2], rotatedQuadrupole1[3], rotatedQuadrupole1[4]};
+    real qiQJ[9] = {atom2.q, rotatedDipole2.x, rotatedDipole2.y, rotatedDipole2.z, rotatedQuadrupole2[0], rotatedQuadrupole2[1], rotatedQuadrupole2[2], rotatedQuadrupole2[3], rotatedQuadrupole2[4]};
+
+
+
+
+
+
+
+
+    // The Qtilde{x,y,z} torque intermediates for atoms I and J, which are used to obtain the torques on the permanent moments.
+    const real sqrtThree = SQRT((real) 3);
+    real qiQIX[9] = {0, qiQI[3], 0, -qiQI[1], sqrtThree*qiQI[6], qiQI[8], -sqrtThree*qiQI[4] - qiQI[7], qiQI[6], -qiQI[5]};
+    real qiQIY[9] = {0, -qiQI[2], qiQI[1], 0, -sqrtThree*qiQI[5], sqrtThree*qiQI[4] - qiQI[7], -qiQI[8], qiQI[5], qiQI[6]};
+    real qiQIZ[9] = {0, 0, -qiQI[3], qiQI[2], 0, -qiQI[6], qiQI[5], -2*qiQI[8], 2*qiQI[7]};
+    real qiQJX[9] = {0, qiQJ[3], 0, -qiQJ[1], sqrtThree*qiQJ[6], qiQJ[8], -sqrtThree*qiQJ[4] - qiQJ[7], qiQJ[6], -qiQJ[5]};
+    real qiQJY[9] = {0, -qiQJ[2], qiQJ[1], 0, -sqrtThree*qiQJ[5], sqrtThree*qiQJ[4] - qiQJ[7], -qiQJ[8], qiQJ[5], qiQJ[6]};
+    real qiQJZ[9] = {0, 0, -qiQJ[3], qiQJ[2], 0, -qiQJ[6], qiQJ[5], -2*qiQJ[8], 2*qiQJ[7]};

-    real alsq2 = 2*EWALD_ALPHA*EWALD_ALPHA;
-    real alsq2n = 0;
-    if (EWALD_ALPHA > 0)
-        alsq2n = RECIP(SQRT_PI*EWALD_ALPHA);
+    // The field derivatives at I due to permanent and induced moments on J, and vice-versa.
+    // Also, their derivatives w.r.t. R, which are needed for force calculations
+    real Vij[9], Vji[9], VjiR[9], VijR[9];
+    // The field derivatives at I due to only permanent moments on J, and vice-versa.
+    real Vijp[3], Vijd[3], Vjip[3], Vjid[3];
+    real rInvVec[7], alphaRVec[8], bVec[5];
+
+    // The rInvVec array is defined such that the ith element is R^-i, with the
+    // dieleectric constant folded in, to avoid conversions later.
+    rInvVec[1] = rInv;
+    for(int i = 2; i < 7; ++i)
+        rInvVec[i] = rInvVec[i-1] * rInv;
+
+    // The alpharVec array is defined such that the ith element is (alpha R)^i,
+    // where kappa (alpha in OpenMM parlance) is the Ewald attenuation parameter.
+    real ralpha = EWALD_ALPHA*r;
    real exp2a = EXP(-(ralpha*ralpha));

-    real rr1 = RECIP(r);
-    delta.w = rr1;
 #ifdef USE_DOUBLE_PRECISION
-    const real erfcAlphaR = erfc(ralpha);
+    const real erfAlphaR = erf(ralpha);
 #else
    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
    // error of 1.5e-7.

    const real t = RECIP(1.0f+0.3275911f*ralpha);
-    const real erfcAlphaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*exp2a;
+    const real erfAlphaR = 1-(0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*exp2a;
 #endif
-    real bn0 = erfcAlphaR*rr1;
-    energy += forceFactor*atom1.q*atom2.q*bn0;
-    real rr2 = rr1*rr1;
-    alsq2n *= alsq2;
-
-    real4 bn;
-    bn.x = (bn0+alsq2n*exp2a)*rr2;
-
-    alsq2n *= alsq2;
-    bn.y = (3*bn.x+alsq2n*exp2a)*rr2;

-    alsq2n *= alsq2;
-    bn.z = (5*bn.y+alsq2n*exp2a)*rr2;

-    alsq2n *= alsq2;
-    bn.w = (7*bn.z+alsq2n*exp2a)*rr2;

-    alsq2n *= alsq2;
-    real bn5 = (9*bn.w+alsq2n*exp2a)*rr2;
+    alphaRVec[1] = ralpha;
+    for (int i = 2; i < 8; ++i)
+        alphaRVec[i] = alphaRVec[i-1]*ralpha;

-    real3 force;
+    real X = 2*EXP(-alphaRVec[2])/SQRT_PI;
+    real uScale = 1;                                   // Delete!!!!!!!!!!!!!!!!!!!!!!!!!!!!!1

-    if (hasExclusions) {
-        computeOneInteractionF1(atom1, atom2, delta, bn, bn5, forceFactor, dScale, pScale, mScale, force, energy);
-        computeOneInteractionF2(atom1, atom2, delta, bn, forceFactor, dScale, pScale, mScale, force, energy);
-    }
-    else {
-        computeOneInteractionF1NoScale(atom1, atom2, delta, bn, bn5, forceFactor, force, energy);
-        computeOneInteractionF2NoScale(atom1, atom2, delta, bn, forceFactor, force, energy);
+    int doubleFactorial = 1, facCount = 1;
+    real tmp = alphaRVec[1];
+    bVec[1] = -erfAlphaR;
+    for(int i=2; i < 5; ++i){
+        bVec[i] = bVec[i-1] + tmp * X / (real)(doubleFactorial);
+        facCount = facCount + 2;
+        doubleFactorial = doubleFactorial * facCount;
+        tmp *= 2*alphaRVec[2];
    }

-    atom1.force += force;
-    if (forceFactor == 1)
-        atom2.force -= force;
-    
-    if (hasExclusions) {
-        computeOneInteractionT1(atom1, atom2, delta, bn, dScale, pScale, mScale);
-        computeOneInteractionT2(atom1, atom2, delta, bn, dScale, pScale, mScale);
-    }
-    else {
-        computeOneInteractionT1NoScale(atom1, atom2, delta, bn);
-        computeOneInteractionT2NoScale(atom1, atom2, delta, bn);
+    real dmp = atom1.damp*atom2.damp;
+    real a = min(atom1.thole, atom2.thole);
+    real u = fabs(dmp) > 1.0e-5f ? r/dmp : 1e10f;
+    real au3 = a*u*u*u;
+    real expau3 = au3 < 50 ? EXP(-au3) : 0;
+    real a2u6 = au3*au3;
+    real a3u9 = a2u6*au3;
+    // Thole damping factors for energies
+    real thole_c  = 1 - expau3;
+    real thole_d0 = 1 - expau3*(1 + 1.5f*au3);
+    real thole_d1 = 1 - expau3;
+    real thole_q0 = 1 - expau3*(1 + au3 + a2u6);
+    real thole_q1 = 1 - expau3*(1 + au3);
+    // Thole damping factors for derivatives
+    real dthole_c  = 1 - expau3*(1 + 1.5f*au3);
+    real dthole_d0 = 1 - expau3*(1 + au3 + 1.5f*a2u6);
+    real dthole_d1 = 1 - expau3*(1 + au3);
+    real dthole_q0 = 1 - expau3*(1 + au3 + 0.25*a2u6 + 0.75*a3u9);
+    real dthole_q1 = 1 - expau3*(1 + au3 + 0.75*a2u6);
+
+    // Now we compute the (attenuated) Coulomb operator and its derivatives, contracted with
+    // permanent moments and induced dipoles.  Note that the coefficient of the permanent force
+    // terms is half of the expected value; this is because we compute the interaction of I with
+    // the sum of induced and permanent moments on J, as well as the interaction of J with I's
+    // permanent and induced moments; doing so double counts the permanent-permanent interaction.
+    real ePermCoef, dPermCoef, eUIndCoef, dUIndCoef, eUInpCoef, dUInpCoef;
+
+    // C-C terms (m=0)
+    ePermCoef = rInvVec[1]*(mScale + bVec[2] - alphaRVec[1]*X);
+    dPermCoef = -0.5f*(mScale + bVec[2])*rInvVec[2];
+    Vij[0]  = ePermCoef*qiQJ[0];
+    Vji[0]  = ePermCoef*qiQI[0];
+    VijR[0] = dPermCoef*qiQJ[0];
+    VjiR[0] = dPermCoef*qiQI[0];
+
+    // C-D and C-Uind terms (m=0)
+    ePermCoef = rInvVec[2]*(mScale + bVec[2]);
+    eUIndCoef = rInvVec[2]*(dScale*thole_c + bVec[2]);
+    eUInpCoef = rInvVec[2]*(pScale*thole_c + bVec[2]);
+    dPermCoef = -rInvVec[3]*(mScale + bVec[2] + alphaRVec[3]*X);
+    dUIndCoef = -2*rInvVec[3]*(dScale*dthole_c + bVec[2] + alphaRVec[3]*X);
+    dUInpCoef = -2*rInvVec[3]*(pScale*dthole_c + bVec[2] + alphaRVec[3]*X);
+    Vij[0]  += -(ePermCoef*qiQJ[1] + eUIndCoef*qiUindJ.x + eUInpCoef*qiUinpJ.x);
+    Vji[1]   = -(ePermCoef*qiQI[0]);
+    VijR[0] += -(dPermCoef*qiQJ[1] + dUIndCoef*qiUindJ.x + dUInpCoef*qiUinpJ.x);
+    VjiR[1]  = -(dPermCoef*qiQI[0]);
+    Vjip[0]  = -(eUInpCoef*qiQI[0]);
+    Vjid[0]  = -(eUIndCoef*qiQI[0]);
+    // D-C and Uind-C terms (m=0)
+    Vij[1]   = ePermCoef*qiQJ[0];
+    Vji[0]  += ePermCoef*qiQI[1] + eUIndCoef*qiUindI.x + eUInpCoef*qiUinpI.x;
+    VijR[1]  = dPermCoef*qiQJ[0];
+    VjiR[0] += dPermCoef*qiQI[1] + dUIndCoef*qiUindI.x + dUInpCoef*qiUinpI.x;
+    Vijp[0]  = eUInpCoef*qiQJ[0];
+    Vijd[0]  = eUIndCoef*qiQJ[0];
+
+    // D-D and D-Uind terms (m=0)
+    const real twoThirds = (real) 2/3;
+    ePermCoef = -twoThirds*rInvVec[3]*(3*(mScale + bVec[3]) + alphaRVec[3]*X);
+    eUIndCoef = -twoThirds*rInvVec[3]*(3*(dScale*thole_d0 + bVec[3]) + alphaRVec[3]*X);
+    eUInpCoef = -twoThirds*rInvVec[3]*(3*(pScale*thole_d0 + bVec[3]) + alphaRVec[3]*X);
+    dPermCoef = rInvVec[4]*(3*(mScale + bVec[3]) + 2*alphaRVec[5]*X);
+    dUIndCoef = rInvVec[4]*(6*(dScale*dthole_d0 + bVec[3]) + 4*alphaRVec[5]*X);
+    dUInpCoef = rInvVec[4]*(6*(pScale*dthole_d0 + bVec[3]) + 4*alphaRVec[5]*X);
+    Vij[1]  += ePermCoef*qiQJ[1] + eUIndCoef*qiUindJ.x + eUInpCoef*qiUinpJ.x;
+    Vji[1]  += ePermCoef*qiQI[1] + eUIndCoef*qiUindI.x + eUInpCoef*qiUinpI.x;
+    VijR[1] += dPermCoef*qiQJ[1] + dUIndCoef*qiUindJ.x + dUInpCoef*qiUinpJ.x;
+    VjiR[1] += dPermCoef*qiQI[1] + dUIndCoef*qiUindI.x + dUInpCoef*qiUinpI.x;
+    Vijp[0] += eUInpCoef*qiQJ[1];
+    Vijd[0] += eUIndCoef*qiQJ[1];
+    Vjip[0] += eUInpCoef*qiQI[1];
+    Vjid[0] += eUIndCoef*qiQI[1];
+    // D-D and D-Uind terms (m=1)
+    ePermCoef = rInvVec[3]*(mScale + bVec[3] - twoThirds*alphaRVec[3]*X);
+    eUIndCoef = rInvVec[3]*(dScale*thole_d1 + bVec[3] - twoThirds*alphaRVec[3]*X);
+    eUInpCoef = rInvVec[3]*(pScale*thole_d1 + bVec[3] - twoThirds*alphaRVec[3]*X);
+    dPermCoef = -1.5f*rInvVec[4]*(mScale + bVec[3]);
+    dUIndCoef = -3*rInvVec[4]*(dScale*dthole_d1 + bVec[3]);
+    dUInpCoef = -3*rInvVec[4]*(pScale*dthole_d1 + bVec[3]);
+    Vij[2]  = ePermCoef*qiQJ[2] + eUIndCoef*qiUindJ.y + eUInpCoef*qiUinpJ.y;
+    Vji[2]  = ePermCoef*qiQI[2] + eUIndCoef*qiUindI.y + eUInpCoef*qiUinpI.y;
+    VijR[2] = dPermCoef*qiQJ[2] + dUIndCoef*qiUindJ.y + dUInpCoef*qiUinpJ.y;
+    VjiR[2] = dPermCoef*qiQI[2] + dUIndCoef*qiUindI.y + dUInpCoef*qiUinpI.y;
+    Vij[3]  = ePermCoef*qiQJ[3] + eUIndCoef*qiUindJ.z + eUInpCoef*qiUinpJ.z;
+    Vji[3]  = ePermCoef*qiQI[3] + eUIndCoef*qiUindI.z + eUInpCoef*qiUinpI.z;
+    VijR[3] = dPermCoef*qiQJ[3] + dUIndCoef*qiUindJ.z + dUInpCoef*qiUinpJ.z;
+    VjiR[3] = dPermCoef*qiQI[3] + dUIndCoef*qiUindI.z + dUInpCoef*qiUinpI.z;
+    Vijp[1] = eUInpCoef*qiQJ[2];
+    Vijd[1] = eUIndCoef*qiQJ[2];
+    Vjip[1] = eUInpCoef*qiQI[2];
+    Vjid[1] = eUIndCoef*qiQI[2];
+    Vijp[2] = eUInpCoef*qiQJ[3];
+    Vijd[2] = eUIndCoef*qiQJ[3];
+    Vjip[2] = eUInpCoef*qiQI[3];
+    Vjid[2] = eUIndCoef*qiQI[3];
+
+    // C-Q terms (m=0)
+    ePermCoef = (mScale + bVec[3])*rInvVec[3];
+    dPermCoef = -((real) 1/3)*rInvVec[4]*(4.5*(mScale + bVec[3]) + 2*alphaRVec[5]*X);
+    Vij[0]  += ePermCoef*qiQJ[4];
+    Vji[4]   = ePermCoef*qiQI[0];
+    VijR[0] += dPermCoef*qiQJ[4];
+    VjiR[4]  = dPermCoef*qiQI[0];
+    // Q-C terms (m=0)
+    Vij[4]   = ePermCoef*qiQJ[0];
+    Vji[0]  += ePermCoef*qiQI[4];
+    VijR[4]  = dPermCoef*qiQJ[0];
+    VjiR[0] += dPermCoef*qiQI[4];
+
+    // D-Q and Uind-Q terms (m=0)
+    const real fourThirds = (real) 4/3;
+    ePermCoef = rInvVec[4]*(3*(mScale + bVec[3]) + fourThirds*alphaRVec[5]*X);
+    eUIndCoef = rInvVec[4]*(3*(dScale*thole_q0 + bVec[3]) + fourThirds*alphaRVec[5]*X);
+    eUInpCoef = rInvVec[4]*(3*(pScale*thole_q0 + bVec[3]) + fourThirds*alphaRVec[5]*X);
+    dPermCoef = -fourThirds*rInvVec[5]*(4.5f*(mScale + bVec[3]) + (1 + alphaRVec[2])*alphaRVec[5]*X);
+    dUIndCoef = -fourThirds*rInvVec[5]*(9*(dScale*dthole_q0 + bVec[3]) + 2*(1 + alphaRVec[2])*alphaRVec[5]*X);
+    dUInpCoef = -fourThirds*rInvVec[5]*(9*(pScale*dthole_q0 + bVec[3]) + 2*(1 + alphaRVec[2])*alphaRVec[5]*X);
+    Vij[1]  += ePermCoef*qiQJ[4];
+    Vji[4]  += ePermCoef*qiQI[1] + eUIndCoef*qiUindI.x + eUInpCoef*qiUinpI.x;
+    VijR[1] += dPermCoef*qiQJ[4];
+    VjiR[4] += dPermCoef*qiQI[1] + dUIndCoef*qiUindI.x + dUInpCoef*qiUinpI.x;
+    Vijp[0] += eUInpCoef*qiQJ[4];
+    Vijd[0] += eUIndCoef*qiQJ[4];
+    // Q-D and Q-Uind terms (m=0)
+    Vij[4]  += -(ePermCoef*qiQJ[1] + eUIndCoef*qiUindJ.x + eUInpCoef*qiUinpJ.x);
+    Vji[1]  += -(ePermCoef*qiQI[4]);
+    VijR[4] += -(dPermCoef*qiQJ[1] + dUIndCoef*qiUindJ.x + dUInpCoef*qiUinpJ.x);
+    VjiR[1] += -(dPermCoef*qiQI[4]);
+    Vjip[0] += -(eUInpCoef*qiQI[4]);
+    Vjid[0] += -(eUIndCoef*qiQI[4]);
+
+    // D-Q and Uind-Q terms (m=1)
+    ePermCoef = -sqrtThree*rInvVec[4]*(mScale + bVec[3]);
+    eUIndCoef = -sqrtThree*rInvVec[4]*(dScale*thole_q1 + bVec[3]);
+    eUInpCoef = -sqrtThree*rInvVec[4]*(pScale*thole_q1 + bVec[3]);
+    const real fourSqrtOneThird = 4/sqrt((real) 3);
+    dPermCoef = fourSqrtOneThird*rInvVec[5]*(1.5f*(mScale + bVec[3]) + 0.5f*alphaRVec[5]*X);
+    dUIndCoef = fourSqrtOneThird*rInvVec[5]*(3*(dScale*dthole_q1 + bVec[3]) + alphaRVec[5]*X);
+    dUInpCoef = fourSqrtOneThird*rInvVec[5]*(3*(pScale*dthole_q1 + bVec[3]) + alphaRVec[5]*X);
+    Vij[2]  += ePermCoef*qiQJ[5];
+    Vji[5]   = ePermCoef*qiQI[2] + eUIndCoef*qiUindI.y + eUInpCoef*qiUinpI.y;
+    VijR[2] += dPermCoef*qiQJ[5];
+    VjiR[5]  = dPermCoef*qiQI[2] + dUIndCoef*qiUindI.y + dUInpCoef*qiUinpI.y;
+    Vij[3]  += ePermCoef*qiQJ[6];
+    Vji[6]   = ePermCoef*qiQI[3] + eUIndCoef*qiUindI.z + eUInpCoef*qiUinpI.z;
+    VijR[3] += dPermCoef*qiQJ[6];
+    VjiR[6]  = dPermCoef*qiQI[3] + dUIndCoef*qiUindI.z + dUInpCoef*qiUinpI.z;
+    Vijp[1] += eUInpCoef*qiQJ[5];
+    Vijd[1] += eUIndCoef*qiQJ[5];
+    Vijp[2] += eUInpCoef*qiQJ[6];
+    Vijd[2] += eUIndCoef*qiQJ[6];
+    // D-Q and Uind-Q terms (m=1)
+    Vij[5]   = -(ePermCoef*qiQJ[2] + eUIndCoef*qiUindJ.y + eUInpCoef*qiUinpJ.y);
+    Vji[2]  += -(ePermCoef*qiQI[5]);
+    VijR[5]  = -(dPermCoef*qiQJ[2] + dUIndCoef*qiUindJ.y + dUInpCoef*qiUinpJ.y);
+    VjiR[2] += -(dPermCoef*qiQI[5]);
+    Vij[6]   = -(ePermCoef*qiQJ[3] + eUIndCoef*qiUindJ.z + eUInpCoef*qiUinpJ.z);
+    Vji[3]  += -(ePermCoef*qiQI[6]);
+    VijR[6]  = -(dPermCoef*qiQJ[3] + dUIndCoef*qiUindJ.z + dUInpCoef*qiUinpJ.z);
+    VjiR[3] += -(dPermCoef*qiQI[6]);
+    Vjip[1] += -(eUInpCoef*qiQI[5]);
+    Vjid[1] += -(eUIndCoef*qiQI[5]);
+    Vjip[2] += -(eUInpCoef*qiQI[6]);
+    Vjid[2] += -(eUIndCoef*qiQI[6]);
+
+    // Q-Q terms (m=0)
+    ePermCoef = rInvVec[5]*(6*(mScale + bVec[4]) + ((real) 4/45)*(-3 + 10*alphaRVec[2])*alphaRVec[5]*X);
+    dPermCoef = -rInvVec[6]*(135*(mScale + bVec[4]) + 4*(1 + 2*alphaRVec[2])*alphaRVec[7]*X)/9;
+    Vij[4]  += ePermCoef*qiQJ[4];
+    Vji[4]  += ePermCoef*qiQI[4];
+    VijR[4] += dPermCoef*qiQJ[4];
+    VjiR[4] += dPermCoef*qiQI[4];
+    // Q-Q terms (m=1)
+    const real fourOverFifteen = (real) 4/15;
+    ePermCoef = -fourOverFifteen*rInvVec[5]*(15*(mScale + bVec[4]) + alphaRVec[5]*X);
+    dPermCoef = rInvVec[6]*(10*(mScale + bVec[4]) + fourThirds*alphaRVec[7]*X);
+    Vij[5]  += ePermCoef*qiQJ[5];
+    Vji[5]  += ePermCoef*qiQI[5];
+    VijR[5] += dPermCoef*qiQJ[5];
+    VjiR[5] += dPermCoef*qiQI[5];
+    Vij[6]  += ePermCoef*qiQJ[6];
+    Vji[6]  += ePermCoef*qiQI[6];
+    VijR[6] += dPermCoef*qiQJ[6];
+    VjiR[6] += dPermCoef*qiQI[6];
+    // Q-Q terms (m=2)
+    ePermCoef = rInvVec[5]*(mScale + bVec[4] - fourOverFifteen*alphaRVec[5]*X);
+    dPermCoef = -2.5*(mScale + bVec[4])*rInvVec[6];
+    Vij[7]  = ePermCoef*qiQJ[7];
+    Vji[7]  = ePermCoef*qiQI[7];
+    VijR[7] = dPermCoef*qiQJ[7];
+    VjiR[7] = dPermCoef*qiQI[7];
+    Vij[8]  = ePermCoef*qiQJ[8];
+    Vji[8]  = ePermCoef*qiQI[8];
+    VijR[8] = dPermCoef*qiQJ[8];
+    VjiR[8] = dPermCoef*qiQI[8];
+
+    // Evaluate the energies, forces and torques due to permanent+induced moments
+    // interacting with just the permanent moments.
+    energy += forceFactor*0.5f*(qiQI[0]*Vij[0] + qiQJ[0]*Vji[0]);
+    real fIZ = qiQI[0]*VijR[0];
+    real fJZ = qiQJ[0]*VjiR[0];
+    real EIX = 0, EIY = 0, EIZ = 0, EJX = 0, EJY = 0, EJZ = 0;
+    for(int i = 1; i < 9; ++i){
+        energy += forceFactor*0.5f*(qiQI[i]*Vij[i] + qiQJ[i]*Vji[i]);
+        fIZ += qiQI[i]*VijR[i];
+        fJZ += qiQJ[i]*VjiR[i];
+        EIX += qiQIX[i]*Vij[i];
+        EIY += qiQIY[i]*Vij[i];
+        EIZ += qiQIZ[i]*Vij[i];
+        EJX += qiQJX[i]*Vji[i];
+        EJY += qiQJY[i]*Vji[i];
+        EJZ += qiQJZ[i]*Vji[i];
    }

+    // Define the torque intermediates for the induced dipoles. These are simply the induced dipole torque
+    // intermediates dotted with the field due to permanent moments only, at each center. We inline the
+    // induced dipole torque intermediates here, for simplicity. N.B. There are no torques on the dipoles
+    // themselves, so we accumulate the torque intermediates into separate variables to allow them to be
+    // used only in the force calculation.
+    //
+    // The torque about the x axis (needed to obtain the y force on the induced dipoles, below)
+    //    qiUindIx[0] = qiQUindI[2];    qiUindIx[1] = 0;    qiUindIx[2] = -qiQUindI[0]
+    real iEIX = qiUinpI.z*Vijp[0] + qiUindI.z*Vijd[0] - qiUinpI.x*Vijp[2] - qiUindI.x*Vijd[2];
+    real iEJX = qiUinpJ.z*Vjip[0] + qiUindJ.z*Vjid[0] - qiUinpJ.x*Vjip[2] - qiUindJ.x*Vjid[2];
+    // The torque about the y axis (needed to obtain the x force on the induced dipoles, below)
+    //    qiUindIy[0] = -qiQUindI[1];   qiUindIy[1] = qiQUindI[0];    qiUindIy[2] = 0
+    real iEIY = qiUinpI.x*Vijp[1] + qiUindI.x*Vijd[1] - qiUinpI.y*Vijp[0] - qiUindI.y*Vijd[0];
+    real iEJY = qiUinpJ.x*Vjip[1] + qiUindJ.x*Vjid[1] - qiUinpJ.y*Vjip[0] - qiUindJ.y*Vjid[0];
+
+#ifdef USE_MUTUAL_POLARIZATION
+    // Uind-Uind terms (m=0)
+    real eCoef = -fourThirds*rInvVec[3]*(3*(uScale*thole_d0 + bVec[3]) + alphaRVec[3]*X);
+    real dCoef = rInvVec[4]*(6*(uScale*dthole_d0 + bVec[3]) + 4*alphaRVec[5]*X);
+    iEIX += eCoef*(qiUinpI.z*qiUindJ.x + qiUindI.z*qiUinpJ.x);
+    iEJX += eCoef*(qiUinpJ.z*qiUindI.x + qiUindJ.z*qiUinpI.x);
+    iEIY -= eCoef*(qiUinpI.y*qiUindJ.x + qiUindI.y*qiUinpJ.x);
+    iEJY -= eCoef*(qiUinpJ.y*qiUindI.x + qiUindJ.y*qiUinpI.x);
+    fIZ += dCoef*(qiUinpI.x*qiUindJ.x + qiUindI.x*qiUinpJ.x);
+    fIZ += dCoef*(qiUinpJ.x*qiUindI.x + qiUindJ.x*qiUinpI.x);
+    // Uind-Uind terms (m=1)
+    eCoef = 2*rInvVec[3]*(uScale*thole_d1 + bVec[3] - twoThirds*alphaRVec[3]*X);
+    dCoef = -3*rInvVec[4]*(uScale*dthole_d1 + bVec[3]);
+    iEIX -= eCoef*(qiUinpI.x*qiUindJ.z + qiUindI.x*qiUinpJ.z);
+    iEJX -= eCoef*(qiUinpJ.x*qiUindI.z + qiUindJ.x*qiUinpI.z);
+    iEIY += eCoef*(qiUinpI.x*qiUindJ.y + qiUindI.x*qiUinpJ.y);
+    iEJY += eCoef*(qiUinpJ.x*qiUindI.y + qiUindJ.x*qiUinpI.y);
+    fIZ += dCoef*(qiUinpI.y*qiUindJ.y + qiUindI.y*qiUinpJ.y + qiUinpI.z*qiUindJ.z + qiUindI.z*qiUinpJ.z);
+    fIZ += dCoef*(qiUinpJ.y*qiUindI.y + qiUindJ.y*qiUinpI.y + qiUinpJ.z*qiUindI.z + qiUindJ.z*qiUinpI.z);
+#endif
+
+    // The quasi-internal frame forces and torques.  Note that the induced torque intermediates are
+    // used in the force expression, but not in the torques; the induced dipoles are isotropic.
+    real qiForce[3] = {rInv*(EIY+EJY+iEIY+iEJY), -rInv*(EIX+EJX+iEIX+iEJX), -(fJZ+fIZ)};
+    real qiTorqueI[3] = {-EIX, -EIY, -EIZ};
+    real qiTorqueJ[3] = {-EJX, -EJY, -EJZ};

+
+    real3 force = make_real3(qiRotationMatrix[1][1]*qiForce[0] + qiRotationMatrix[2][1]*qiForce[1] + qiRotationMatrix[0][1]*qiForce[2],
+                             qiRotationMatrix[1][2]*qiForce[0] + qiRotationMatrix[2][2]*qiForce[1] + qiRotationMatrix[0][2]*qiForce[2],
+                             qiRotationMatrix[1][0]*qiForce[0] + qiRotationMatrix[2][0]*qiForce[1] + qiRotationMatrix[0][0]*qiForce[2]);
+    atom1.force += force;
+    atom1.torque += make_real3(qiRotationMatrix[1][1]*qiTorqueI[0] + qiRotationMatrix[2][1]*qiTorqueI[1] + qiRotationMatrix[0][1]*qiTorqueI[2],
+                               qiRotationMatrix[1][2]*qiTorqueI[0] + qiRotationMatrix[2][2]*qiTorqueI[1] + qiRotationMatrix[0][2]*qiTorqueI[2],
+                               qiRotationMatrix[1][0]*qiTorqueI[0] + qiRotationMatrix[2][0]*qiTorqueI[1] + qiRotationMatrix[0][0]*qiTorqueI[2]);
    if (forceFactor == 1) {
-        // T3 == T1 w/ particles I and J reversed
-        // T4 == T2 w/ particles I and J reversed
-
-        delta.x = -delta.x;
-        delta.y = -delta.y;
-        delta.z = -delta.z;
-        if (hasExclusions) {
-            computeOneInteractionT1(atom2, atom1, delta, bn, dScale, pScale, mScale);
-            computeOneInteractionT2(atom2, atom1, delta, bn, dScale, pScale, mScale);
-        }
-        else {
-            computeOneInteractionT1NoScale(atom2, atom1, delta, bn);
-            computeOneInteractionT2NoScale(atom2, atom1, delta, bn);
-        }
+        atom2.force -= force;
+        atom2.torque += make_real3(qiRotationMatrix[1][1]*qiTorqueJ[0] + qiRotationMatrix[2][1]*qiTorqueJ[1] + qiRotationMatrix[0][1]*qiTorqueJ[2],
+                                   qiRotationMatrix[1][2]*qiTorqueJ[0] + qiRotationMatrix[2][2]*qiTorqueJ[1] + qiRotationMatrix[0][2]*qiTorqueJ[2],
+                                   qiRotationMatrix[1][0]*qiTorqueJ[0] + qiRotationMatrix[2][0]*qiTorqueJ[1] + qiRotationMatrix[0][0]*qiTorqueJ[2]);
    }
 }

 /**
 * Compute the self energy and self torque.
 */
-__device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
+__device__ void computeSelfEnergyAndTorque(AtomData& atom1, int atomIndex, real& energy, const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole) {
    real term = 2*EWALD_ALPHA*EWALD_ALPHA;
    real fterm = -EWALD_ALPHA/SQRT_PI;
    real cii = atom1.q*atom1.q;
-    real dii = dot(atom1.dipole, atom1.dipole);
+    real3 dipole = make_real3(labFrameDipole[atomIndex*3], labFrameDipole[atomIndex*3+1], labFrameDipole[atomIndex*3+2]);
+    real dii = dot(dipole, dipole);
 #ifdef INCLUDE_QUADRUPOLES
-    real qii = 2*(atom1.quadrupoleXX*atom1.quadrupoleXX +
-                  atom1.quadrupoleYY*atom1.quadrupoleYY +
-                  atom1.quadrupoleXX*atom1.quadrupoleYY +
-                  atom1.quadrupoleXY*atom1.quadrupoleXY +
-                  atom1.quadrupoleXZ*atom1.quadrupoleXZ +
-                  atom1.quadrupoleYZ*atom1.quadrupoleYZ);
+    real quadrupoleXX = labFrameQuadrupole[atomIndex*5];
+    real quadrupoleXY = labFrameQuadrupole[atomIndex*5+1];
+    real quadrupoleXZ = labFrameQuadrupole[atomIndex*5+2];
+    real quadrupoleYY = labFrameQuadrupole[atomIndex*5+3];
+    real quadrupoleYZ = labFrameQuadrupole[atomIndex*5+4];
+    real qii = 2*(quadrupoleXX*quadrupoleXX +
+                  quadrupoleYY*quadrupoleYY +
+                  quadrupoleXX*quadrupoleYY +
+                  quadrupoleXY*quadrupoleXY +
+                  quadrupoleXZ*quadrupoleXZ +
+                  quadrupoleYZ*quadrupoleYZ);
 #else
    real qii = 0;
 #endif
-    real uii = dot(atom1.dipole, atom1.inducedDipole);
+    real uii = dot(dipole, atom1.inducedDipole);
    real selfEnergy = (cii + term*(dii/3 + 2*term*qii/5));
    selfEnergy += term*uii/3;
    selfEnergy *= fterm;
@@ -189,7 +472,7 @@ __device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
    // self-torque for PME

    real3 ui = atom1.inducedDipole+atom1.inducedDipolePolar;
-    atom1.torque += ((2/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI)*cross(atom1.dipole, ui);
+    atom1.torque += ((2/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI)*cross(dipole, ui);
 }

 /**
@@ -204,8 +487,9 @@ extern "C" __global__ void computeElectrostatics(
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
        const unsigned int* __restrict__ interactingAtoms,
 #endif
-        const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
-        const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
+        const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ sphericalDipole,
+        const real* __restrict__ sphericalQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar,
+        const float2* __restrict__ dampingAndThole) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
@@ -223,7 +507,7 @@ extern "C" __global__ void computeElectrostatics(
        const unsigned int y = tileIndices.y;
        AtomData data;
        unsigned int atom1 = x*TILE_SIZE + tgx;
-        loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+        loadAtomData(data, atom1, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
        data.force = make_real3(0);
        data.torque = make_real3(0);
        uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx];
@@ -233,13 +517,13 @@ extern "C" __global__ void computeElectrostatics(

            localData[threadIdx.x].pos = data.pos;
            localData[threadIdx.x].q = data.q;
-            localData[threadIdx.x].dipole = data.dipole;
+            localData[threadIdx.x].sphericalDipole = data.sphericalDipole;
 #ifdef INCLUDE_QUADRUPOLES
-            localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
-            localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
-            localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
-            localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
-            localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
+            localData[threadIdx.x].sphericalQuadrupole[0] = data.sphericalQuadrupole[0];
+            localData[threadIdx.x].sphericalQuadrupole[1] = data.sphericalQuadrupole[1];
+            localData[threadIdx.x].sphericalQuadrupole[2] = data.sphericalQuadrupole[2];
+            localData[threadIdx.x].sphericalQuadrupole[3] = data.sphericalQuadrupole[3];
+            localData[threadIdx.x].sphericalQuadrupole[4] = data.sphericalQuadrupole[4];
 #endif
            localData[threadIdx.x].inducedDipole = data.inducedDipole;
            localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
@@ -258,7 +542,7 @@ extern "C" __global__ void computeElectrostatics(
                }
            }
            if (atom1 < NUM_ATOMS)
-                computeSelfEnergyAndTorque(data, energy);
+                computeSelfEnergyAndTorque(data, atom1, energy, labFrameDipole, labFrameQuadrupole);
            data.force *= -ENERGY_SCALE_FACTOR;
            data.torque *= ENERGY_SCALE_FACTOR;
            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
@@ -272,7 +556,7 @@ extern "C" __global__ void computeElectrostatics(
            // This is an off-diagonal tile.

            unsigned int j = y*TILE_SIZE + tgx;
-            loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            loadAtomData(localData[threadIdx.x], j, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
            localData[threadIdx.x].force = make_real3(0);
            localData[threadIdx.x].torque = make_real3(0);
            unsigned int tj = tgx;
@@ -366,7 +650,7 @@ extern "C" __global__ void computeElectrostatics(
            // Load atom data for this tile.

            AtomData data;
-            loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            loadAtomData(data, atom1, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
            data.force = make_real3(0);
            data.torque = make_real3(0);
 #ifdef USE_CUTOFF
@@ -375,7 +659,7 @@ extern "C" __global__ void computeElectrostatics(
            unsigned int j = y*TILE_SIZE + tgx;
 #endif
            atomIndices[threadIdx.x] = j;
-            loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            loadAtomData(localData[threadIdx.x], j, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
            localData[threadIdx.x].force = make_real3(0);
            localData[threadIdx.x].torque = make_real3(0);


--- a/plugins/amoeba/platforms/cuda/src/kernels/sphericalMultipoles.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/sphericalMultipoles.cu
+__device__ void buildQIRotationMatrix(real3 deltaR, real rInv, real (&rotationMatrix)[3][3]) {
+    real3 vectorZ = deltaR*rInv;
+    real3 vectorX = vectorZ;
+    if (deltaR.y != 0 || deltaR.z != 0)
+        vectorX.x += 1;
+    else
+        vectorX.y += 1;
+
+    vectorX -= vectorZ*dot(vectorX, vectorZ);
+    vectorX = normalize(vectorX);
+    real3 vectorY = cross(vectorZ, vectorX);
+
+    // Reorder the Cartesian {x,y,z} dipole rotation matrix, to account
+    // for spherical harmonic ordering {z,x,y}.
+    rotationMatrix[0][0] = vectorZ.z;
+    rotationMatrix[0][1] = vectorZ.x;
+    rotationMatrix[0][2] = vectorZ.y;
+    rotationMatrix[1][0] = vectorX.z;
+    rotationMatrix[1][1] = vectorX.x;
+    rotationMatrix[1][2] = vectorX.y;
+    rotationMatrix[2][0] = vectorY.z;
+    rotationMatrix[2][1] = vectorY.x;
+    rotationMatrix[2][2] = vectorY.y;
+}
+
+__device__ real3 rotateDipole(real3& dipole, const real (&rotationMatrix)[3][3]) {
+    return make_real3(rotationMatrix[0][0]*dipole.x + rotationMatrix[0][1]*dipole.y + rotationMatrix[0][2]*dipole.z,
+                      rotationMatrix[1][0]*dipole.x + rotationMatrix[1][1]*dipole.y + rotationMatrix[1][2]*dipole.z,
+                      rotationMatrix[2][0]*dipole.x + rotationMatrix[2][1]*dipole.y + rotationMatrix[2][2]*dipole.z);
+}
+
+
+__device__ void rotateQuadupoles(const real (&rotationMatrix)[3][3], const real* quad1, const real* quad2, real* rotated1, real* rotated2) {
+    real sqrtThree = SQRT((real) 3);
+    real element;
+    element = 0.5f*(3.0f*rotationMatrix[0][0]*rotationMatrix[0][0] - 1.0f);
+    rotated1[0] += quad1[0]*element;
+    rotated2[0] += quad2[0]*element;
+    element = sqrtThree*rotationMatrix[0][0]*rotationMatrix[0][1];
+    rotated1[0] += quad1[1]*element;
+    rotated2[0] += quad2[1]*element;
+    element = sqrtThree*rotationMatrix[0][0]*rotationMatrix[0][2];
+    rotated1[0] += quad1[2]*element;
+    rotated2[0] += quad2[2]*element;
+    element = 0.5f*sqrtThree*(rotationMatrix[0][1]*rotationMatrix[0][1] - rotationMatrix[0][2]*rotationMatrix[0][2]);
+    rotated1[0] += quad1[3]*element;
+    rotated2[0] += quad2[3]*element;
+    element = sqrtThree*rotationMatrix[0][1]*rotationMatrix[0][2];
+    rotated1[0] += quad1[4]*element;
+    rotated2[0] += quad2[4]*element;
+    element = sqrtThree*rotationMatrix[0][0]*rotationMatrix[1][0];
+    rotated1[1] += quad1[0]*element;
+    rotated2[1] += quad2[0]*element;
+    element = rotationMatrix[1][0]*rotationMatrix[0][1] + rotationMatrix[0][0]*rotationMatrix[1][1];
+    rotated1[1] += quad1[1]*element;
+    rotated2[1] += quad2[1]*element;
+    element = rotationMatrix[1][0]*rotationMatrix[0][2] + rotationMatrix[0][0]*rotationMatrix[1][2];
+    rotated1[1] += quad1[2]*element;
+    rotated2[1] += quad2[2]*element;
+    element = rotationMatrix[0][1]*rotationMatrix[1][1] - rotationMatrix[0][2]*rotationMatrix[1][2];
+    rotated1[1] += quad1[3]*element;
+    rotated2[1] += quad2[3]*element;
+    element = rotationMatrix[1][1]*rotationMatrix[0][2] + rotationMatrix[0][1]*rotationMatrix[1][2];
+    rotated1[1] += quad1[4]*element;
+    rotated2[1] += quad2[4]*element;
+    element = sqrtThree*rotationMatrix[0][0]*rotationMatrix[2][0];
+    rotated1[2] += quad1[0]*element;
+    rotated2[2] += quad2[0]*element;
+    element = rotationMatrix[2][0]*rotationMatrix[0][1] + rotationMatrix[0][0]*rotationMatrix[2][1];
+    rotated1[2] += quad1[1]*element;
+    rotated2[2] += quad2[1]*element;
+    element = rotationMatrix[2][0]*rotationMatrix[0][2] + rotationMatrix[0][0]*rotationMatrix[2][2];
+    rotated1[2] += quad1[2]*element;
+    rotated2[2] += quad2[2]*element;
+    element = rotationMatrix[0][1]*rotationMatrix[2][1] - rotationMatrix[0][2]*rotationMatrix[2][2];
+    rotated1[2] += quad1[3]*element;
+    rotated2[2] += quad2[3]*element;
+    element = rotationMatrix[2][1]*rotationMatrix[0][2] + rotationMatrix[0][1]*rotationMatrix[2][2];
+    rotated1[2] += quad1[4]*element;
+    rotated2[2] += quad2[4]*element;
+    element = 0.5f*sqrtThree*(rotationMatrix[1][0]*rotationMatrix[1][0] - rotationMatrix[2][0]*rotationMatrix[2][0]);
+    rotated1[3] += quad1[0]*element;
+    rotated2[3] += quad2[0]*element;
+    element = rotationMatrix[1][0]*rotationMatrix[1][1] - rotationMatrix[2][0]*rotationMatrix[2][1];
+    rotated1[3] += quad1[1]*element;
+    rotated2[3] += quad2[1]*element;
+    element = rotationMatrix[1][0]*rotationMatrix[1][2] - rotationMatrix[2][0]*rotationMatrix[2][2];
+    rotated1[3] += quad1[2]*element;
+    rotated2[3] += quad2[2]*element;
+    element = 0.5f*(rotationMatrix[1][1]*rotationMatrix[1][1] - rotationMatrix[2][1]*rotationMatrix[2][1] - rotationMatrix[1][2]*rotationMatrix[1][2] + rotationMatrix[2][2]*rotationMatrix[2][2]);
+    rotated1[3] += quad1[3]*element;
+    rotated2[3] += quad2[3]*element;
+    element = rotationMatrix[1][1]*rotationMatrix[1][2] - rotationMatrix[2][1]*rotationMatrix[2][2];
+    rotated1[3] += quad1[4]*element;
+    rotated2[3] += quad2[4]*element;
+    element = sqrtThree*rotationMatrix[1][0]*rotationMatrix[2][0];
+    rotated1[4] += quad1[0]*element;
+    rotated2[4] += quad2[0]*element;
+    element = rotationMatrix[2][0]*rotationMatrix[1][1] + rotationMatrix[1][0]*rotationMatrix[2][1];
+    rotated1[4] += quad1[1]*element;
+    rotated2[4] += quad2[1]*element;
+    element = rotationMatrix[2][0]*rotationMatrix[1][2] + rotationMatrix[1][0]*rotationMatrix[2][2];
+    rotated1[4] += quad1[2]*element;
+    rotated2[4] += quad2[2]*element;
+    element = rotationMatrix[1][1]*rotationMatrix[2][1] - rotationMatrix[1][2]*rotationMatrix[2][2];
+    rotated1[4] += quad1[3]*element;
+    rotated2[4] += quad2[3]*element;
+    element = rotationMatrix[2][1]*rotationMatrix[1][2] + rotationMatrix[1][1]*rotationMatrix[2][2];
+    rotated1[4] += quad1[4]*element;
+    rotated2[4] += quad2[4]*element;
+}
--- a/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.cpp
+++ b/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.cpp
@@ -6676,6 +6676,6 @@ RealOpenMM AmoebaReferencePmeMultipoleForce::calculateElectrostatic(const vector
    energy += computeReciprocalSpaceInducedDipoleForceAndEnergy(getPolarizationType(), particleData, forces, torques);
    energy += computeReciprocalSpaceFixedMultipoleForceAndEnergy(particleData, forces, torques);
    energy += calculatePmeSelfEnergy(particleData);
-
+ 
    return energy;
 }