Python 2/3 compatibility in single code base, plus python 3 testing on travis.

b7088b74 · peastman · Robert McGibbon · 4c00b312 · b7088b74 · b7088b74
Commit b7088b74 authored Aug 10, 2015 by peastman Committed by Robert McGibbon Aug 27, 2015
20 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/sphericalMultipoles.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/sphericalMultipoles.cu
+__device__ void buildQIRotationMatrix(real3 deltaR, real rInv, real (&rotationMatrix)[3][3]) {
+    real3 vectorZ = deltaR*rInv;
+    real3 vectorX = vectorZ;
+    if (deltaR.y != 0 || deltaR.z != 0)
+        vectorX.x += 1;
+    else
+        vectorX.y += 1;
+    vectorX -= vectorZ*dot(vectorX, vectorZ);
+    vectorX = normalize(vectorX);
+    real3 vectorY = cross(vectorZ, vectorX);
+    // Reorder the Cartesian {x,y,z} dipole rotation matrix, to account
+    // for spherical harmonic ordering {z,x,y}.
+    rotationMatrix[0][0] = vectorZ.z;
+    rotationMatrix[0][1] = vectorZ.x;
+    rotationMatrix[0][2] = vectorZ.y;
+    rotationMatrix[1][0] = vectorX.z;
+    rotationMatrix[1][1] = vectorX.x;
+    rotationMatrix[1][2] = vectorX.y;
+    rotationMatrix[2][0] = vectorY.z;
+    rotationMatrix[2][1] = vectorY.x;
+    rotationMatrix[2][2] = vectorY.y;
+}
+__device__ real3 rotateDipole(real3& dipole, const real (&rotationMatrix)[3][3]) {
+    return make_real3(rotationMatrix[0][0]*dipole.x + rotationMatrix[0][1]*dipole.y + rotationMatrix[0][2]*dipole.z,
+                      rotationMatrix[1][0]*dipole.x + rotationMatrix[1][1]*dipole.y + rotationMatrix[1][2]*dipole.z,
+                      rotationMatrix[2][0]*dipole.x + rotationMatrix[2][1]*dipole.y + rotationMatrix[2][2]*dipole.z);
+}
+__device__ void rotateQuadupoles(const real (&rotationMatrix)[3][3], const real* quad1, const real* quad2, real* rotated1, real* rotated2) {
+    real sqrtThree = SQRT((real) 3);
+    real element;
+    element = 0.5f*(3.0f*rotationMatrix[0][0]*rotationMatrix[0][0] - 1.0f);
+    rotated1[0] += quad1[0]*element;
+    rotated2[0] += quad2[0]*element;
+    element = sqrtThree*rotationMatrix[0][0]*rotationMatrix[0][1];
+    rotated1[0] += quad1[1]*element;
+    rotated2[0] += quad2[1]*element;
+    element = sqrtThree*rotationMatrix[0][0]*rotationMatrix[0][2];
+    rotated1[0] += quad1[2]*element;
+    rotated2[0] += quad2[2]*element;
+    element = 0.5f*sqrtThree*(rotationMatrix[0][1]*rotationMatrix[0][1] - rotationMatrix[0][2]*rotationMatrix[0][2]);
+    rotated1[0] += quad1[3]*element;
+    rotated2[0] += quad2[3]*element;
+    element = sqrtThree*rotationMatrix[0][1]*rotationMatrix[0][2];
+    rotated1[0] += quad1[4]*element;
+    rotated2[0] += quad2[4]*element;
+    element = sqrtThree*rotationMatrix[0][0]*rotationMatrix[1][0];
+    rotated1[1] += quad1[0]*element;
+    rotated2[1] += quad2[0]*element;
+    element = rotationMatrix[1][0]*rotationMatrix[0][1] + rotationMatrix[0][0]*rotationMatrix[1][1];
+    rotated1[1] += quad1[1]*element;
+    rotated2[1] += quad2[1]*element;
+    element = rotationMatrix[1][0]*rotationMatrix[0][2] + rotationMatrix[0][0]*rotationMatrix[1][2];
+    rotated1[1] += quad1[2]*element;
+    rotated2[1] += quad2[2]*element;
+    element = rotationMatrix[0][1]*rotationMatrix[1][1] - rotationMatrix[0][2]*rotationMatrix[1][2];
+    rotated1[1] += quad1[3]*element;
+    rotated2[1] += quad2[3]*element;
+    element = rotationMatrix[1][1]*rotationMatrix[0][2] + rotationMatrix[0][1]*rotationMatrix[1][2];
+    rotated1[1] += quad1[4]*element;
+    rotated2[1] += quad2[4]*element;
+    element = sqrtThree*rotationMatrix[0][0]*rotationMatrix[2][0];
+    rotated1[2] += quad1[0]*element;
+    rotated2[2] += quad2[0]*element;
+    element = rotationMatrix[2][0]*rotationMatrix[0][1] + rotationMatrix[0][0]*rotationMatrix[2][1];
+    rotated1[2] += quad1[1]*element;
+    rotated2[2] += quad2[1]*element;
+    element = rotationMatrix[2][0]*rotationMatrix[0][2] + rotationMatrix[0][0]*rotationMatrix[2][2];
+    rotated1[2] += quad1[2]*element;
+    rotated2[2] += quad2[2]*element;
+    element = rotationMatrix[0][1]*rotationMatrix[2][1] - rotationMatrix[0][2]*rotationMatrix[2][2];
+    rotated1[2] += quad1[3]*element;
+    rotated2[2] += quad2[3]*element;
+    element = rotationMatrix[2][1]*rotationMatrix[0][2] + rotationMatrix[0][1]*rotationMatrix[2][2];
+    rotated1[2] += quad1[4]*element;
+    rotated2[2] += quad2[4]*element;
+    element = 0.5f*sqrtThree*(rotationMatrix[1][0]*rotationMatrix[1][0] - rotationMatrix[2][0]*rotationMatrix[2][0]);
+    rotated1[3] += quad1[0]*element;
+    rotated2[3] += quad2[0]*element;
+    element = rotationMatrix[1][0]*rotationMatrix[1][1] - rotationMatrix[2][0]*rotationMatrix[2][1];
+    rotated1[3] += quad1[1]*element;
+    rotated2[3] += quad2[1]*element;
+    element = rotationMatrix[1][0]*rotationMatrix[1][2] - rotationMatrix[2][0]*rotationMatrix[2][2];
+    rotated1[3] += quad1[2]*element;
+    rotated2[3] += quad2[2]*element;
+    element = 0.5f*(rotationMatrix[1][1]*rotationMatrix[1][1] - rotationMatrix[2][1]*rotationMatrix[2][1] - rotationMatrix[1][2]*rotationMatrix[1][2] + rotationMatrix[2][2]*rotationMatrix[2][2]);
+    rotated1[3] += quad1[3]*element;
+    rotated2[3] += quad2[3]*element;
+    element = rotationMatrix[1][1]*rotationMatrix[1][2] - rotationMatrix[2][1]*rotationMatrix[2][2];
+    rotated1[3] += quad1[4]*element;
+    rotated2[3] += quad2[4]*element;
+    element = sqrtThree*rotationMatrix[1][0]*rotationMatrix[2][0];
+    rotated1[4] += quad1[0]*element;
+    rotated2[4] += quad2[0]*element;
+    element = rotationMatrix[2][0]*rotationMatrix[1][1] + rotationMatrix[1][0]*rotationMatrix[2][1];
+    rotated1[4] += quad1[1]*element;
+    rotated2[4] += quad2[1]*element;
+    element = rotationMatrix[2][0]*rotationMatrix[1][2] + rotationMatrix[1][0]*rotationMatrix[2][2];
+    rotated1[4] += quad1[2]*element;
+    rotated2[4] += quad2[2]*element;
+    element = rotationMatrix[1][1]*rotationMatrix[2][1] - rotationMatrix[1][2]*rotationMatrix[2][2];
+    rotated1[4] += quad1[3]*element;
+    rotated2[4] += quad2[3]*element;
+    element = rotationMatrix[2][1]*rotationMatrix[1][2] + rotationMatrix[1][1]*rotationMatrix[2][2];
+    rotated1[4] += quad1[4]*element;
+    rotated2[4] += quad2[4]*element;
+}
--- a/plugins/amoeba/platforms/cuda/tests/CMakeLists.txt
+++ b/plugins/amoeba/platforms/cuda/tests/CMakeLists.txt
@@ -17,7 +17,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    IF (APPLE)
        SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ELSE (APPLE)
-        SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+        SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ENDIF (APPLE)
    ADD_TEST(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
    IF (OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS)

--- a/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaGeneralizedKirkwoodForce.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaGeneralizedKirkwoodForce.cpp
@@ -6980,7 +6980,7 @@ static void compareForcesEnergy(std::string& testName, double expectedEnergy, do
                                const std::vector<Vec3>& expectedForces,
                                const std::vector<Vec3>& forces, double tolerance) {
    for (unsigned int ii = 0; ii < forces.size(); ii++) {
-        ASSERT_EQUAL_VEC_MOD(expectedForces[ii], forces[ii], tolerance, testName);
+        ASSERT_EQUAL_VEC(expectedForces[ii], forces[ii], tolerance);
    }
    ASSERT_EQUAL_TOL_MOD(expectedEnergy, energy, tolerance, testName);
 }

--- a/plugins/amoeba/platforms/reference/CMakeLists.txt
+++ b/plugins/amoeba/platforms/reference/CMakeLists.txt
@@ -71,7 +71,7 @@ ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${AP
 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME})
 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${SHARED_AMOEBA_TARGET})
 SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY")
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}")
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
 INSTALL(TARGETS ${SHARED_TARGET} DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/plugins)

--- a/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.cpp
+++ b/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.cpp
@@ -317,6 +317,18 @@ void AmoebaReferenceMultipoleForce::loadParticleData(const vector<RealVec>& part
        particleData[ii].quadrupole[QYZ]      = quadrupoles[9*ii+5];
        particleData[ii].quadrupole[QZZ]      = quadrupoles[9*ii+8];
+        // Form spherical harmonic dipoles from Cartesian moments.
+        particleData[ii].sphericalDipole[0]  = dipoles[3*ii+2]; // z -> Q_10
+        particleData[ii].sphericalDipole[1]  = dipoles[3*ii+0]; // x -> Q_11c
+        particleData[ii].sphericalDipole[2]  = dipoles[3*ii+1]; // y -> Q_11s
+        // Form spherical harmonic quadrupoles from Cartesian moments.
+        particleData[ii].sphericalQuadrupole[0] = quadrupoles[9*ii+8]*3.0; // zz -> Q_20
+        particleData[ii].sphericalQuadrupole[1] = sqrtFourThirds * quadrupoles[9*ii+2]*3.0; // xz -> Q_21c
+        particleData[ii].sphericalQuadrupole[2] = sqrtFourThirds * quadrupoles[9*ii+5]*3.0; // yz -> Q_21s
+        particleData[ii].sphericalQuadrupole[3] = sqrtOneThird * (quadrupoles[9*ii+0] - quadrupoles[9*ii+4])*3.0; // xx-yy -> Q_22c
+        particleData[ii].sphericalQuadrupole[4] = sqrtFourThirds * quadrupoles[9*ii+1]*3.0; // xy -> Q_22s
        particleData[ii].thole                = tholes[ii];
        particleData[ii].dampingFactor        = dampingFactors[ii];
        particleData[ii].polarity             = polarity[ii];
@@ -335,10 +347,10 @@ void AmoebaReferenceMultipoleForce::checkChiralCenterAtParticle(MultipoleParticl
                                                                MultipoleParticleData& particleY) const 
 {
-    if (axisType == AmoebaMultipoleForce::ZThenX) {
+    if (axisType == AmoebaMultipoleForce::ZThenX || particleY.particleIndex == -1) {
        return;
    }
    RealVec deltaAD   = particleI.position - particleY.position;
    RealVec deltaBD   = particleZ.position - particleY.position;
    RealVec deltaCD   = particleX.position - particleY.position;
@@ -347,9 +359,12 @@ void AmoebaReferenceMultipoleForce::checkChiralCenterAtParticle(MultipoleParticl
    RealOpenMM volume = deltaC.dot(deltaAD);
    if (volume < 0.0) {
-        particleI.dipole[1]         *= -1.0; // pole(3,i)
+        particleI.dipole[1] *= -1.0; // pole(3,i)
-        particleI.quadrupole[QXY]   *= -1.0; // pole(6,i)  && pole(8,i)
+        particleI.quadrupole[QXY] *= -1.0; // pole(6,i) && pole(8,i)
-        particleI.quadrupole[QYZ]   *= -1.0; // pole(10,i) && pole(12,i)
+        particleI.quadrupole[QYZ] *= -1.0; // pole(10,i) && pole(12,i)
+        particleI.sphericalDipole[2]     *= -1.0;   // y
+        particleI.sphericalQuadrupole[2] *= -1.0;   // yz
+        particleI.sphericalQuadrupole[4] *= -1.0;   // xy
    }
 }
@@ -359,7 +374,6 @@ void AmoebaReferenceMultipoleForce::checkChiral(vector<MultipoleParticleData>& p
                                                const vector<int>& multipoleAtomZs,
                                                const vector<int>& axisTypes) const 
 {
    for (unsigned int ii = 0; ii < _numParticles; ii++) {
        if (multipoleAtomYs[ii] > -1) {
            checkChiralCenterAtParticle(particleData[ii], axisTypes[ii],
@@ -383,6 +397,7 @@ void AmoebaReferenceMultipoleForce::applyRotationMatrixToParticle(      Multipol
    // compute the vector between the atoms and 1/sqrt(d2), d2 is distance between
    // this atom and the axis atom
    RealVec vectorY;
    RealVec vectorZ = particleZ.position - particleI.position;
    RealVec vectorX = particleX.position - particleI.position;
@@ -491,6 +506,130 @@ void AmoebaReferenceMultipoleForce::applyRotationMatrixToParticle(      Multipol
    particleI.quadrupole[QYY] = rPole[1][1];
    particleI.quadrupole[QYZ] = rPole[1][2];
    particleI.quadrupole[QZZ] = rPole[2][2];
+    RealOpenMM dipoleRotationMatrix[3][3];
+    // Reorder the Cartesian {x,y,z} dipole rotation matrix, to account
+    // for spherical harmonic ordering {z,x,y}.
+    dipoleRotationMatrix[0][0] = vectorZ[2];
+    dipoleRotationMatrix[0][1] = vectorX[2];
+    dipoleRotationMatrix[0][2] = vectorY[2];
+    dipoleRotationMatrix[1][0] = vectorZ[0];
+    dipoleRotationMatrix[1][1] = vectorX[0];
+    dipoleRotationMatrix[1][2] = vectorY[0];
+    dipoleRotationMatrix[2][0] = vectorZ[1];
+    dipoleRotationMatrix[2][1] = vectorX[1];
+    dipoleRotationMatrix[2][2] = vectorY[1];
+    RealOpenMM quadrupoleRotationMatrix[5][5];
+    buildSphericalQuadrupoleRotationMatrix(dipoleRotationMatrix, quadrupoleRotationMatrix);
+    // Rotate the dipoles
+    RealOpenMM rotatedDipole[3];
+    for (int ii = 0; ii < 3; ii++) {
+        RealOpenMM val = 0.0;
+        for (int jj = 0; jj < 3; jj++) {
+            val += dipoleRotationMatrix[ii][jj] * particleI.sphericalDipole[jj];
+        }
+        rotatedDipole[ii] = val;
+    }
+    for (int ii = 0; ii < 3; ii++)
+        particleI.sphericalDipole[ii] = rotatedDipole[ii];
+    // Rotate the quadrupoles
+    RealOpenMM rotatedQuadrupole[5];
+    for (int ii = 0; ii < 5; ii++) {
+        RealOpenMM val = 0.0;
+        for (int jj = 0; jj < 5; jj++) {
+            val += quadrupoleRotationMatrix[ii][jj] * particleI.sphericalQuadrupole[jj];
+        }
+        rotatedQuadrupole[ii] = val;
+    }
+    for (int ii = 0; ii < 5; ii++)
+        particleI.sphericalQuadrupole[ii] = rotatedQuadrupole[ii];
+}
+void AmoebaReferenceMultipoleForce::formQIRotationMatrix(const RealVec& iPosition,
+                                                         const RealVec& jPosition,
+                                                         const RealVec &deltaR,
+                                                         RealOpenMM r,
+                                                         RealOpenMM (&rotationMatrix)[3][3]) const
+{
+    RealVec vectorZ = (deltaR)/r;
+    RealVec vectorX(vectorZ);
+    if ((iPosition[1] != jPosition[1]) || (iPosition[2] != jPosition[2])){
+        vectorX[0] += 1.0;
+    }else{
+        vectorX[1] += 1.0;
+    }
+    RealVec vectorY;
+    RealOpenMM dot      = vectorZ.dot(vectorX);
+    vectorX            -= vectorZ*dot;
+    normalizeRealVec(vectorX);
+    vectorY = vectorZ.cross(vectorX);
+    // Reorder the Cartesian {x,y,z} dipole rotation matrix, to account
+    // for spherical harmonic ordering {z,x,y}.
+    rotationMatrix[0][0] = vectorZ[2];
+    rotationMatrix[0][1] = vectorZ[0];
+    rotationMatrix[0][2] = vectorZ[1];
+    rotationMatrix[1][0] = vectorX[2];
+    rotationMatrix[1][1] = vectorX[0];
+    rotationMatrix[1][2] = vectorX[1];
+    rotationMatrix[2][0] = vectorY[2];
+    rotationMatrix[2][1] = vectorY[0];
+    rotationMatrix[2][2] = vectorY[1];
+}
+void AmoebaReferenceMultipoleForce::buildSphericalQuadrupoleRotationMatrix(const RealOpenMM (&D1)[3][3], RealOpenMM (&D2)[5][5]) const
+{
+    D2[0][0] = 0.5*(3.0*D1[0][0]*D1[0][0] - 1.0);
+    D2[1][0] = sqrtThree*D1[0][0]*D1[1][0];
+    D2[2][0] = sqrtThree*D1[0][0]*D1[2][0];
+    D2[3][0] = 0.5*sqrtThree*(D1[1][0]*D1[1][0] - D1[2][0]*D1[2][0]);
+    D2[4][0] = sqrtThree*D1[1][0]*D1[2][0];
+    D2[0][1] = sqrtThree*D1[0][0]*D1[0][1];
+    D2[1][1] = D1[1][0]*D1[0][1] + D1[0][0]*D1[1][1];
+    D2[2][1] = D1[2][0]*D1[0][1] + D1[0][0]*D1[2][1];
+    D2[3][1] = D1[1][0]*D1[1][1] - D1[2][0]*D1[2][1];
+    D2[4][1] = D1[2][0]*D1[1][1] + D1[1][0]*D1[2][1];
+    D2[0][2] = sqrtThree*D1[0][0]*D1[0][2];
+    D2[1][2] = D1[1][0]*D1[0][2] + D1[0][0]*D1[1][2];
+    D2[2][2] = D1[2][0]*D1[0][2] + D1[0][0]*D1[2][2];
+    D2[3][2] = D1[1][0]*D1[1][2] - D1[2][0]*D1[2][2];
+    D2[4][2] = D1[2][0]*D1[1][2] + D1[1][0]*D1[2][2];
+    D2[0][3] = 0.5*sqrtThree*(D1[0][1]*D1[0][1] - D1[0][2]*D1[0][2]);
+    D2[1][3] = D1[0][1]*D1[1][1] - D1[0][2]*D1[1][2];
+    D2[2][3] = D1[0][1]*D1[2][1] - D1[0][2]*D1[2][2];
+    D2[3][3] = 0.5*(D1[1][1]*D1[1][1] - D1[2][1]*D1[2][1] - D1[1][2]*D1[1][2] + D1[2][2]*D1[2][2]);
+    D2[4][3] = D1[1][1]*D1[2][1] - D1[1][2]*D1[2][2];
+    D2[0][4] = sqrtThree*D1[0][1]*D1[0][2];
+    D2[1][4] = D1[1][1]*D1[0][2] + D1[0][1]*D1[1][2];
+    D2[2][4] = D1[2][1]*D1[0][2] + D1[0][1]*D1[2][2];
+    D2[3][4] = D1[1][1]*D1[1][2] - D1[2][1]*D1[2][2];
+    D2[4][4] = D1[2][1]*D1[1][2] + D1[1][1]*D1[2][2];
+}
+void AmoebaReferenceMultipoleForce::buildPartialSphericalQuadrupoleRotationMatrix(const RealOpenMM (&D1)[3][3], RealOpenMM (&D2)[3][5]) const
+{
+    D2[0][0] = 0.5*(3.0*D1[0][0]*D1[0][0] - 1.0);
+    D2[0][1] = sqrtThree*D1[0][0]*D1[0][1];
+    D2[0][2] = sqrtThree*D1[0][0]*D1[0][2];
+    D2[0][3] = 0.5*sqrtThree*(D1[0][1]*D1[0][1] - D1[0][2]*D1[0][2]);
+    D2[0][4] = sqrtThree*D1[0][1]*D1[0][2];
+    D2[1][0] = sqrtThree*D1[0][0]*D1[1][0];
+    D2[1][1] = D1[1][0]*D1[0][1] + D1[0][0]*D1[1][1];
+    D2[1][2] = D1[1][0]*D1[0][2] + D1[0][0]*D1[1][2];
+    D2[1][3] = D1[0][1]*D1[1][1] - D1[0][2]*D1[1][2];
+    D2[1][4] = D1[1][1]*D1[0][2] + D1[0][1]*D1[1][2];
+    D2[2][0] = sqrtThree*D1[0][0]*D1[2][0];
+    D2[2][1] = D1[2][0]*D1[0][1] + D1[0][0]*D1[2][1];
+    D2[2][2] = D1[2][0]*D1[0][2] + D1[0][0]*D1[2][2];
+    D2[2][3] = D1[0][1]*D1[2][1] - D1[0][2]*D1[2][2];
+    D2[2][4] = D1[2][1]*D1[0][2] + D1[0][1]*D1[2][2];
 }
 void AmoebaReferenceMultipoleForce::applyRotationMatrix(vector<MultipoleParticleData>& particleData,
@@ -552,6 +691,7 @@ void AmoebaReferenceMultipoleForce::calculateFixedMultipoleFieldPairIxn(const Mu
    RealVec deltaR    = particleJ.position - particleI.position;
    RealOpenMM r      = SQRT(deltaR.dot(deltaR));
    vector<RealOpenMM> rrI(3);
    // get scaling factors, if needed
@@ -907,356 +1047,389 @@ RealOpenMM AmoebaReferenceMultipoleForce::calculateElectrostaticPairIxn(const Mu
                                                                        vector<RealVec>& forces,
                                                                        vector<RealVec>& torque) const 
 {
-    RealOpenMM temp3,temp5,temp7;
-    RealOpenMM gl[9],gli[7],glip[7];
-    RealOpenMM sc[10],sci[8],scip[8];
-    RealOpenMM gf[7],gfi[6],gti[6];
    unsigned int iIndex = particleI.particleIndex;
    unsigned int kIndex = particleK.particleIndex;
-    RealVec delta       = particleK.position - particleI.position;
+    RealVec deltaR = particleK.position - particleI.position;
-    RealOpenMM r2       = delta.dot(delta);
+    RealOpenMM r2  = deltaR.dot(deltaR);
+    RealOpenMM r   = SQRT(r2);
-    // set conversion factor, cutoff and switching coefficients
+    // Start by constructing rotation matrices to put dipoles and
-    RealOpenMM f        = _electric/_dielectric;
+    // quadrupoles into the QI frame, from the lab frame.
+    RealOpenMM qiRotationMatrix1[3][3];
-    // set scale factors for permanent multipole and induced terms
+    formQIRotationMatrix(particleI.position, particleK.position, deltaR, r, qiRotationMatrix1);
+    RealOpenMM qiRotationMatrix2[5][5];
-    RealOpenMM pdi      = particleI.dampingFactor;
+    buildSphericalQuadrupoleRotationMatrix(qiRotationMatrix1, qiRotationMatrix2);
-    RealOpenMM pti      = particleI.thole;
+    // The force rotation matrix rotates the QI forces into the lab
+    // frame, and makes sure the result is in {x,y,z} ordering. Its
-    // apply Thole polarization damping to scale factors
+    // transpose is used to rotate the induced dipoles to the QI frame.
+    RealOpenMM forceRotationMatrix[3][3];
-    RealOpenMM r        = SQRT(r2);
+    forceRotationMatrix[0][0] = qiRotationMatrix1[1][1];
-    RealOpenMM rr1      = 1.0/r;
+    forceRotationMatrix[0][1] = qiRotationMatrix1[2][1];
-    RealOpenMM rr3      = rr1/r2;
+    forceRotationMatrix[0][2] = qiRotationMatrix1[0][1];
-    RealOpenMM rr5      = 3.0*rr3/r2;
+    forceRotationMatrix[1][0] = qiRotationMatrix1[1][2];
-    RealOpenMM rr7      = 5.0*rr5/r2;
+    forceRotationMatrix[1][1] = qiRotationMatrix1[2][2];
-    RealOpenMM rr9      = 7.0*rr7/r2;
+    forceRotationMatrix[1][2] = qiRotationMatrix1[0][2];
-    RealOpenMM rr11     = 9.0*rr9/r2;
+    forceRotationMatrix[2][0] = qiRotationMatrix1[1][0];
+    forceRotationMatrix[2][1] = qiRotationMatrix1[2][0];
-    RealOpenMM scale3   = 1.0;
+    forceRotationMatrix[2][2] = qiRotationMatrix1[0][0];
-    RealOpenMM scale5   = 1.0;
+    // For efficiency, we go ahead and cache that transposed version
-    RealOpenMM scale7   = 1.0;
+    // now, because we need to do 4 rotations in total (I,J, and p,d).
+    // We also fold in the factor of 0.5 needed to average the p and d
-    RealVec ddsc3(0.0, 0.0, 0.0);
+    // components.
-    RealVec ddsc5(0.0, 0.0, 0.0);
+    RealOpenMM inducedDipoleRotationMatrix[3][3];
-    RealVec ddsc7(0.0, 0.0, 0.0);
+    inducedDipoleRotationMatrix[0][0] = 0.5*qiRotationMatrix1[0][1];
+    inducedDipoleRotationMatrix[0][1] = 0.5*qiRotationMatrix1[0][2];
-    RealOpenMM damp = particleI.dampingFactor*particleK.dampingFactor;
+    inducedDipoleRotationMatrix[0][2] = 0.5*qiRotationMatrix1[0][0];
-    if (damp != 0.0) {
+    inducedDipoleRotationMatrix[1][0] = 0.5*qiRotationMatrix1[1][1];
-        RealOpenMM pgamma = particleI.thole < particleK.thole ? particleI.thole : particleK.thole;
+    inducedDipoleRotationMatrix[1][1] = 0.5*qiRotationMatrix1[1][2];
-        RealOpenMM ratio  = (r/damp);
+    inducedDipoleRotationMatrix[1][2] = 0.5*qiRotationMatrix1[1][0];
-        damp              = -pgamma * ratio*ratio*ratio;
+    inducedDipoleRotationMatrix[2][0] = 0.5*qiRotationMatrix1[2][1];
-        if (damp > -50.0) {
+    inducedDipoleRotationMatrix[2][1] = 0.5*qiRotationMatrix1[2][2];
-            RealOpenMM expdamp = EXP(damp);
+    inducedDipoleRotationMatrix[2][2] = 0.5*qiRotationMatrix1[2][0];
-            scale3 = 1.0 - expdamp;
-            scale5 = 1.0 - (1.0-damp)*expdamp;
+    // Rotate the induced dipoles to the QI frame.
-            scale7 = 1.0 - (1.0-damp+0.6*damp*damp)*expdamp;
+    RealOpenMM qiUindI[3], qiUindJ[3], qiUinpI[3], qiUinpJ[3];
-            temp3 = -3.0 * damp * expdamp / r2;
+    for (int ii = 0; ii < 3; ii++) {
-            temp5 = -damp;
+        RealOpenMM valIP = 0.0;
-            temp7 = -0.2 - 0.6*damp;
+        RealOpenMM valID = 0.0;
+        RealOpenMM valJP = 0.0;
-            ddsc3 = delta*temp3;
+        RealOpenMM valJD = 0.0;
-            ddsc5 = ddsc3*temp5;
+        for (int jj = 0; jj < 3; jj++) {
-            ddsc7 = ddsc5*temp7;
+            valIP += inducedDipoleRotationMatrix[ii][jj] * _inducedDipolePolar[iIndex][jj];
+            valID += inducedDipoleRotationMatrix[ii][jj] * _inducedDipole[iIndex][jj];
+            valJP += inducedDipoleRotationMatrix[ii][jj] * _inducedDipolePolar[kIndex][jj];
+            valJD += inducedDipoleRotationMatrix[ii][jj] * _inducedDipole[kIndex][jj];
        }
+        qiUindI[ii] = valID;
+        qiUinpI[ii] = valIP;
+        qiUindJ[ii] = valJD;
+        qiUinpJ[ii] = valJP;
    }
-    RealOpenMM scale3i = scale3*scalingFactors[U_SCALE];
-    RealOpenMM scale5i = scale5*scalingFactors[U_SCALE];
-    RealOpenMM scale7i = scale7*scalingFactors[U_SCALE];
-    RealOpenMM dsc3    = scale3*scalingFactors[D_SCALE];
-    RealOpenMM dsc5    = scale5*scalingFactors[D_SCALE];
-    RealOpenMM dsc7    = scale7*scalingFactors[D_SCALE];
-    RealOpenMM psc3    = scale3*scalingFactors[P_SCALE];
-    RealOpenMM psc5    = scale5*scalingFactors[P_SCALE];
-    RealOpenMM psc7    = scale7*scalingFactors[P_SCALE];
-    // construct necessary auxiliary vectors
-    RealVec dixdk      = particleI.dipole.cross(particleK.dipole);
-    RealVec dixuk      = particleI.dipole.cross(_inducedDipole[kIndex]);
-    RealVec dkxui      = particleK.dipole.cross(_inducedDipole[iIndex]);
-    RealVec dixukp     = particleI.dipole.cross(_inducedDipolePolar[kIndex]);
-    RealVec dkxuip     = particleK.dipole.cross(_inducedDipolePolar[iIndex]);
-    RealVec dixr       = particleI.dipole.cross(delta);
-    RealVec dkxr       = particleK.dipole.cross(delta);
-    RealVec qir;
-    qir[0] = particleI.quadrupole[QXX]*delta[0] + particleI.quadrupole[QXY]*delta[1] + particleI.quadrupole[QXZ]*delta[2];
-    qir[1] = particleI.quadrupole[QXY]*delta[0] + particleI.quadrupole[QYY]*delta[1] + particleI.quadrupole[QYZ]*delta[2];
-    qir[2] = particleI.quadrupole[QXZ]*delta[0] + particleI.quadrupole[QYZ]*delta[1] + particleI.quadrupole[QZZ]*delta[2];
-    RealVec qkr;
-    qkr[0] = particleK.quadrupole[QXX]*delta[0] + particleK.quadrupole[QXY]*delta[1] + particleK.quadrupole[QXZ]*delta[2];
-    qkr[1] = particleK.quadrupole[QXY]*delta[0] + particleK.quadrupole[QYY]*delta[1] + particleK.quadrupole[QYZ]*delta[2];
-    qkr[2] = particleK.quadrupole[QXZ]*delta[0] + particleK.quadrupole[QYZ]*delta[1] + particleK.quadrupole[QZZ]*delta[2];
-    RealVec qiqkr;
-    qiqkr[0] = particleI.quadrupole[QXX]*qkr[0] + particleI.quadrupole[QXY]*qkr[1] + particleI.quadrupole[QXZ]*qkr[2];
-    qiqkr[1] = particleI.quadrupole[QXY]*qkr[0] + particleI.quadrupole[QYY]*qkr[1] + particleI.quadrupole[QYZ]*qkr[2];
-    qiqkr[2] = particleI.quadrupole[QXZ]*qkr[0] + particleI.quadrupole[QYZ]*qkr[1] + particleI.quadrupole[QZZ]*qkr[2];
-    RealVec qkqir;
-    qkqir[0] = particleK.quadrupole[QXX]*qir[0] + particleK.quadrupole[QXY]*qir[1] + particleK.quadrupole[QXZ]*qir[2];
-    qkqir[1] = particleK.quadrupole[QXY]*qir[0] + particleK.quadrupole[QYY]*qir[1] + particleK.quadrupole[QYZ]*qir[2];
-    qkqir[2] = particleK.quadrupole[QXZ]*qir[0] + particleK.quadrupole[QYZ]*qir[1] + particleK.quadrupole[QZZ]*qir[2];
-    RealVec qixqk;
-    qixqk[0] = particleI.quadrupole[QXY]*particleK.quadrupole[QXZ] +
-               particleI.quadrupole[QYY]*particleK.quadrupole[QYZ] +
-               particleI.quadrupole[QYZ]*particleK.quadrupole[QZZ] -
-               particleI.quadrupole[QXZ]*particleK.quadrupole[QXY] -
-               particleI.quadrupole[QYZ]*particleK.quadrupole[QYY] -
-               particleI.quadrupole[QZZ]*particleK.quadrupole[QYZ];
-    qixqk[1] = particleI.quadrupole[QXZ]*particleK.quadrupole[QXX] +
-               particleI.quadrupole[QYZ]*particleK.quadrupole[QXY] +
-               particleI.quadrupole[QZZ]*particleK.quadrupole[QXZ] -
-               particleI.quadrupole[QXX]*particleK.quadrupole[QXZ] -
-               particleI.quadrupole[QXY]*particleK.quadrupole[QYZ] -
-               particleI.quadrupole[QXZ]*particleK.quadrupole[QZZ];
-    qixqk[2] = particleI.quadrupole[QXX]*particleK.quadrupole[QXY] +
-               particleI.quadrupole[QXY]*particleK.quadrupole[QYY] +
-               particleI.quadrupole[QXZ]*particleK.quadrupole[QYZ] -
-               particleI.quadrupole[QXY]*particleK.quadrupole[QXX] -
-               particleI.quadrupole[QYY]*particleK.quadrupole[QXY] -
-               particleI.quadrupole[QYZ]*particleK.quadrupole[QXZ];
-    RealVec rxqir   = delta.cross(qir);
-    RealVec rxqkr   = delta.cross(qkr);
-    RealVec rxqikr  = delta.cross(qiqkr);
-    RealVec rxqkir  = delta.cross(qkqir);
-    RealVec qkrxqir = qkr.cross(qir);
-    RealVec qidk,qkdi;
-    qidk[0] = particleI.quadrupole[QXX]*particleK.dipole[0] + particleI.quadrupole[QXY]*particleK.dipole[1] + particleI.quadrupole[QXZ]*particleK.dipole[2];
-    qidk[1] = particleI.quadrupole[QXY]*particleK.dipole[0] + particleI.quadrupole[QYY]*particleK.dipole[1] + particleI.quadrupole[QYZ]*particleK.dipole[2];
-    qidk[2] = particleI.quadrupole[QXZ]*particleK.dipole[0] + particleI.quadrupole[QYZ]*particleK.dipole[1] + particleI.quadrupole[QZZ]*particleK.dipole[2];
-    qkdi[0] = particleK.quadrupole[QXX]*particleI.dipole[0] + particleK.quadrupole[QXY]*particleI.dipole[1] + particleK.quadrupole[QXZ]*particleI.dipole[2];
-    qkdi[1] = particleK.quadrupole[QXY]*particleI.dipole[0] + particleK.quadrupole[QYY]*particleI.dipole[1] + particleK.quadrupole[QYZ]*particleI.dipole[2];
-    qkdi[2] = particleK.quadrupole[QXZ]*particleI.dipole[0] + particleK.quadrupole[QYZ]*particleI.dipole[1] + particleK.quadrupole[QZZ]*particleI.dipole[2];
-    RealVec qiuk,qkui;
-    qiuk[0] = particleI.quadrupole[QXX]*_inducedDipole[kIndex][0] + particleI.quadrupole[QXY]*_inducedDipole[kIndex][1] + particleI.quadrupole[QXZ]*_inducedDipole[kIndex][2];
-    qiuk[1] = particleI.quadrupole[QXY]*_inducedDipole[kIndex][0] + particleI.quadrupole[QYY]*_inducedDipole[kIndex][1] + particleI.quadrupole[QYZ]*_inducedDipole[kIndex][2];
-    qiuk[2] = particleI.quadrupole[QXZ]*_inducedDipole[kIndex][0] + particleI.quadrupole[QYZ]*_inducedDipole[kIndex][1] + particleI.quadrupole[QZZ]*_inducedDipole[kIndex][2];
-    qkui[0] = particleK.quadrupole[QXX]*_inducedDipole[iIndex][0] + particleK.quadrupole[QXY]*_inducedDipole[iIndex][1] + particleK.quadrupole[QXZ]*_inducedDipole[iIndex][2];
-    qkui[1] = particleK.quadrupole[QXY]*_inducedDipole[iIndex][0] + particleK.quadrupole[QYY]*_inducedDipole[iIndex][1] + particleK.quadrupole[QYZ]*_inducedDipole[iIndex][2];
-    qkui[2] = particleK.quadrupole[QXZ]*_inducedDipole[iIndex][0] + particleK.quadrupole[QYZ]*_inducedDipole[iIndex][1] + particleK.quadrupole[QZZ]*_inducedDipole[iIndex][2];
-    RealVec qiukp,qkuip;
-    qiukp[0] = particleI.quadrupole[QXX]*_inducedDipolePolar[kIndex][0] + particleI.quadrupole[QXY]*_inducedDipolePolar[kIndex][1] + particleI.quadrupole[QXZ]*_inducedDipolePolar[kIndex][2];
-    qiukp[1] = particleI.quadrupole[QXY]*_inducedDipolePolar[kIndex][0] + particleI.quadrupole[QYY]*_inducedDipolePolar[kIndex][1] + particleI.quadrupole[QYZ]*_inducedDipolePolar[kIndex][2];
-    qiukp[2] = particleI.quadrupole[QXZ]*_inducedDipolePolar[kIndex][0] + particleI.quadrupole[QYZ]*_inducedDipolePolar[kIndex][1] + particleI.quadrupole[QZZ]*_inducedDipolePolar[kIndex][2];
-    qkuip[0] = particleK.quadrupole[QXX]*_inducedDipolePolar[iIndex][0] + particleK.quadrupole[QXY]*_inducedDipolePolar[iIndex][1] + particleK.quadrupole[QXZ]*_inducedDipolePolar[iIndex][2];
-    qkuip[1] = particleK.quadrupole[QXY]*_inducedDipolePolar[iIndex][0] + particleK.quadrupole[QYY]*_inducedDipolePolar[iIndex][1] + particleK.quadrupole[QYZ]*_inducedDipolePolar[iIndex][2];
-    qkuip[2] = particleK.quadrupole[QXZ]*_inducedDipolePolar[iIndex][0] + particleK.quadrupole[QYZ]*_inducedDipolePolar[iIndex][1] + particleK.quadrupole[QZZ]*_inducedDipolePolar[iIndex][2];
-    RealVec dixqkr   = particleI.dipole.cross(qkr);
-    RealVec dkxqir   = particleK.dipole.cross(qir);
-    RealVec uixqkr   = _inducedDipole[iIndex].cross(qkr);
-    RealVec ukxqir   = _inducedDipole[kIndex].cross(qir);
-    RealVec uixqkrp  = _inducedDipolePolar[iIndex].cross(qkr);
-    RealVec ukxqirp  = _inducedDipolePolar[kIndex].cross(qir);
-    RealVec rxqidk   = delta.cross(qidk);
-    RealVec rxqkdi   = delta.cross(qkdi);
-    RealVec rxqiuk   = delta.cross(qiuk);
-    RealVec rxqkui   = delta.cross(qkui);
-    RealVec rxqiukp  = delta.cross(qiukp);
-    RealVec rxqkuip  = delta.cross(qkuip);
-    // calculate scalar products for permanent components
-    sc[1] = particleI.dipole.dot(particleK.dipole);
-    sc[2] = particleI.dipole.dot(delta);
-    sc[3] = particleK.dipole.dot(delta);
-    sc[4] = qir.dot(delta);
-    sc[5] = qkr.dot(delta);
-    sc[6] = qir.dot(particleK.dipole);
-    sc[7] = qkr.dot(particleI.dipole);
-    sc[8] = qir.dot(qkr);
-    sc[9] = particleI.quadrupole[QXX]*particleK.quadrupole[QXX] + particleI.quadrupole[QXY]*particleK.quadrupole[QXY] + particleI.quadrupole[QXZ]*particleK.quadrupole[QXZ] +
-            particleI.quadrupole[QXY]*particleK.quadrupole[QXY] + particleI.quadrupole[QYY]*particleK.quadrupole[QYY] + particleI.quadrupole[QYZ]*particleK.quadrupole[QYZ] +
-            particleI.quadrupole[QXZ]*particleK.quadrupole[QXZ] + particleI.quadrupole[QYZ]*particleK.quadrupole[QYZ] + particleI.quadrupole[QZZ]*particleK.quadrupole[QZZ];
-    // calculate scalar products for induced components
-    sci[0] = _inducedDipole[iIndex][0]*particleK.dipole[0] + _inducedDipole[iIndex][1]*particleK.dipole[1] + _inducedDipole[iIndex][2]*particleK.dipole[2] +particleI.dipole[0]*_inducedDipole[kIndex][0] +particleI.dipole[1]*_inducedDipole[kIndex][1] +particleI.dipole[2]*_inducedDipole[kIndex][2];
-    sci[1] = _inducedDipole[iIndex][0]*_inducedDipole[kIndex][0] + _inducedDipole[iIndex][1]*_inducedDipole[kIndex][1] + _inducedDipole[iIndex][2]*_inducedDipole[kIndex][2];
-    sci[2] = _inducedDipole[iIndex][0]*delta[0] + _inducedDipole[iIndex][1]*delta[1] + _inducedDipole[iIndex][2]*delta[2];
-    sci[3] = _inducedDipole[kIndex][0]*delta[0] + _inducedDipole[kIndex][1]*delta[1] + _inducedDipole[kIndex][2]*delta[2];
-    sci[6] = qir[0]*_inducedDipole[kIndex][0] + qir[1]*_inducedDipole[kIndex][1] + qir[2]*_inducedDipole[kIndex][2];
-    sci[7] = qkr[0]*_inducedDipole[iIndex][0] + qkr[1]*_inducedDipole[iIndex][1] + qkr[2]*_inducedDipole[iIndex][2];
-    scip[0] = _inducedDipolePolar[iIndex][0]*particleK.dipole[0] + _inducedDipolePolar[iIndex][1]*particleK.dipole[1] + _inducedDipolePolar[iIndex][2]*particleK.dipole[2] +particleI.dipole[0]*_inducedDipolePolar[kIndex][0] +particleI.dipole[1]*_inducedDipolePolar[kIndex][1] +particleI.dipole[2]*_inducedDipolePolar[kIndex][2];
-    scip[1] = _inducedDipole[iIndex][0]*_inducedDipolePolar[kIndex][0]+_inducedDipole[iIndex][1]*_inducedDipolePolar[kIndex][1] + _inducedDipole[iIndex][2]*_inducedDipolePolar[kIndex][2]+_inducedDipolePolar[iIndex][0]*_inducedDipole[kIndex][0] + _inducedDipolePolar[iIndex][1]*_inducedDipole[kIndex][1]+_inducedDipolePolar[iIndex][2]*_inducedDipole[kIndex][2];
-    scip[2] = _inducedDipolePolar[iIndex][0]*delta[0] + _inducedDipolePolar[iIndex][1]*delta[1] + _inducedDipolePolar[iIndex][2]*delta[2];
-    scip[3] = _inducedDipolePolar[kIndex][0]*delta[0] + _inducedDipolePolar[kIndex][1]*delta[1] + _inducedDipolePolar[kIndex][2]*delta[2];
-    scip[6] = qir[0]*_inducedDipolePolar[kIndex][0] + qir[1]*_inducedDipolePolar[kIndex][1] + qir[2]*_inducedDipolePolar[kIndex][2];
-    scip[7] = qkr[0]*_inducedDipolePolar[iIndex][0] + qkr[1]*_inducedDipolePolar[iIndex][1] + qkr[2]*_inducedDipolePolar[iIndex][2];
-    // calculate the gl functions for permanent components
-    gl[0] = particleI.charge*particleK.charge;
-    gl[1] = particleK.charge*sc[2] - particleI.charge*sc[3];
-    gl[2] = particleI.charge*sc[5] + particleK.charge*sc[4] - sc[2]*sc[3];
-    gl[3] = sc[2]*sc[5] - sc[3]*sc[4];
-    gl[4] = sc[4]*sc[5];
-    gl[5] = -4.0 * sc[8];
-    gl[6] = sc[1];
-    gl[7] = 2.0 * (sc[6]-sc[7]);
-    gl[8] = 2.0 * sc[9];
-    // calculate the gl functions for induced components
-    gli[0] = particleK.charge*sci[2] - particleI.charge*sci[3];
-    gli[1] = -sc[2]*sci[3] - sci[2]*sc[3];
-    gli[2] = sci[2]*sc[5] - sci[3]*sc[4];
-    gli[5] = sci[0];
-    gli[6] = 2.0 * (sci[6]-sci[7]);
-    glip[0] = particleK.charge*scip[2] - particleI.charge*scip[3];
-    glip[1] = -sc[2]*scip[3] - scip[2]*sc[3];
-    glip[2] = scip[2]*sc[5] - scip[3]*sc[4];
-    glip[5] = scip[0];
-    glip[6] = 2.0 * (scip[6]-scip[7]);
-    // compute the energy contributions for this interaction
-    RealOpenMM energy = rr1*gl[0] + rr3*(gl[1]+gl[6]) + rr5*(gl[2]+gl[7]+gl[8]) + rr7*(gl[3]+gl[5]) + rr9*gl[4];
-    energy           *= scalingFactors[M_SCALE];
-    energy           += 0.5*(rr3*(gli[0]+gli[5])*psc3 + rr5*(gli[1]+gli[6])*psc5 + rr7*gli[2]*psc7);
-    energy           *= f;
-    // intermediate variables for the permanent components
-    gf[0] = rr3*gl[0] + rr5*(gl[1]+gl[6]) + rr7*(gl[2]+gl[7]+gl[8]) + rr9*(gl[3]+gl[5]) + rr11*gl[4];
-    gf[1] = -particleK.charge*rr3 + sc[3]*rr5 - sc[5]*rr7;
-    gf[2] =  particleI.charge*rr3 + sc[2]*rr5 + sc[4]*rr7;
-    gf[3] = 2.0*rr5;
-    gf[4] = 2.0*(-particleK.charge*rr5+sc[3]*rr7-sc[5]*rr9);
-    gf[5] = 2.0*(-particleI.charge*rr5-sc[2]*rr7-sc[4]*rr9);
-    gf[6] = 4.0*rr7;
-    // intermediate variables for the induced components
-    gfi[0] = 0.5 * rr5 * ((gli[0]+gli[5])*psc3 + (glip[0]+glip[5])*dsc3 + scip[1]*scale3i)
-           + 0.5 * rr7 * ((gli[6]+gli[1])*psc5 + (glip[6]+glip[1])*dsc5 - (sci[2]*scip[3]+scip[2]*sci[3])*scale5i)
-           + 0.5 * rr9 * (gli[2]*psc7+glip[2]*dsc7);
-    gfi[1] = -rr3*particleK.charge + rr5*sc[3] - rr7*sc[5];
-    gfi[2] =  rr3*particleI.charge + rr5*sc[2] + rr7*sc[4];
-    gfi[3] = 2.0*rr5;
-    gfi[4] = rr7*(sci[3]*psc7+scip[3]*dsc7);
-    gfi[5] = -rr7*(sci[2]*psc7+scip[2]*dsc7);
-    // get the permanent force components
-    RealVec ftm2 = delta*gf[0] + particleI.dipole*gf[1] + particleK.dipole*gf[2] + (qkdi -qidk)*gf[3] + qir*gf[4] +
-                   qkr*gf[5] + (qiqkr+qkqir)*gf[6];
-    // get the induced force components
-    RealVec ftm2i  = delta*gfi[0] + qir*gfi[4] + qkr*gfi[5];
-            ftm2i += (
-         (_inducedDipole[iIndex]*psc3 + _inducedDipolePolar[iIndex]*dsc3)*(-rr3*particleK.charge) +
-         (_inducedDipole[iIndex]*psc5 + _inducedDipolePolar[iIndex]*dsc5)*(rr5*sc[3]) +
-         (_inducedDipole[iIndex]*psc7 + _inducedDipolePolar[iIndex]*dsc7)*(-rr7*sc[5]) +
-         (_inducedDipole[kIndex]*psc3 + _inducedDipolePolar[kIndex]*dsc3)*(rr3*particleI.charge) +
-         (_inducedDipole[kIndex]*psc5 + _inducedDipolePolar[kIndex]*dsc5)*(rr5*sc[2]) +
-         (_inducedDipole[kIndex]*psc7 + _inducedDipolePolar[kIndex]*dsc7)*(rr7*sc[4]) +
-         (_inducedDipolePolar[iIndex]*sci[3] + _inducedDipole[iIndex]*scip[3] +
-          _inducedDipolePolar[kIndex]*sci[2] + _inducedDipole[kIndex]*scip[2])*(rr5*scale5i) +
-         particleI.dipole*((sci[3]*psc5  + scip[3]*dsc5)*rr5) +
-         particleK.dipole*((sci[2]*psc5  + scip[2]*dsc5)*rr5) +
-         ((qkui - qiuk)*psc5 + (qkuip - qiukp)*dsc5)*(gfi[3]))*0.5;
-    // account for partially excluded induced interactions
-    temp3 = rr3 * ((gli[0]+gli[5])*scalingFactors[P_SCALE] +(glip[0]+glip[5])*scalingFactors[D_SCALE]);
-    temp5 = rr5 * ((gli[1]+gli[6])*scalingFactors[P_SCALE] +(glip[1]+glip[6])*scalingFactors[D_SCALE]);
-    temp7 = rr7 * (gli[2]*scalingFactors[P_SCALE] +glip[2]*scalingFactors[D_SCALE]);
-    RealVec fridmp,findmp;
-    fridmp = (ddsc3*temp3 + ddsc5*temp5 + ddsc7*temp7);
-    // find some scaling terms for induced-induced force
-    temp3 =  rr3*scalingFactors[U_SCALE]*scip[1];
-    temp5 = -rr5*scalingFactors[U_SCALE]*(sci[2]*scip[3]+scip[2]*sci[3]);
-    findmp = (ddsc3*temp3 + ddsc5*temp5);
-    // modify induced force for partially excluded interactions
-    ftm2i -= (fridmp + findmp)*0.5;
-    // correction to convert mutual to direct polarization force
-    if (getPolarizationType() == AmoebaReferenceMultipoleForce::Direct) {
+    // The Qtilde intermediates (QI frame multipoles) for atoms I and J
-       RealOpenMM gfd   = (rr5*scip[1]*scale3i - rr7*(scip[2]*sci[3]+sci[2]*scip[3])*scale5i);
+    RealOpenMM qiQI[9], qiQJ[9];
-       temp5            = rr5*scale5i;
+    // Rotate the permanent multipoles to the QI frame.
+    qiQI[0] = particleI.charge;
-       RealVec fdir;
+    qiQJ[0] = particleK.charge;
-       fdir = delta*gfd + (_inducedDipolePolar[iIndex]*sci[3] +
+    for (int ii = 0; ii < 3; ii++) {
-                           _inducedDipole[iIndex]*scip[3] +
+        RealOpenMM valI = 0.0;
-                           _inducedDipolePolar[kIndex]*sci[2] +
+        RealOpenMM valJ = 0.0;
-                           _inducedDipole[kIndex]*scip[2])*temp5;
+        for (int jj = 0; jj < 3; jj++) {
+            valI += qiRotationMatrix1[ii][jj] * particleI.sphericalDipole[jj];
-       ftm2i += (findmp - fdir)*0.5;
+            valJ += qiRotationMatrix1[ii][jj] * particleK.sphericalDipole[jj];
+        }
+        qiQI[ii+1] = valI;
+        qiQJ[ii+1] = valJ;
+    }
+    for (int ii = 0; ii < 5; ii++) {
+        RealOpenMM valI = 0.0;
+        RealOpenMM valJ = 0.0;
+        for (int jj = 0; jj < 5; jj++) {
+            valI += qiRotationMatrix2[ii][jj] * particleI.sphericalQuadrupole[jj];
+            valJ += qiRotationMatrix2[ii][jj] * particleK.sphericalQuadrupole[jj];
+        }
+        qiQI[ii+4] = valI;
+        qiQJ[ii+4] = valJ;
    }
-    // intermediate terms for induced torque on multipoles
+    // The Qtilde{x,y,z} torque intermediates for atoms I and J, which are used to obtain the torques on the permanent moments.
+    RealOpenMM qiQIX[9] = {0.0, qiQI[3], 0.0, -qiQI[1], sqrtThree*qiQI[6], qiQI[8], -sqrtThree*qiQI[4] - qiQI[7], qiQI[6], -qiQI[5]};
-    gti[1] = 0.5*rr5*(sci[3]*psc5+scip[3]*dsc5);
+    RealOpenMM qiQIY[9] = {0.0, -qiQI[2], qiQI[1], 0.0, -sqrtThree*qiQI[5], sqrtThree*qiQI[4] - qiQI[7], -qiQI[8], qiQI[5], qiQI[6]};
-    gti[2] = 0.5*rr5*(sci[2]*psc5+scip[2]*dsc5);
+    RealOpenMM qiQIZ[9] = {0.0, 0.0, -qiQI[3], qiQI[2], 0.0, -qiQI[6], qiQI[5], -2.0*qiQI[8], 2.0*qiQI[7]};
-    gti[3] = gfi[3];
+    RealOpenMM qiQJX[9] = {0.0, qiQJ[3], 0.0, -qiQJ[1], sqrtThree*qiQJ[6], qiQJ[8], -sqrtThree*qiQJ[4] - qiQJ[7], qiQJ[6], -qiQJ[5]};
-    gti[4] = gfi[4];
+    RealOpenMM qiQJY[9] = {0.0, -qiQJ[2], qiQJ[1], 0.0, -sqrtThree*qiQJ[5], sqrtThree*qiQJ[4] - qiQJ[7], -qiQJ[8], qiQJ[5], qiQJ[6]};
-    gti[5] = gfi[5];
+    RealOpenMM qiQJZ[9] = {0.0, 0.0, -qiQJ[3], qiQJ[2], 0.0, -qiQJ[6], qiQJ[5], -2.0*qiQJ[8], 2.0*qiQJ[7]};
-    // get the permanent torque components
+    // The field derivatives at I due to permanent and induced moments on J, and vice-versa.
+    // Also, their derivatives w.r.t. R, which are needed for force calculations
-    RealVec ttm2  =  dixdk*(-rr3) + dixr*gf[1] - rxqir*gf[4] +
+    RealOpenMM Vij[9], Vji[9], VjiR[9], VijR[9];
-                     (dixqkr + dkxqir + rxqidk - qixqk*2.0)*gf[3] -
+    // The field derivatives at I due to only permanent moments on J, and vice-versa.
-                     (rxqikr + qkrxqir)*gf[6];
+    RealOpenMM Vijp[3], Vijd[3], Vjip[3], Vjid[3];
+    RealOpenMM rInvVec[7];
-    RealVec ttm3  =  dixdk*rr3 + dkxr*gf[2] - rxqkr*gf[5] -
-                     (dixqkr + dkxqir + rxqkdi - qixqk*2.0)*gf[3] -
+    RealOpenMM prefac = (_electric/_dielectric);
-                     (rxqkir - qkrxqir)*gf[6];
+    RealOpenMM rInv = 1.0 / r;
-    // get the induced torque components
+    // The rInvVec array is defined such that the ith element is R^-i, with the
+    // dieleectric constant folded in, to avoid conversions later.
-    RealVec ttm2i = (dixuk*psc3 + dixukp*dsc3)*(0.5*(-rr3)) +
+    rInvVec[1] = prefac * rInv;
-                     dixr*gti[1] + 
+    for(int i = 2; i < 7; ++i)
-                     ((ukxqir+rxqiuk)*psc5 + (ukxqirp+rxqiukp)*dsc5)*(0.5*gti[3]) - 
+        rInvVec[i] = rInvVec[i-1] * rInv;
-                     rxqir*gti[4];
+    RealOpenMM mScale = scalingFactors[M_SCALE];
-    RealVec ttm3i = (dkxui*psc3 + dkxuip*dsc3)*(0.5*(-rr3)) +
+    RealOpenMM dScale = scalingFactors[D_SCALE];
-                     dkxr*gti[2] - 
+    RealOpenMM pScale = scalingFactors[P_SCALE];
-                    ((uixqkr + rxqkui)*psc5 + (uixqkrp + rxqkuip)*dsc5)*(0.5*gti[3]) - 
+    RealOpenMM uScale = scalingFactors[U_SCALE];
-                      rxqkr*gti[5];
+    RealOpenMM dmp = particleI.dampingFactor*particleK.dampingFactor;
-    // increment forces and torques
+    RealOpenMM a = particleI.thole < particleK.thole ? particleI.thole : particleK.thole;
-    // remove factor of f from torques and add back in?
+    RealOpenMM u = std::abs(dmp) > 1.0E-5 ? r/dmp : 1E10;
+    RealOpenMM au3 = a*u*u*u;
-    RealVec force   = ftm2*scalingFactors[M_SCALE] + ftm2i;
+    RealOpenMM expau3 = au3 < 50.0 ? EXP(-au3) : 0.0;
-            force  *= f;
+    RealOpenMM a2u6 = au3*au3;
+    RealOpenMM a3u9 = a2u6*au3;
+    // Thole damping factors for energies
+    RealOpenMM thole_c  = 1.0 - expau3;
+    RealOpenMM thole_d0 = 1.0 - expau3*(1.0 + 1.5*au3);
+    RealOpenMM thole_d1 = 1.0 - expau3;
+    RealOpenMM thole_q0 = 1.0 - expau3*(1.0 + au3 + a2u6);
+    RealOpenMM thole_q1 = 1.0 - expau3*(1.0 + au3);
+    // Thole damping factors for derivatives
+    RealOpenMM dthole_c  = 1.0 - expau3*(1.0 + 1.5*au3);
+    RealOpenMM dthole_d0 = 1.0 - expau3*(1.0 + au3 + 1.5*a2u6);
+    RealOpenMM dthole_d1 = 1.0 - expau3*(1.0 + au3);
+    RealOpenMM dthole_q0 = 1.0 - expau3*(1.0 + au3 + 0.25*a2u6 + 0.75*a3u9);
+    RealOpenMM dthole_q1 = 1.0 - expau3*(1.0 + au3 + 0.75*a2u6);
+    // Now we compute the (attenuated) Coulomb operator and its derivatives, contracted with
+    // permanent moments and induced dipoles.  Note that the coefficient of the permanent force
+    // terms is half of the expected value; this is because we compute the interaction of I with
+    // the sum of induced and permanent moments on J, as well as the interaction of J with I's
+    // permanent and induced moments; doing so double counts the permanent-permanent interaction.
+    RealOpenMM ePermCoef, dPermCoef, eUIndCoef, dUIndCoef, eUInpCoef, dUInpCoef;
+    // C-C terms (m=0)
+    ePermCoef = rInvVec[1]*mScale;
+    dPermCoef = -0.5*mScale*rInvVec[2];
+    Vij[0]  = ePermCoef*qiQJ[0];
+    Vji[0]  = ePermCoef*qiQI[0];
+    VijR[0] = dPermCoef*qiQJ[0];
+    VjiR[0] = dPermCoef*qiQI[0];
+    // C-D and C-Uind terms (m=0)
+    ePermCoef = rInvVec[2]*mScale;
+    eUIndCoef = rInvVec[2]*pScale*thole_c;
+    eUInpCoef = rInvVec[2]*dScale*thole_c;
+    dPermCoef = -rInvVec[3]*mScale;
+    dUIndCoef = -2.0*rInvVec[3]*pScale*dthole_c;
+    dUInpCoef = -2.0*rInvVec[3]*dScale*dthole_c;
+    Vij[0]  += -(ePermCoef*qiQJ[1] + eUIndCoef*qiUindJ[0] + eUInpCoef*qiUinpJ[0]);
+    Vji[1]   = -(ePermCoef*qiQI[0]);
+    VijR[0] += -(dPermCoef*qiQJ[1] + dUIndCoef*qiUindJ[0] + dUInpCoef*qiUinpJ[0]);
+    VjiR[1]  = -(dPermCoef*qiQI[0]);
+    Vjip[0]  = -(eUInpCoef*qiQI[0]);
+    Vjid[0]  = -(eUIndCoef*qiQI[0]);
+    // D-C and Uind-C terms (m=0)
+    Vij[1]   = ePermCoef*qiQJ[0];
+    Vji[0]  += ePermCoef*qiQI[1] + eUIndCoef*qiUindI[0] + eUInpCoef*qiUinpI[0];
+    VijR[1]  = dPermCoef*qiQJ[0];
+    VjiR[0] += dPermCoef*qiQI[1] + dUIndCoef*qiUindI[0] + dUInpCoef*qiUinpI[0];
+    Vijp[0]  = eUInpCoef*qiQJ[0];
+    Vijd[0]  = eUIndCoef*qiQJ[0];
+    // D-D and D-Uind terms (m=0)
+    ePermCoef = -2.0*rInvVec[3]*mScale;
+    eUIndCoef = -2.0*rInvVec[3]*pScale*thole_d0;
+    eUInpCoef = -2.0*rInvVec[3]*dScale*thole_d0;
+    dPermCoef = 3.0*rInvVec[4]*mScale;
+    dUIndCoef = 6.0*rInvVec[4]*pScale*dthole_d0;
+    dUInpCoef = 6.0*rInvVec[4]*dScale*dthole_d0;
+    Vij[1]  += ePermCoef*qiQJ[1] + eUIndCoef*qiUindJ[0] + eUInpCoef*qiUinpJ[0];
+    Vji[1]  += ePermCoef*qiQI[1] + eUIndCoef*qiUindI[0] + eUInpCoef*qiUinpI[0];
+    VijR[1] += dPermCoef*qiQJ[1] + dUIndCoef*qiUindJ[0] + dUInpCoef*qiUinpJ[0];
+    VjiR[1] += dPermCoef*qiQI[1] + dUIndCoef*qiUindI[0] + dUInpCoef*qiUinpI[0];
+    Vijp[0] += eUInpCoef*qiQJ[1];
+    Vijd[0] += eUIndCoef*qiQJ[1];
+    Vjip[0] += eUInpCoef*qiQI[1];
+    Vjid[0] += eUIndCoef*qiQI[1];
+    // D-D and D-Uind terms (m=1)
+    ePermCoef = rInvVec[3]*mScale;
+    eUIndCoef = rInvVec[3]*pScale*thole_d1;
+    eUInpCoef = rInvVec[3]*dScale*thole_d1;
+    dPermCoef = -1.5*rInvVec[4]*mScale;
+    dUIndCoef = -3.0*rInvVec[4]*pScale*dthole_d1;
+    dUInpCoef = -3.0*rInvVec[4]*dScale*dthole_d1;
+    Vij[2]  = ePermCoef*qiQJ[2] + eUIndCoef*qiUindJ[1] + eUInpCoef*qiUinpJ[1];
+    Vji[2]  = ePermCoef*qiQI[2] + eUIndCoef*qiUindI[1] + eUInpCoef*qiUinpI[1];
+    VijR[2] = dPermCoef*qiQJ[2] + dUIndCoef*qiUindJ[1] + dUInpCoef*qiUinpJ[1];
+    VjiR[2] = dPermCoef*qiQI[2] + dUIndCoef*qiUindI[1] + dUInpCoef*qiUinpI[1];
+    Vij[3]  = ePermCoef*qiQJ[3] + eUIndCoef*qiUindJ[2] + eUInpCoef*qiUinpJ[2];
+    Vji[3]  = ePermCoef*qiQI[3] + eUIndCoef*qiUindI[2] + eUInpCoef*qiUinpI[2];
+    VijR[3] = dPermCoef*qiQJ[3] + dUIndCoef*qiUindJ[2] + dUInpCoef*qiUinpJ[2];
+    VjiR[3] = dPermCoef*qiQI[3] + dUIndCoef*qiUindI[2] + dUInpCoef*qiUinpI[2];
+    Vijp[1] = eUInpCoef*qiQJ[2];
+    Vijd[1] = eUIndCoef*qiQJ[2];
+    Vjip[1] = eUInpCoef*qiQI[2];
+    Vjid[1] = eUIndCoef*qiQI[2];
+    Vijp[2] = eUInpCoef*qiQJ[3];
+    Vijd[2] = eUIndCoef*qiQJ[3];
+    Vjip[2] = eUInpCoef*qiQI[3];
+    Vjid[2] = eUIndCoef*qiQI[3];
+    // C-Q terms (m=0)
+    ePermCoef = mScale*rInvVec[3];
+    dPermCoef = -1.5*rInvVec[4]*mScale;
+    Vij[0]  += ePermCoef*qiQJ[4];
+    Vji[4]   = ePermCoef*qiQI[0];
+    VijR[0] += dPermCoef*qiQJ[4];
+    VjiR[4]  = dPermCoef*qiQI[0];
+    // Q-C terms (m=0)
+    Vij[4]   = ePermCoef*qiQJ[0];
+    Vji[0]  += ePermCoef*qiQI[4];
+    VijR[4]  = dPermCoef*qiQJ[0];
+    VjiR[0] += dPermCoef*qiQI[4];
+    // D-Q and Uind-Q terms (m=0)
+    ePermCoef = rInvVec[4]*3.0*mScale;
+    eUIndCoef = rInvVec[4]*3.0*pScale*thole_q0;
+    eUInpCoef = rInvVec[4]*3.0*dScale*thole_q0;
+    dPermCoef = -6.0*rInvVec[5]*mScale;
+    dUIndCoef = -12.0*rInvVec[5]*pScale*dthole_q0;
+    dUInpCoef = -12.0*rInvVec[5]*dScale*dthole_q0;
+    Vij[1]  += ePermCoef*qiQJ[4];
+    Vji[4]  += ePermCoef*qiQI[1] + eUIndCoef*qiUindI[0] + eUInpCoef*qiUinpI[0];
+    VijR[1] += dPermCoef*qiQJ[4];
+    VjiR[4] += dPermCoef*qiQI[1] + dUIndCoef*qiUindI[0] + dUInpCoef*qiUinpI[0];
+    Vijp[0] += eUInpCoef*qiQJ[4];
+    Vijd[0] += eUIndCoef*qiQJ[4];
+    // Q-D and Q-Uind terms (m=0)
+    Vij[4]  += -(ePermCoef*qiQJ[1] + eUIndCoef*qiUindJ[0] + eUInpCoef*qiUinpJ[0]);
+    Vji[1]  += -(ePermCoef*qiQI[4]);
+    VijR[4] += -(dPermCoef*qiQJ[1] + dUIndCoef*qiUindJ[0] + dUInpCoef*qiUinpJ[0]);
+    VjiR[1] += -(dPermCoef*qiQI[4]);
+    Vjip[0] += -(eUInpCoef*qiQI[4]);
+    Vjid[0] += -(eUIndCoef*qiQI[4]);
+    // D-Q and Uind-Q terms (m=1)
+    ePermCoef = -sqrtThree*rInvVec[4]*mScale;
+    eUIndCoef = -sqrtThree*rInvVec[4]*pScale*thole_q1;
+    eUInpCoef = -sqrtThree*rInvVec[4]*dScale*thole_q1;
+    dPermCoef = 2.0*sqrtThree*rInvVec[5]*mScale;
+    dUIndCoef = 4.0*sqrtThree*rInvVec[5]*pScale*dthole_q1;
+    dUInpCoef = 4.0*sqrtThree*rInvVec[5]*dScale*dthole_q1;
+    Vij[2]  += ePermCoef*qiQJ[5];
+    Vji[5]   = ePermCoef*qiQI[2] + eUIndCoef*qiUindI[1] + eUInpCoef*qiUinpI[1];
+    VijR[2] += dPermCoef*qiQJ[5];
+    VjiR[5]  = dPermCoef*qiQI[2] + dUIndCoef*qiUindI[1] + dUInpCoef*qiUinpI[1];
+    Vij[3]  += ePermCoef*qiQJ[6];
+    Vji[6]   = ePermCoef*qiQI[3] + eUIndCoef*qiUindI[2] + eUInpCoef*qiUinpI[2];
+    VijR[3] += dPermCoef*qiQJ[6];
+    VjiR[6]  = dPermCoef*qiQI[3] + dUIndCoef*qiUindI[2] + dUInpCoef*qiUinpI[2];
+    Vijp[1] += eUInpCoef*qiQJ[5];
+    Vijd[1] += eUIndCoef*qiQJ[5];
+    Vijp[2] += eUInpCoef*qiQJ[6];
+    Vijd[2] += eUIndCoef*qiQJ[6];
+    // D-Q and Uind-Q terms (m=1)
+    Vij[5]   = -(ePermCoef*qiQJ[2] + eUIndCoef*qiUindJ[1] + eUInpCoef*qiUinpJ[1]);
+    Vji[2]  += -(ePermCoef*qiQI[5]);
+    VijR[5]  = -(dPermCoef*qiQJ[2] + dUIndCoef*qiUindJ[1] + dUInpCoef*qiUinpJ[1]);
+    VjiR[2] += -(dPermCoef*qiQI[5]);
+    Vij[6]   = -(ePermCoef*qiQJ[3] + eUIndCoef*qiUindJ[2] + eUInpCoef*qiUinpJ[2]);
+    Vji[3]  += -(ePermCoef*qiQI[6]);
+    VijR[6]  = -(dPermCoef*qiQJ[3] + dUIndCoef*qiUindJ[2] + dUInpCoef*qiUinpJ[2]);
+    VjiR[3] += -(dPermCoef*qiQI[6]);
+    Vjip[1] += -(eUInpCoef*qiQI[5]);
+    Vjid[1] += -(eUIndCoef*qiQI[5]);
+    Vjip[2] += -(eUInpCoef*qiQI[6]);
+    Vjid[2] += -(eUIndCoef*qiQI[6]);
+    // Q-Q terms (m=0)
+    ePermCoef = 6.0*rInvVec[5]*mScale;
+    dPermCoef = -15.0*rInvVec[6]*mScale;
+    Vij[4]  += ePermCoef*qiQJ[4];
+    Vji[4]  += ePermCoef*qiQI[4];
+    VijR[4] += dPermCoef*qiQJ[4];
+    VjiR[4] += dPermCoef*qiQI[4];
+    // Q-Q terms (m=1)
+    ePermCoef = -4.0*rInvVec[5]*mScale;
+    dPermCoef = 10.0*rInvVec[6]*mScale;
+    Vij[5]  += ePermCoef*qiQJ[5];
+    Vji[5]  += ePermCoef*qiQI[5];
+    VijR[5] += dPermCoef*qiQJ[5];
+    VjiR[5] += dPermCoef*qiQI[5];
+    Vij[6]  += ePermCoef*qiQJ[6];
+    Vji[6]  += ePermCoef*qiQI[6];
+    VijR[6] += dPermCoef*qiQJ[6];
+    VjiR[6] += dPermCoef*qiQI[6];
+    // Q-Q terms (m=2)
+    ePermCoef = rInvVec[5]*mScale;
+    dPermCoef = -2.5*rInvVec[6]*mScale;
+    Vij[7]  = ePermCoef*qiQJ[7];
+    Vji[7]  = ePermCoef*qiQI[7];
+    VijR[7] = dPermCoef*qiQJ[7];
+    VjiR[7] = dPermCoef*qiQI[7];
+    Vij[8]  = ePermCoef*qiQJ[8];
+    Vji[8]  = ePermCoef*qiQI[8];
+    VijR[8] = dPermCoef*qiQJ[8];
+    VjiR[8] = dPermCoef*qiQI[8];
+    // Evaluate the energies, forces and torques due to permanent+induced moments
+    // interacting with just the permanent moments.
+    RealOpenMM energy = 0.5*(qiQI[0]*Vij[0] + qiQJ[0]*Vji[0]);
+    RealOpenMM fIZ = qiQI[0]*VijR[0];
+    RealOpenMM fJZ = qiQJ[0]*VjiR[0];
+    RealOpenMM EIX = 0.0, EIY = 0.0, EIZ = 0.0, EJX = 0.0, EJY = 0.0, EJZ = 0.0;
+    for(int i = 1; i < 9; ++i){
+        energy += 0.5*(qiQI[i]*Vij[i] + qiQJ[i]*Vji[i]);
+        fIZ += qiQI[i]*VijR[i];
+        fJZ += qiQJ[i]*VjiR[i];
+        EIX += qiQIX[i]*Vij[i];
+        EIY += qiQIY[i]*Vij[i];
+        EIZ += qiQIZ[i]*Vij[i];
+        EJX += qiQJX[i]*Vji[i];
+        EJY += qiQJY[i]*Vji[i];
+        EJZ += qiQJZ[i]*Vji[i];
+    }
-    forces[iIndex] -= force;
+    // Define the torque intermediates for the induced dipoles. These are simply the induced dipole torque
-    forces[kIndex] += force;
+    // intermediates dotted with the field due to permanent moments only, at each center. We inline the
+    // induced dipole torque intermediates here, for simplicity. N.B. There are no torques on the dipoles
+    // themselves, so we accumulate the torque intermediates into separate variables to allow them to be
+    // used only in the force calculation.
+    //
+    // The torque about the x axis (needed to obtain the y force on the induced dipoles, below)
+    //    qiUindIx[0] = qiQUindI[2];    qiUindIx[1] = 0;    qiUindIx[2] = -qiQUindI[0]
+    RealOpenMM iEIX = qiUinpI[2]*Vijp[0] + qiUindI[2]*Vijd[0] - qiUinpI[0]*Vijp[2] - qiUindI[0]*Vijd[2];
+    RealOpenMM iEJX = qiUinpJ[2]*Vjip[0] + qiUindJ[2]*Vjid[0] - qiUinpJ[0]*Vjip[2] - qiUindJ[0]*Vjid[2];
+    // The torque about the y axis (needed to obtain the x force on the induced dipoles, below)
+    //    qiUindIy[0] = -qiQUindI[1];   qiUindIy[1] = qiQUindI[0];    qiUindIy[2] = 0
+    RealOpenMM iEIY = qiUinpI[0]*Vijp[1] + qiUindI[0]*Vijd[1] - qiUinpI[1]*Vijp[0] - qiUindI[1]*Vijd[0];
+    RealOpenMM iEJY = qiUinpJ[0]*Vjip[1] + qiUindJ[0]*Vjid[1] - qiUinpJ[1]*Vjip[0] - qiUindJ[1]*Vjid[0];
+    // Add in the induced-induced terms, if needed.
+    if(getPolarizationType() == AmoebaReferenceMultipoleForce::Mutual){
+        // Uind-Uind terms (m=0)
+        RealOpenMM eCoef = -4.0*rInvVec[3]*uScale*thole_d0;
+        RealOpenMM dCoef = 6.0*rInvVec[4]*uScale*dthole_d0;
+        iEIX += eCoef*(qiUinpI[2]*qiUindJ[0] + qiUindI[2]*qiUinpJ[0]);
+        iEJX += eCoef*(qiUinpJ[2]*qiUindI[0] + qiUindJ[2]*qiUinpI[0]);
+        iEIY -= eCoef*(qiUinpI[1]*qiUindJ[0] + qiUindI[1]*qiUinpJ[0]);
+        iEJY -= eCoef*(qiUinpJ[1]*qiUindI[0] + qiUindJ[1]*qiUinpI[0]);
+        fIZ += dCoef*(qiUinpI[0]*qiUindJ[0] + qiUindI[0]*qiUinpJ[0]);
+        fIZ += dCoef*(qiUinpJ[0]*qiUindI[0] + qiUindJ[0]*qiUinpI[0]);
+        // Uind-Uind terms (m=1)
+        eCoef = 2.0*rInvVec[3]*uScale*thole_d1;
+        dCoef = -3.0*rInvVec[4]*uScale*dthole_d1;
+        iEIX -= eCoef*(qiUinpI[0]*qiUindJ[2] + qiUindI[0]*qiUinpJ[2]);
+        iEJX -= eCoef*(qiUinpJ[0]*qiUindI[2] + qiUindJ[0]*qiUinpI[2]);
+        iEIY += eCoef*(qiUinpI[0]*qiUindJ[1] + qiUindI[0]*qiUinpJ[1]);
+        iEJY += eCoef*(qiUinpJ[0]*qiUindI[1] + qiUindJ[0]*qiUinpI[1]);
+        fIZ += dCoef*(qiUinpI[1]*qiUindJ[1] + qiUindI[1]*qiUinpJ[1] + qiUinpI[2]*qiUindJ[2] + qiUindI[2]*qiUinpJ[2]);
+        fIZ += dCoef*(qiUinpJ[1]*qiUindI[1] + qiUindJ[1]*qiUinpI[1] + qiUinpJ[2]*qiUindI[2] + qiUindJ[2]*qiUinpI[2]);
+    }
-    torque[iIndex] += (ttm2*scalingFactors[M_SCALE] + ttm2i)*f;
+    // The quasi-internal frame forces and torques.  Note that the induced torque intermediates are
-    torque[kIndex] += (ttm3*scalingFactors[M_SCALE] + ttm3i)*f;
+    // used in the force expression, but not in the torques; the induced dipoles are isotropic.
+    RealOpenMM qiForce[3] = {rInv*(EIY+EJY+iEIY+iEJY), -rInv*(EIX+EJX+iEIX+iEJX), -(fJZ+fIZ)};
+    RealOpenMM qiTorqueI[3] = {-EIX, -EIY, -EIZ};
+    RealOpenMM qiTorqueJ[3] = {-EJX, -EJY, -EJZ};
+    // Rotate the forces and torques back to the lab frame
+    for (int ii = 0; ii < 3; ii++) {
+        RealOpenMM forceVal = 0.0;
+        RealOpenMM torqueIVal = 0.0;
+        RealOpenMM torqueJVal = 0.0;
+        for (int jj = 0; jj < 3; jj++) {
+            forceVal   += forceRotationMatrix[ii][jj] * qiForce[jj];
+            torqueIVal += forceRotationMatrix[ii][jj] * qiTorqueI[jj];
+            torqueJVal += forceRotationMatrix[ii][jj] * qiTorqueJ[jj];
+        }
+        torque[iIndex][ii] += torqueIVal;
+        torque[kIndex][ii] += torqueJVal;
+        forces[iIndex][ii] -= forceVal;
+        forces[kIndex][ii] += forceVal;
+    }
    return energy;
 }
@@ -1755,10 +1928,9 @@ RealOpenMM AmoebaReferenceMultipoleForce::calculateElectrostaticPotentialForPart
    RealOpenMM r             = SQRT(r2);
    RealOpenMM rr1           = 1.0/r;
-    RealOpenMM potential     = particleI.charge*rr1;
    RealOpenMM rr2           = rr1*rr1;
    RealOpenMM rr3           = rr1*rr2;
+    RealOpenMM potential     = particleI.charge*rr1;
    RealOpenMM scd           = particleI.dipole.dot(deltaR);
    RealOpenMM scu           = _inducedDipole[particleI.particleIndex].dot(deltaR);
@@ -1769,9 +1941,7 @@ RealOpenMM AmoebaReferenceMultipoleForce::calculateElectrostaticPotentialForPart
          scq               += deltaR[1]*(particleI.quadrupole[QXY]*deltaR[0] + particleI.quadrupole[QYY]*deltaR[1] + particleI.quadrupole[QYZ]*deltaR[2]);
          scq               += deltaR[2]*(particleI.quadrupole[QXZ]*deltaR[0] + particleI.quadrupole[QYZ]*deltaR[1] + particleI.quadrupole[QZZ]*deltaR[2]);
    potential               += scq*rr5;
    return potential;
 }
 void AmoebaReferenceMultipoleForce::calculateElectrostaticPotential(const vector<RealVec>& particlePositions,
@@ -4741,6 +4911,9 @@ void AmoebaReferencePmeMultipoleForce::calculateFixedMultipoleFieldPairIxn(const
                                                                           RealOpenMM dscale, RealOpenMM pscale)
 {
+    unsigned int iIndex    = particleI.particleIndex;
+    unsigned int jIndex    = particleJ.particleIndex;
    // compute the real space portion of the Ewald summation
    if (particleI.particleIndex == particleJ.particleIndex)
@@ -4815,8 +4988,6 @@ void AmoebaReferencePmeMultipoleForce::calculateFixedMultipoleFieldPairIxn(const
    // increment the field at each site due to this interaction
-    unsigned int iIndex    = particleI.particleIndex;
-    unsigned int jIndex    = particleJ.particleIndex;
    _fixedMultipoleField[iIndex]      += fim - fid;
    _fixedMultipoleField[jIndex]      += fjm - fjd;
@@ -4980,9 +5151,9 @@ void AmoebaReferencePmeMultipoleForce::transformMultipolesToFractionalCoordinate
                b[i][j] += a[index1[i]][index2[j]]*a[index2[i]][index1[j]];
        }
    }
    // Transform the multipoles.
    _transformed.resize(particleData.size());
    double quadScale[] = {1, 2, 2, 1, 2, 1};
    for (int i = 0; i < (int) particleData.size(); i++) {
@@ -5854,21 +6025,21 @@ RealOpenMM AmoebaReferencePmeMultipoleForce::calculatePmeSelfEnergy(const vector
        const MultipoleParticleData& particleI = particleData[ii];
-        cii      +=  particleI.charge*particleI.charge;
+        cii += particleI.charge*particleI.charge;
-        dii      +=  particleI.dipole.dot(particleI.dipole + _inducedDipole[ii]) ;
-        qii      +=  particleI.quadrupole[QXX]*particleI.quadrupole[QXX] +
-                     particleI.quadrupole[QYY]*particleI.quadrupole[QYY] +
-                     particleI.quadrupole[QZZ]*particleI.quadrupole[QZZ] +
-                    (particleI.quadrupole[QXY]*particleI.quadrupole[QXY] +
-                     particleI.quadrupole[QXZ]*particleI.quadrupole[QXZ] +
-                     particleI.quadrupole[QYZ]*particleI.quadrupole[QYZ])*2.0;
-    }
-    RealOpenMM term      = 2.0*_alphaEwald*_alphaEwald;
+        RealVec dipole(particleI.sphericalDipole[1], particleI.sphericalDipole[2], particleI.sphericalDipole[0]);
-    RealOpenMM energy    = (cii + term*(dii/3.0 + 2.0*term*qii/5.0));
+        dii += dipole.dot(dipole + _inducedDipole[ii]);
-               energy   *= -(_electric*_alphaEwald/(_dielectric*SQRT_PI));
+        qii += (particleI.sphericalQuadrupole[0]*particleI.sphericalQuadrupole[0]
+               +particleI.sphericalQuadrupole[1]*particleI.sphericalQuadrupole[1]
+               +particleI.sphericalQuadrupole[2]*particleI.sphericalQuadrupole[2]
+               +particleI.sphericalQuadrupole[3]*particleI.sphericalQuadrupole[3]
+               +particleI.sphericalQuadrupole[4]*particleI.sphericalQuadrupole[4]);
+    }
+    RealOpenMM prefac = -_alphaEwald * _electric / (_dielectric*SQRT_PI);
+    RealOpenMM a2 = _alphaEwald * _alphaEwald;
+    RealOpenMM a4 = a2*a2;
+    RealOpenMM energy = prefac*(cii + twoThirds*a2*dii + fourOverFifteen*a4*qii);
    return energy;
 }
@@ -5876,12 +6047,12 @@ void AmoebaReferencePmeMultipoleForce::calculatePmeSelfTorque(const vector<Multi
                                                              vector<RealVec>& torques) const 
 {
    RealOpenMM term = (2.0/3.0)*(_electric/_dielectric)*(_alphaEwald*_alphaEwald*_alphaEwald)/SQRT_PI;
    for (unsigned int ii = 0; ii < _numParticles; ii++) {
        const MultipoleParticleData& particleI = particleData[ii];
        RealVec ui                             = (_inducedDipole[ii] + _inducedDipolePolar[ii]);
-        RealVec torque                         = particleI.dipole.cross(ui);
+        RealVec dipole(particleI.sphericalDipole[1], particleI.sphericalDipole[2], particleI.sphericalDipole[0]);
+        RealVec torque                         = dipole.cross(ui);
                torque                        *= term;
       torques[ii]                            += torque;
    }
@@ -5904,681 +6075,402 @@ RealOpenMM AmoebaReferencePmeMultipoleForce::calculatePmeDirectElectrostaticPair
    if (r2 > _cutoffDistanceSquared)return 0.0;
-    RealOpenMM xr    = deltaR[0];
-    RealOpenMM yr    = deltaR[1];
-    RealOpenMM zr    = deltaR[2];
    RealOpenMM r      = SQRT(r2);
-    RealOpenMM ck     = particleJ.charge;
-    // set the permanent multipole and induced dipole values;
-    RealOpenMM ci    = particleI.charge;
-    RealOpenMM di1   = particleI.dipole[0];
-    RealOpenMM di2   = particleI.dipole[1];
-    RealOpenMM di3   = particleI.dipole[2];
-    RealOpenMM qi1   = particleI.quadrupole[QXX];
-    RealOpenMM qi2   = particleI.quadrupole[QXY];
-    RealOpenMM qi3   = particleI.quadrupole[QXZ];
-    RealOpenMM qi5   = particleI.quadrupole[QYY];
-    RealOpenMM qi6   = particleI.quadrupole[QYZ];
-    RealOpenMM qi9   = -(particleI.quadrupole[QXX] + particleI.quadrupole[QYY]);
-    RealOpenMM dk1  = particleJ.dipole[0];
-    RealOpenMM dk2  = particleJ.dipole[1];
-    RealOpenMM dk3  = particleJ.dipole[2];
-    RealOpenMM qk1   = particleJ.quadrupole[QXX];
-    RealOpenMM qk2   = particleJ.quadrupole[QXY];
-    RealOpenMM qk3   = particleJ.quadrupole[QXZ];
-    RealOpenMM qk5   = particleJ.quadrupole[QYY];
-    RealOpenMM qk6   = particleJ.quadrupole[QYZ];
-    RealOpenMM qk9   = -(particleJ.quadrupole[QXX] + particleJ.quadrupole[QYY]);
-    // calculate the real space error function terms
+    // Start by constructing rotation matrices to put dipoles and
+    // quadrupoles into the QI frame, from the lab frame.
-    RealOpenMM ralpha = _alphaEwald*r;
+    RealOpenMM qiRotationMatrix1[3][3];
-    RealOpenMM bn0    = erfc(ralpha)/r;
+    formQIRotationMatrix(particleI.position, particleJ.position, deltaR, r, qiRotationMatrix1);
+    RealOpenMM qiRotationMatrix2[5][5];
-    RealOpenMM alsq2  = 2.0*_alphaEwald*_alphaEwald;
+    buildSphericalQuadrupoleRotationMatrix(qiRotationMatrix1, qiRotationMatrix2);
-    RealOpenMM alsq2n = 0.0;
+    // The force rotation matrix rotates the QI forces into the lab
-    if (_alphaEwald > 0.0) {
+    // frame, and makes sure the result is in {x,y,z} ordering. Its
-        alsq2n = 1.0/(SQRT_PI*_alphaEwald);
+    // transpose is used to rotate the induced dipoles to the QI frame.
+    RealOpenMM forceRotationMatrix[3][3];
+    forceRotationMatrix[0][0] = qiRotationMatrix1[1][1];
+    forceRotationMatrix[0][1] = qiRotationMatrix1[2][1];
+    forceRotationMatrix[0][2] = qiRotationMatrix1[0][1];
+    forceRotationMatrix[1][0] = qiRotationMatrix1[1][2];
+    forceRotationMatrix[1][1] = qiRotationMatrix1[2][2];
+    forceRotationMatrix[1][2] = qiRotationMatrix1[0][2];
+    forceRotationMatrix[2][0] = qiRotationMatrix1[1][0];
+    forceRotationMatrix[2][1] = qiRotationMatrix1[2][0];
+    forceRotationMatrix[2][2] = qiRotationMatrix1[0][0];
+    // For efficiency, we go ahead and cache that transposed version
+    // now, because we need to do 4 rotations in total (I,J, and p,d).
+    // We also fold in the factor of 0.5 needed to average the p and d
+    // components.
+    RealOpenMM inducedDipoleRotationMatrix[3][3];
+    inducedDipoleRotationMatrix[0][0] = 0.5*qiRotationMatrix1[0][1];
+    inducedDipoleRotationMatrix[0][1] = 0.5*qiRotationMatrix1[0][2];
+    inducedDipoleRotationMatrix[0][2] = 0.5*qiRotationMatrix1[0][0];
+    inducedDipoleRotationMatrix[1][0] = 0.5*qiRotationMatrix1[1][1];
+    inducedDipoleRotationMatrix[1][1] = 0.5*qiRotationMatrix1[1][2];
+    inducedDipoleRotationMatrix[1][2] = 0.5*qiRotationMatrix1[1][0];
+    inducedDipoleRotationMatrix[2][0] = 0.5*qiRotationMatrix1[2][1];
+    inducedDipoleRotationMatrix[2][1] = 0.5*qiRotationMatrix1[2][2];
+    inducedDipoleRotationMatrix[2][2] = 0.5*qiRotationMatrix1[2][0];
+    // Rotate the induced dipoles to the QI frame.
+    RealOpenMM qiUindI[3], qiUindJ[3], qiUinpI[3], qiUinpJ[3];
+    for (int ii = 0; ii < 3; ii++) {
+        RealOpenMM valIP = 0.0;
+        RealOpenMM valID = 0.0;
+        RealOpenMM valJP = 0.0;
+        RealOpenMM valJD = 0.0;
+        for (int jj = 0; jj < 3; jj++) {
+            valIP += inducedDipoleRotationMatrix[ii][jj] * _inducedDipolePolar[iIndex][jj];
+            valID += inducedDipoleRotationMatrix[ii][jj] * _inducedDipole[iIndex][jj];
+            valJP += inducedDipoleRotationMatrix[ii][jj] * _inducedDipolePolar[jIndex][jj];
+            valJD += inducedDipoleRotationMatrix[ii][jj] * _inducedDipole[jIndex][jj];
+        }
+        qiUindI[ii] = valID;
+        qiUinpI[ii] = valIP;
+        qiUindJ[ii] = valJD;
+        qiUinpJ[ii] = valJP;
    }
-    RealOpenMM exp2a  = EXP(-(ralpha*ralpha));
-    alsq2n           *= alsq2;
-    RealOpenMM bn1    = (bn0+alsq2n*exp2a)/r2;
-    alsq2n           *= alsq2;
-    RealOpenMM bn2    = (3.0*bn1+alsq2n*exp2a)/r2;
-    alsq2n      *= alsq2;
-    RealOpenMM bn3    = (5.0*bn2+alsq2n*exp2a)/r2;
-    alsq2n      *= alsq2;
-    RealOpenMM bn4    = (7.0*bn3+alsq2n*exp2a)/r2;
-    alsq2n      *= alsq2;
-    RealOpenMM bn5    = (9.0*bn4+alsq2n*exp2a)/r2;
-    // apply Thole polarization damping to scale factors
+    // The Qtilde intermediates (QI frame multipoles) for atoms I and J
+    RealOpenMM qiQI[9], qiQJ[9];
-    RealOpenMM rr1    = 1.0/r;
+    // Rotate the permanent multipoles to the QI frame.
-    RealOpenMM rr3    = rr1/r2;
+    qiQI[0] = particleI.charge;
-    RealOpenMM rr5    = 3.0*rr3/r2;
+    qiQJ[0] = particleJ.charge;
-    RealOpenMM rr7    = 5.0*rr5/r2;
+    for (int ii = 0; ii < 3; ii++) {
-    RealOpenMM rr9    = 7.0*rr7/r2;
+        RealOpenMM valI = 0.0;
-    RealOpenMM rr11   = 9.0*rr9/r2;
+        RealOpenMM valJ = 0.0;
+        for (int jj = 0; jj < 3; jj++) {
-    RealOpenMM scale3 = 1.0;
+            valI += qiRotationMatrix1[ii][jj] * particleI.sphericalDipole[jj];
-    RealOpenMM scale5 = 1.0;
+            valJ += qiRotationMatrix1[ii][jj] * particleJ.sphericalDipole[jj];
-    RealOpenMM scale7 = 1.0;
-    RealOpenMM ddsc31 = 0.0;
-    RealOpenMM ddsc32 = 0.0;
-    RealOpenMM ddsc33 = 0.0;
-    RealOpenMM ddsc51 = 0.0;
-    RealOpenMM ddsc52 = 0.0;
-    RealOpenMM ddsc53 = 0.0;
-    RealOpenMM ddsc71 = 0.0;
-    RealOpenMM ddsc72 = 0.0;
-    RealOpenMM ddsc73 = 0.0;
-    RealOpenMM damp   = particleI.dampingFactor*particleJ.dampingFactor;
-    if (damp != 0.0) {
-        RealOpenMM pgamma = particleI.thole < particleJ.thole ? particleI.thole : particleJ.thole;
-        RealOpenMM ratio  = r/damp;
-            damp     = -pgamma*ratio*ratio*ratio;
-        if (damp > -50.0) {
-            RealOpenMM expdamp  = EXP(damp);
-            scale3         = 1.0 - expdamp;
-               scale5      = 1.0 - (1.0-damp)*expdamp;
-               scale7      = 1.0 - (1.0-damp+0.6*damp*damp)*expdamp;
-            RealOpenMM temp3    = -3.0 * damp * expdamp / r2;
-            RealOpenMM temp5    = -damp;
-            RealOpenMM temp7    = -0.2 - 0.6*damp;
-            ddsc31         = temp3 * xr;
-            ddsc32         = temp3 * yr;
-            ddsc33         = temp3 * zr;
-            ddsc51         = temp5 * ddsc31;
-            ddsc52         = temp5 * ddsc32;
-            ddsc53         = temp5 * ddsc33;
-            ddsc71         = temp7 * ddsc51;
-            ddsc72         = temp7 * ddsc52;
-            ddsc73         = temp7 * ddsc53;
        }
+        qiQI[ii+1] = valI;
+        qiQJ[ii+1] = valJ;
+    }
+    for (int ii = 0; ii < 5; ii++) {
+        RealOpenMM valI = 0.0;
+        RealOpenMM valJ = 0.0;
+        for (int jj = 0; jj < 5; jj++) {
+            valI += qiRotationMatrix2[ii][jj] * particleI.sphericalQuadrupole[jj];
+            valJ += qiRotationMatrix2[ii][jj] * particleJ.sphericalQuadrupole[jj];
+        }
+        qiQI[ii+4] = valI;
+        qiQJ[ii+4] = valJ;
    }
-    RealOpenMM dsc3 = 1.0 - scale3*scalingFactors[D_SCALE];
+    // The Qtilde{x,y,z} torque intermediates for atoms I and J, which are used to obtain the torques on the permanent moments.
-    RealOpenMM dsc5 = 1.0 - scale5*scalingFactors[D_SCALE];
+    RealOpenMM qiQIX[9] = {0.0, qiQI[3], 0.0, -qiQI[1], sqrtThree*qiQI[6], qiQI[8], -sqrtThree*qiQI[4] - qiQI[7], qiQI[6], -qiQI[5]};
-    RealOpenMM dsc7 = 1.0 - scale7*scalingFactors[D_SCALE];
+    RealOpenMM qiQIY[9] = {0.0, -qiQI[2], qiQI[1], 0.0, -sqrtThree*qiQI[5], sqrtThree*qiQI[4] - qiQI[7], -qiQI[8], qiQI[5], qiQI[6]};
+    RealOpenMM qiQIZ[9] = {0.0, 0.0, -qiQI[3], qiQI[2], 0.0, -qiQI[6], qiQI[5], -2.0*qiQI[8], 2.0*qiQI[7]};
-    RealOpenMM psc3 = 1.0 - scale3*scalingFactors[P_SCALE];
+    RealOpenMM qiQJX[9] = {0.0, qiQJ[3], 0.0, -qiQJ[1], sqrtThree*qiQJ[6], qiQJ[8], -sqrtThree*qiQJ[4] - qiQJ[7], qiQJ[6], -qiQJ[5]};
-    RealOpenMM psc5 = 1.0 - scale5*scalingFactors[P_SCALE];
+    RealOpenMM qiQJY[9] = {0.0, -qiQJ[2], qiQJ[1], 0.0, -sqrtThree*qiQJ[5], sqrtThree*qiQJ[4] - qiQJ[7], -qiQJ[8], qiQJ[5], qiQJ[6]};
-    RealOpenMM psc7 = 1.0 - scale7*scalingFactors[P_SCALE];
+    RealOpenMM qiQJZ[9] = {0.0, 0.0, -qiQJ[3], qiQJ[2], 0.0, -qiQJ[6], qiQJ[5], -2.0*qiQJ[8], 2.0*qiQJ[7]};
-    RealOpenMM usc3 = 1.0 - scale3*scalingFactors[U_SCALE];
+    // The field derivatives at I due to permanent and induced moments on J, and vice-versa.
-    RealOpenMM usc5 = 1.0 - scale5*scalingFactors[U_SCALE];
+    // Also, their derivatives w.r.t. R, which are needed for force calculations
+    RealOpenMM Vij[9], Vji[9], VjiR[9], VijR[9];
-    // construct necessary auxiliary vectors
+    // The field derivatives at I due to only permanent moments on J, and vice-versa.
+    RealOpenMM Vijp[3], Vijd[3], Vjip[3], Vjid[3];
-    RealOpenMM dixdk1       = di2*dk3 - di3*dk2;
+    RealOpenMM rInvVec[7], alphaRVec[8], bVec[5];
-    RealOpenMM dixdk2       = di3*dk1 - di1*dk3;
-    RealOpenMM dixdk3       = di1*dk2 - di2*dk1;
+    RealOpenMM prefac = (_electric/_dielectric);
+    RealOpenMM rInv = 1.0 / r;
-    RealOpenMM dixuk1       = di2*_inducedDipole[jIndex][2]  - di3*_inducedDipole[jIndex][1];
-    RealOpenMM dixuk2       = di3*_inducedDipole[jIndex][0]  - di1*_inducedDipole[jIndex][2];
+    // The rInvVec array is defined such that the ith element is R^-i, with the
-    RealOpenMM dixuk3       = di1*_inducedDipole[jIndex][1]  - di2*_inducedDipole[jIndex][0];
+    // dieleectric constant folded in, to avoid conversions later.
-    RealOpenMM dkxui1       = dk2*_inducedDipole[iIndex][2]  - dk3*_inducedDipole[iIndex][1];
+    rInvVec[1] = prefac * rInv;
-    RealOpenMM dkxui2       = dk3*_inducedDipole[iIndex][0]  - dk1*_inducedDipole[iIndex][2];
+    for(int i = 2; i < 7; ++i)
-    RealOpenMM dkxui3       = dk1*_inducedDipole[iIndex][1]  - dk2*_inducedDipole[iIndex][0];
+        rInvVec[i] = rInvVec[i-1] * rInv;
-    RealOpenMM dixukp1      = di2*_inducedDipolePolar[jIndex][2] - di3*_inducedDipolePolar[jIndex][1];
+    // The alpharVec array is defined such that the ith element is (alpha R)^i,
-    RealOpenMM dixukp2      = di3*_inducedDipolePolar[jIndex][0] - di1*_inducedDipolePolar[jIndex][2];
+    // where kappa (alpha in OpenMM parlance) is the Ewald attenuation parameter.
-    RealOpenMM dixukp3      = di1*_inducedDipolePolar[jIndex][1] - di2*_inducedDipolePolar[jIndex][0];
+    alphaRVec[1] = _alphaEwald * r;
-    RealOpenMM dkxuip1      = dk2*_inducedDipolePolar[iIndex][2] - dk3*_inducedDipolePolar[iIndex][1];
+    for(int i = 2; i < 8; ++i)
-    RealOpenMM dkxuip2      = dk3*_inducedDipolePolar[iIndex][0] - dk1*_inducedDipolePolar[iIndex][2];
+        alphaRVec[i] = alphaRVec[i-1] * alphaRVec[1];
-    RealOpenMM dkxuip3      = dk1*_inducedDipolePolar[iIndex][1] - dk2*_inducedDipolePolar[iIndex][0];
+    RealOpenMM erfAlphaR = erf(alphaRVec[1]);
-    RealOpenMM dixr1        = di2*zr - di3*yr;
+    RealOpenMM X = 2.0*EXP(-alphaRVec[2])/SQRT_PI;
-    RealOpenMM dixr2        = di3*xr - di1*zr;
+    RealOpenMM mScale = scalingFactors[M_SCALE];
-    RealOpenMM dixr3        = di1*yr - di2*xr;
+    RealOpenMM dScale = scalingFactors[D_SCALE];
+    RealOpenMM pScale = scalingFactors[P_SCALE];
-    RealOpenMM dkxr1        = dk2*zr - dk3*yr;
+    RealOpenMM uScale = scalingFactors[U_SCALE];
-    RealOpenMM dkxr2        = dk3*xr - dk1*zr;
-    RealOpenMM dkxr3        = dk1*yr - dk2*xr;
+    int doubleFactorial = 1, facCount = 1;
+    RealOpenMM tmp = alphaRVec[1];
-    RealOpenMM qir1         = qi1*xr + qi2*yr + qi3*zr;
+    bVec[1] = -erfAlphaR;
-    RealOpenMM qir2         = qi2*xr + qi5*yr + qi6*zr;
+    for(int i=2; i < 5; ++i){
-    RealOpenMM qir3         = qi3*xr + qi6*yr + qi9*zr;
+        bVec[i] = bVec[i-1] + tmp * X / (RealOpenMM)(doubleFactorial);
+        facCount = facCount + 2;
-    RealOpenMM qkr1         = qk1*xr + qk2*yr + qk3*zr;
+        doubleFactorial = doubleFactorial * facCount;
-    RealOpenMM qkr2         = qk2*xr + qk5*yr + qk6*zr;
+        tmp *= 2.0 * alphaRVec[2];
-    RealOpenMM qkr3         = qk3*xr + qk6*yr + qk9*zr;
+    }
-    RealOpenMM qiqkr1       = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
-    RealOpenMM qiqkr2       = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
-    RealOpenMM qiqkr3       = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;
-    RealOpenMM qkqir1       = qk1*qir1 + qk2*qir2 + qk3*qir3;
-    RealOpenMM qkqir2       = qk2*qir1 + qk5*qir2 + qk6*qir3;
-    RealOpenMM qkqir3       = qk3*qir1 + qk6*qir2 + qk9*qir3;
-    RealOpenMM qixqk1       = qi2*qk3 + qi5*qk6 + qi6*qk9 - qi3*qk2 - qi6*qk5 - qi9*qk6;
-    RealOpenMM qixqk2       = qi3*qk1 + qi6*qk2 + qi9*qk3 - qi1*qk3 - qi2*qk6 - qi3*qk9;
-    RealOpenMM qixqk3       = qi1*qk2 + qi2*qk5 + qi3*qk6 - qi2*qk1 - qi5*qk2 - qi6*qk3;
-    RealOpenMM rxqir1       = yr*qir3 - zr*qir2;
-    RealOpenMM rxqir2       = zr*qir1 - xr*qir3;
-    RealOpenMM rxqir3       = xr*qir2 - yr*qir1;
-    RealOpenMM rxqkr1       = yr*qkr3 - zr*qkr2;
-    RealOpenMM rxqkr2       = zr*qkr1 - xr*qkr3;
-    RealOpenMM rxqkr3       = xr*qkr2 - yr*qkr1;
-    RealOpenMM rxqikr1      = yr*qiqkr3 - zr*qiqkr2;
-    RealOpenMM rxqikr2      = zr*qiqkr1 - xr*qiqkr3;
-    RealOpenMM rxqikr3      = xr*qiqkr2 - yr*qiqkr1;
-    RealOpenMM rxqkir1      = yr*qkqir3 - zr*qkqir2;
-    RealOpenMM rxqkir2      = zr*qkqir1 - xr*qkqir3;
-    RealOpenMM rxqkir3      = xr*qkqir2 - yr*qkqir1;
-    RealOpenMM qkrxqir1     = qkr2*qir3 - qkr3*qir2;
-    RealOpenMM qkrxqir2     = qkr3*qir1 - qkr1*qir3;
-    RealOpenMM qkrxqir3     = qkr1*qir2 - qkr2*qir1;
-    RealOpenMM qidk1        = qi1*dk1 + qi2*dk2 + qi3*dk3;
-    RealOpenMM qidk2        = qi2*dk1 + qi5*dk2 + qi6*dk3;
-    RealOpenMM qidk3        = qi3*dk1 + qi6*dk2 + qi9*dk3;
-    RealOpenMM qkdi1        = qk1*di1 + qk2*di2 + qk3*di3;
-    RealOpenMM qkdi2        = qk2*di1 + qk5*di2 + qk6*di3;
-    RealOpenMM qkdi3        = qk3*di1 + qk6*di2 + qk9*di3;
-    RealOpenMM qiuk1        = qi1*_inducedDipole[jIndex][0]  + qi2*_inducedDipole[jIndex][1]  + qi3*_inducedDipole[jIndex][2];
-    RealOpenMM qiuk2        = qi2*_inducedDipole[jIndex][0]  + qi5*_inducedDipole[jIndex][1]  + qi6*_inducedDipole[jIndex][2];
-    RealOpenMM qiuk3        = qi3*_inducedDipole[jIndex][0]  + qi6*_inducedDipole[jIndex][1]  + qi9*_inducedDipole[jIndex][2];
-    RealOpenMM qkui1        = qk1*_inducedDipole[iIndex][0]  + qk2*_inducedDipole[iIndex][1]  + qk3*_inducedDipole[iIndex][2];
-    RealOpenMM qkui2        = qk2*_inducedDipole[iIndex][0]  + qk5*_inducedDipole[iIndex][1]  + qk6*_inducedDipole[iIndex][2];
-    RealOpenMM qkui3        = qk3*_inducedDipole[iIndex][0]  + qk6*_inducedDipole[iIndex][1]  + qk9*_inducedDipole[iIndex][2];
-    RealOpenMM qiukp1       = qi1*_inducedDipolePolar[jIndex][0] + qi2*_inducedDipolePolar[jIndex][1] + qi3*_inducedDipolePolar[jIndex][2];
-    RealOpenMM qiukp2       = qi2*_inducedDipolePolar[jIndex][0] + qi5*_inducedDipolePolar[jIndex][1] + qi6*_inducedDipolePolar[jIndex][2];
-    RealOpenMM qiukp3       = qi3*_inducedDipolePolar[jIndex][0] + qi6*_inducedDipolePolar[jIndex][1] + qi9*_inducedDipolePolar[jIndex][2];
-    RealOpenMM qkuip1       = qk1*_inducedDipolePolar[iIndex][0] + qk2*_inducedDipolePolar[iIndex][1] + qk3*_inducedDipolePolar[iIndex][2];
-    RealOpenMM qkuip2       = qk2*_inducedDipolePolar[iIndex][0] + qk5*_inducedDipolePolar[iIndex][1] + qk6*_inducedDipolePolar[iIndex][2];
-    RealOpenMM qkuip3       = qk3*_inducedDipolePolar[iIndex][0] + qk6*_inducedDipolePolar[iIndex][1] + qk9*_inducedDipolePolar[iIndex][2];
-    RealOpenMM dixqkr1      = di2*qkr3 - di3*qkr2;
-    RealOpenMM dixqkr2      = di3*qkr1 - di1*qkr3;
-    RealOpenMM dixqkr3      = di1*qkr2 - di2*qkr1;
-    RealOpenMM dkxqir1      = dk2*qir3 - dk3*qir2;
-    RealOpenMM dkxqir2      = dk3*qir1 - dk1*qir3;
-    RealOpenMM dkxqir3      = dk1*qir2 - dk2*qir1;
-    RealOpenMM uixqkr1      = _inducedDipole[iIndex][1]*qkr3 - _inducedDipole[iIndex][2]*qkr2;
-    RealOpenMM uixqkr2      = _inducedDipole[iIndex][2]*qkr1 - _inducedDipole[iIndex][0]*qkr3;
-    RealOpenMM uixqkr3      = _inducedDipole[iIndex][0]*qkr2 - _inducedDipole[iIndex][1]*qkr1;
-    RealOpenMM ukxqir1      = _inducedDipole[jIndex][1]*qir3 - _inducedDipole[jIndex][2]*qir2;
-    RealOpenMM ukxqir2      = _inducedDipole[jIndex][2]*qir1 - _inducedDipole[jIndex][0]*qir3;
-    RealOpenMM ukxqir3      = _inducedDipole[jIndex][0]*qir2 - _inducedDipole[jIndex][1]*qir1;
-    RealOpenMM uixqkrp1     = _inducedDipolePolar[iIndex][1]*qkr3 - _inducedDipolePolar[iIndex][2]*qkr2;
-    RealOpenMM uixqkrp2     = _inducedDipolePolar[iIndex][2]*qkr1 - _inducedDipolePolar[iIndex][0]*qkr3;
-    RealOpenMM uixqkrp3     = _inducedDipolePolar[iIndex][0]*qkr2 - _inducedDipolePolar[iIndex][1]*qkr1;
-    RealOpenMM ukxqirp1     = _inducedDipolePolar[jIndex][1]*qir3 - _inducedDipolePolar[jIndex][2]*qir2;
-    RealOpenMM ukxqirp2     = _inducedDipolePolar[jIndex][2]*qir1 - _inducedDipolePolar[jIndex][0]*qir3;
-    RealOpenMM ukxqirp3     = _inducedDipolePolar[jIndex][0]*qir2 - _inducedDipolePolar[jIndex][1]*qir1;
-    RealOpenMM rxqidk1      = yr*qidk3 - zr*qidk2;
-    RealOpenMM rxqidk2      = zr*qidk1 - xr*qidk3;
-    RealOpenMM rxqidk3      = xr*qidk2 - yr*qidk1;
-    RealOpenMM rxqkdi1      = yr*qkdi3 - zr*qkdi2;
-    RealOpenMM rxqkdi2      = zr*qkdi1 - xr*qkdi3;
-    RealOpenMM rxqkdi3      = xr*qkdi2 - yr*qkdi1;
-    RealOpenMM rxqiuk1      = yr*qiuk3 - zr*qiuk2;
-    RealOpenMM rxqiuk2      = zr*qiuk1 - xr*qiuk3;
-    RealOpenMM rxqiuk3      = xr*qiuk2 - yr*qiuk1;
-    RealOpenMM rxqkui1      = yr*qkui3 - zr*qkui2;
-    RealOpenMM rxqkui2      = zr*qkui1 - xr*qkui3;
-    RealOpenMM rxqkui3      = xr*qkui2 - yr*qkui1;
-    RealOpenMM rxqiukp1     = yr*qiukp3 - zr*qiukp2;
-    RealOpenMM rxqiukp2     = zr*qiukp1 - xr*qiukp3;
-    RealOpenMM rxqiukp3     = xr*qiukp2 - yr*qiukp1;
-    RealOpenMM rxqkuip1     = yr*qkuip3 - zr*qkuip2;
-    RealOpenMM rxqkuip2     = zr*qkuip1 - xr*qkuip3;
-    RealOpenMM rxqkuip3     = xr*qkuip2 - yr*qkuip1;
-    // calculate the scalar products for permanent components
-    RealOpenMM sc2          = di1*dk1 + di2*dk2 + di3*dk3;
-    RealOpenMM sc3          = di1*xr + di2*yr + di3*zr;
-    RealOpenMM sc4          = dk1*xr + dk2*yr + dk3*zr;
-    RealOpenMM sc5          = qir1*xr + qir2*yr + qir3*zr;
-    RealOpenMM sc6          = qkr1*xr + qkr2*yr + qkr3*zr;
-    RealOpenMM sc7          = qir1*dk1 + qir2*dk2 + qir3*dk3;
-    RealOpenMM sc8          = qkr1*di1 + qkr2*di2 + qkr3*di3;
-    RealOpenMM sc9          = qir1*qkr1 + qir2*qkr2 + qir3*qkr3;
-    RealOpenMM sc10         = qi1*qk1 + qi2*qk2 + qi3*qk3
-                         + qi2*qk2 + qi5*qk5 + qi6*qk6
-                         + qi3*qk3 + qi6*qk6 + qi9*qk9;
-    // calculate the scalar products for induced components
-    RealOpenMM sci1         = _inducedDipole[iIndex][0]*dk1 + _inducedDipole[iIndex][1]*dk2
-                         + _inducedDipole[iIndex][2]*dk3 + di1*_inducedDipole[jIndex][0]
-                         + di2*_inducedDipole[jIndex][1] + di3*_inducedDipole[jIndex][2];
-    RealOpenMM sci3         = _inducedDipole[iIndex][0]*xr + _inducedDipole[iIndex][1]*yr + _inducedDipole[iIndex][2]*zr;
-    RealOpenMM sci4         = _inducedDipole[jIndex][0]*xr + _inducedDipole[jIndex][1]*yr + _inducedDipole[jIndex][2]*zr;
-    RealOpenMM sci7         = qir1*_inducedDipole[jIndex][0] + qir2*_inducedDipole[jIndex][1] + qir3*_inducedDipole[jIndex][2];
-    RealOpenMM sci8         = qkr1*_inducedDipole[iIndex][0] + qkr2*_inducedDipole[iIndex][1] + qkr3*_inducedDipole[iIndex][2];
-    RealOpenMM scip1        = _inducedDipolePolar[iIndex][0]*dk1 + _inducedDipolePolar[iIndex][1]*dk2 + _inducedDipolePolar[iIndex][2]*dk3 + di1*_inducedDipolePolar[jIndex][0] + di2*_inducedDipolePolar[jIndex][1] + di3*_inducedDipolePolar[jIndex][2];
-    RealOpenMM scip2        = _inducedDipole[iIndex][0]*_inducedDipolePolar[jIndex][0]+_inducedDipole[iIndex][1]*_inducedDipolePolar[jIndex][1]
-                          + _inducedDipole[iIndex][2]*_inducedDipolePolar[jIndex][2]+_inducedDipolePolar[iIndex][0]*_inducedDipole[jIndex][0]
-                          + _inducedDipolePolar[iIndex][1]*_inducedDipole[jIndex][1]+_inducedDipolePolar[iIndex][2]*_inducedDipole[jIndex][2];
-    RealOpenMM scip3        = _inducedDipolePolar[iIndex][0]*xr + _inducedDipolePolar[iIndex][1]*yr + _inducedDipolePolar[iIndex][2]*zr;
-    RealOpenMM scip4        = _inducedDipolePolar[jIndex][0]*xr + _inducedDipolePolar[jIndex][1]*yr + _inducedDipolePolar[jIndex][2]*zr;
-    RealOpenMM scip7        = qir1*_inducedDipolePolar[jIndex][0] + qir2*_inducedDipolePolar[jIndex][1] + qir3*_inducedDipolePolar[jIndex][2];
-    RealOpenMM scip8        = qkr1*_inducedDipolePolar[iIndex][0] + qkr2*_inducedDipolePolar[iIndex][1] + qkr3*_inducedDipolePolar[iIndex][2];
-    // calculate the gl functions for permanent components
-    RealOpenMM gl0           = ci*ck;
-    RealOpenMM gl1           = ck*sc3 - ci*sc4;
-    RealOpenMM gl2           = ci*sc6 + ck*sc5 - sc3*sc4;
-    RealOpenMM gl3           = sc3*sc6 - sc4*sc5;
-    RealOpenMM gl4           = sc5*sc6;
-    RealOpenMM gl5           = -4.0*sc9;
-    RealOpenMM gl6           = sc2;
-    RealOpenMM gl7           = 2.0 * (sc7-sc8);
-    RealOpenMM gl8           = 2.0 * sc10;
-    // calculate the gl functions for induced components
-    RealOpenMM gli1          = ck*sci3 - ci*sci4;
-    RealOpenMM gli2          = -sc3*sci4 - sci3*sc4;
-    RealOpenMM gli3          = sci3*sc6 - sci4*sc5;
-    RealOpenMM gli6          = sci1;
-    RealOpenMM gli7          = 2.0 * (sci7-sci8);
-    RealOpenMM glip1         = ck*scip3 - ci*scip4;
-    RealOpenMM glip2         = -sc3*scip4 - scip3*sc4;
-    RealOpenMM glip3         = scip3*sc6 - scip4*sc5;
-    RealOpenMM glip6         = scip1;
-    RealOpenMM glip7         = 2.0 * (scip7-scip8);
-    // compute the energy contributions for this interaction
-    RealOpenMM e             = bn0*gl0 + bn1*(gl1+gl6) + bn2*(gl2+gl7+gl8) + bn3*(gl3+gl5) + bn4*gl4;
-    RealOpenMM ei            = 0.5 * (bn1*(gli1+gli6) + bn2*(gli2+gli7) + bn3*gli3); 
-    // get the real energy without any screening function
-    RealOpenMM erl           = rr1*gl0 + rr3*(gl1+gl6) + rr5*(gl2+gl7+gl8) + rr7*(gl3+gl5) + rr9*gl4;
-    RealOpenMM erli          = 0.5*(rr3*(gli1+gli6)*psc3 + rr5*(gli2+gli7)*psc5 + rr7*gli3*psc7);
-    e                   = e - (1.0-scalingFactors[M_SCALE])*erl;
-    ei                  = ei - erli;
-    energy              = (e + ei);
-    // intermediate variables for permanent force terms
-    RealOpenMM gf1 = bn1*gl0 + bn2*(gl1+gl6)
-                 + bn3*(gl2+gl7+gl8)
-                 + bn4*(gl3+gl5) + bn5*gl4;
-    RealOpenMM gf2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
-    RealOpenMM gf3 = ci*bn1 + sc3*bn2 + sc5*bn3;
-    RealOpenMM gf4 = 2.0*bn2;
-    RealOpenMM gf5 = 2.0*(-ck*bn2+sc4*bn3-sc6*bn4);
-    RealOpenMM gf6 = 2.0*(-ci*bn2-sc3*bn3-sc5*bn4);
-    RealOpenMM gf7 = 4.0*bn3;
-    RealOpenMM gfr1 = rr3*gl0 + rr5*(gl1+gl6)
-                  + rr7*(gl2+gl7+gl8)
-                  + rr9*(gl3+gl5) + rr11*gl4;
-    RealOpenMM gfr2 = -ck*rr3 + sc4*rr5 - sc6*rr7;
-    RealOpenMM gfr3 = ci*rr3 + sc3*rr5 + sc5*rr7;
-    RealOpenMM gfr4 = 2.0*rr5;
-    RealOpenMM gfr5 = 2.0*(-ck*rr5+sc4*rr7-sc6*rr9);
-    RealOpenMM gfr6 = 2.0*(-ci*rr5-sc3*rr7-sc5*rr9);
-    RealOpenMM gfr7 = 4.0*rr7;
-    // intermediate variables for induced force terms
-    RealOpenMM gfi1 = 0.5*(bn2*(gli1+glip1+gli6+glip6)
-                  + bn2*scip2
-                  + bn3*(gli2+glip2+gli7+glip7)
-                  - bn3*(sci3*scip4+scip3*sci4)
-                  + bn4*(gli3+glip3));
-    RealOpenMM gfi2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
-    RealOpenMM gfi3 = ci*bn1 + sc3*bn2 + sc5*bn3;
-    RealOpenMM gfi4 = 2.0 * bn2;
-    RealOpenMM gfi5 = bn3 * (sci4+scip4);
-    RealOpenMM gfi6 = -bn3 * (sci3+scip3);
-    RealOpenMM gfri1 = 0.5*(rr5*((gli1+gli6)*psc3
-                         + (glip1+glip6)*dsc3
-                         + scip2*usc3)
-              + rr7*((gli7+gli2)*psc5
-                         + (glip7+glip2)*dsc5
-                  - (sci3*scip4+scip3*sci4)*usc5)
-              + rr9*(gli3*psc7+glip3*dsc7));
-    RealOpenMM gfri4 = 2.0 * rr5;
-    RealOpenMM gfri5 = rr7 * (sci4*psc7+scip4*dsc7);
-    RealOpenMM gfri6 = -rr7 * (sci3*psc7+scip3*dsc7);
-    // get the permanent force with screening
-    RealOpenMM ftm21 = gf1*xr + gf2*di1 + gf3*dk1
-                   + gf4*(qkdi1-qidk1) + gf5*qir1
-                   + gf6*qkr1 + gf7*(qiqkr1+qkqir1);
-    RealOpenMM ftm22 = gf1*yr + gf2*di2 + gf3*dk2
-                   + gf4*(qkdi2-qidk2) + gf5*qir2
-                   + gf6*qkr2 + gf7*(qiqkr2+qkqir2);
-    RealOpenMM ftm23 = gf1*zr + gf2*di3 + gf3*dk3
-                   + gf4*(qkdi3-qidk3) + gf5*qir3
-                   + gf6*qkr3 + gf7*(qiqkr3+qkqir3);
-    // get the permanent force without screening
-    RealOpenMM ftm2r1 = gfr1*xr + gfr2*di1 + gfr3*dk1
-                   + gfr4*(qkdi1-qidk1) + gfr5*qir1
-                   + gfr6*qkr1 + gfr7*(qiqkr1+qkqir1);
-    RealOpenMM ftm2r2 = gfr1*yr + gfr2*di2 + gfr3*dk2
-                   + gfr4*(qkdi2-qidk2) + gfr5*qir2
-                   + gfr6*qkr2 + gfr7*(qiqkr2+qkqir2);
-    RealOpenMM ftm2r3 = gfr1*zr + gfr2*di3 + gfr3*dk3
-                   + gfr4*(qkdi3-qidk3) + gfr5*qir3
-                   + gfr6*qkr3 + gfr7*(qiqkr3+qkqir3);
-    // get the induced force with screening
-    RealOpenMM ftm2i1 = gfi1*xr + 0.5*
-          (gfi2*(_inducedDipole[iIndex][0]+_inducedDipolePolar[iIndex][0])
-         + bn2*(sci4*_inducedDipolePolar[iIndex][0]+scip4*_inducedDipole[iIndex][0])
-         + gfi3*(_inducedDipole[jIndex][0]+_inducedDipolePolar[jIndex][0])
-         + bn2*(sci3*_inducedDipolePolar[jIndex][0]+scip3*_inducedDipole[jIndex][0])
-         + (sci4+scip4)*bn2*di1
-         + (sci3+scip3)*bn2*dk1
-         + gfi4*(qkui1+qkuip1-qiuk1-qiukp1))
-         + gfi5*qir1 + gfi6*qkr1;
-    RealOpenMM ftm2i2 = gfi1*yr + 0.5*
-          (gfi2*(_inducedDipole[iIndex][1]+_inducedDipolePolar[iIndex][1])
-         + bn2*(sci4*_inducedDipolePolar[iIndex][1]+scip4*_inducedDipole[iIndex][1])
-         + gfi3*(_inducedDipole[jIndex][1]+_inducedDipolePolar[jIndex][1])
-         + bn2*(sci3*_inducedDipolePolar[jIndex][1]+scip3*_inducedDipole[jIndex][1])
-         + (sci4+scip4)*bn2*di2
-         + (sci3+scip3)*bn2*dk2
-         + gfi4*(qkui2+qkuip2-qiuk2-qiukp2))
-         + gfi5*qir2 + gfi6*qkr2;
-    RealOpenMM ftm2i3 = gfi1*zr + 0.5*
-          (gfi2*(_inducedDipole[iIndex][2]+_inducedDipolePolar[iIndex][2])
-         + bn2*(sci4*_inducedDipolePolar[iIndex][2]+scip4*_inducedDipole[iIndex][2])
-         + gfi3*(_inducedDipole[jIndex][2]+_inducedDipolePolar[jIndex][2])
-         + bn2*(sci3*_inducedDipolePolar[jIndex][2]+scip3*_inducedDipole[jIndex][2])
-         + (sci4+scip4)*bn2*di3
-         + (sci3+scip3)*bn2*dk3
-         + gfi4*(qkui3+qkuip3-qiuk3-qiukp3))
-         + gfi5*qir3 + gfi6*qkr3;
-    // get the induced force without screening
-    RealOpenMM ftm2ri1 = gfri1*xr + 0.5*
-        (- rr3*ck*(_inducedDipole[iIndex][0]*psc3+_inducedDipolePolar[iIndex][0]*dsc3)
-         + rr5*sc4*(_inducedDipole[iIndex][0]*psc5+_inducedDipolePolar[iIndex][0]*dsc5)
-         - rr7*sc6*(_inducedDipole[iIndex][0]*psc7+_inducedDipolePolar[iIndex][0]*dsc7))
-         + (rr3*ci*(_inducedDipole[jIndex][0]*psc3+_inducedDipolePolar[jIndex][0]*dsc3)
-         + rr5*sc3*(_inducedDipole[jIndex][0]*psc5+_inducedDipolePolar[jIndex][0]*dsc5)
-         + rr7*sc5*(_inducedDipole[jIndex][0]*psc7+_inducedDipolePolar[jIndex][0]*dsc7))*0.5
-         + rr5*usc5*(sci4*_inducedDipolePolar[iIndex][0]+scip4*_inducedDipole[iIndex][0]
-         + sci3*_inducedDipolePolar[jIndex][0]+scip3*_inducedDipole[jIndex][0])*0.5
-         + 0.5*(sci4*psc5+scip4*dsc5)*rr5*di1
-         + 0.5*(sci3*psc5+scip3*dsc5)*rr5*dk1
-         + 0.5*gfri4*((qkui1-qiuk1)*psc5
-         + (qkuip1-qiukp1)*dsc5)
-         + gfri5*qir1 + gfri6*qkr1;
-    RealOpenMM ftm2ri2 = gfri1*yr + 0.5*
-        (- rr3*ck*(_inducedDipole[iIndex][1]*psc3+_inducedDipolePolar[iIndex][1]*dsc3)
-         + rr5*sc4*(_inducedDipole[iIndex][1]*psc5+_inducedDipolePolar[iIndex][1]*dsc5)
-         - rr7*sc6*(_inducedDipole[iIndex][1]*psc7+_inducedDipolePolar[iIndex][1]*dsc7))
-         + (rr3*ci*(_inducedDipole[jIndex][1]*psc3+_inducedDipolePolar[jIndex][1]*dsc3)
-         + rr5*sc3*(_inducedDipole[jIndex][1]*psc5+_inducedDipolePolar[jIndex][1]*dsc5)
-         + rr7*sc5*(_inducedDipole[jIndex][1]*psc7+_inducedDipolePolar[jIndex][1]*dsc7))*0.5
-         + rr5*usc5*(sci4*_inducedDipolePolar[iIndex][1]+scip4*_inducedDipole[iIndex][1]
-         + sci3*_inducedDipolePolar[jIndex][1]+scip3*_inducedDipole[jIndex][1])*0.5
-         + 0.5*(sci4*psc5+scip4*dsc5)*rr5*di2
-         + 0.5*(sci3*psc5+scip3*dsc5)*rr5*dk2
-         + 0.5*gfri4*((qkui2-qiuk2)*psc5
-         + (qkuip2-qiukp2)*dsc5)
-         + gfri5*qir2 + gfri6*qkr2;
-    RealOpenMM ftm2ri3 = gfri1*zr + 0.5*
-        (- rr3*ck*(_inducedDipole[iIndex][2]*psc3+_inducedDipolePolar[iIndex][2]*dsc3)
-         + rr5*sc4*(_inducedDipole[iIndex][2]*psc5+_inducedDipolePolar[iIndex][2]*dsc5)
-         - rr7*sc6*(_inducedDipole[iIndex][2]*psc7+_inducedDipolePolar[iIndex][2]*dsc7))
-         + (rr3*ci*(_inducedDipole[jIndex][2]*psc3+_inducedDipolePolar[jIndex][2]*dsc3)
-         + rr5*sc3*(_inducedDipole[jIndex][2]*psc5+_inducedDipolePolar[jIndex][2]*dsc5)
-         + rr7*sc5*(_inducedDipole[jIndex][2]*psc7+_inducedDipolePolar[jIndex][2]*dsc7))*0.5
-         + rr5*usc5*(sci4*_inducedDipolePolar[iIndex][2]+scip4*_inducedDipole[iIndex][2]
-         + sci3*_inducedDipolePolar[jIndex][2]+scip3*_inducedDipole[jIndex][2])*0.5
-         + 0.5*(sci4*psc5+scip4*dsc5)*rr5*di3
-         + 0.5*(sci3*psc5+scip3*dsc5)*rr5*dk3
-         + 0.5*gfri4*((qkui3-qiuk3)*psc5
-         + (qkuip3-qiukp3)*dsc5)
-         + gfri5*qir3 + gfri6*qkr3;
-    // account for partially excluded induced interactions
-    RealOpenMM temp3 = 0.5 * rr3 * ((gli1+gli6)*scalingFactors[P_SCALE]
-                               +(glip1+glip6)*scalingFactors[D_SCALE]);
-    RealOpenMM temp5 = 0.5 * rr5 * ((gli2+gli7)*scalingFactors[P_SCALE]
-                               +(glip2+glip7)*scalingFactors[D_SCALE]);
-    RealOpenMM temp7 = 0.5 * rr7 * (gli3*scalingFactors[P_SCALE]
-                               +glip3*scalingFactors[D_SCALE]);
-    RealOpenMM fridmp1 = temp3*ddsc31 + temp5*ddsc51 + temp7*ddsc71;
-    RealOpenMM fridmp2 = temp3*ddsc32 + temp5*ddsc52 + temp7*ddsc72;
-    RealOpenMM fridmp3 = temp3*ddsc33 + temp5*ddsc53 + temp7*ddsc73;
-    // find some scaling terms for induced-induced force
-    temp3         = 0.5 * rr3 * scalingFactors[U_SCALE] * scip2;
-    temp5         = -0.5 * rr5 * scalingFactors[U_SCALE] * (sci3*scip4+scip3*sci4);
-    RealOpenMM findmp1 = temp3*ddsc31 + temp5*ddsc51;
-    RealOpenMM findmp2 = temp3*ddsc32 + temp5*ddsc52;
-    RealOpenMM findmp3 = temp3*ddsc33 + temp5*ddsc53;
-    // modify the forces for partially excluded interactions
-    ftm2i1       -= (fridmp1 + findmp1);
-    ftm2i2       -= (fridmp2 + findmp2);
-    ftm2i3       -= (fridmp3 + findmp3);
-    // correction to convert mutual to direct polarization force
-    if (getPolarizationType() == AmoebaReferenceMultipoleForce::Direct) {
-       RealOpenMM gfd     = 0.5 * (bn2*scip2 - bn3*(scip3*sci4+sci3*scip4));
-       ftm2i1       -= gfd*xr + 0.5*bn2*(sci4*_inducedDipolePolar[iIndex][0]+scip4*_inducedDipole[iIndex][0]+sci3*_inducedDipolePolar[jIndex][0]+scip3*_inducedDipole[jIndex][0]);
-       ftm2i2       -= gfd*yr + 0.5*bn2*(sci4*_inducedDipolePolar[iIndex][1]+scip4*_inducedDipole[iIndex][1]+sci3*_inducedDipolePolar[jIndex][1]+scip3*_inducedDipole[jIndex][1]);
-       ftm2i3       -= gfd*zr + 0.5*bn2*(sci4*_inducedDipolePolar[iIndex][2]+scip4*_inducedDipole[iIndex][2]+sci3*_inducedDipolePolar[jIndex][2]+scip3*_inducedDipole[jIndex][2]);
-       RealOpenMM gfdr    = 0.5 * (rr5*scip2*usc3 - rr7*(scip3*sci4 +sci3*scip4)*usc5);
+    RealOpenMM dmp = particleI.dampingFactor*particleJ.dampingFactor;
-       RealOpenMM fdir1   = gfdr*xr + 0.5*usc5*rr5*(sci4*_inducedDipolePolar[iIndex][0]+scip4*_inducedDipole[iIndex][0] + sci3*_inducedDipolePolar[jIndex][0]+scip3*_inducedDipole[jIndex][0]);
+    RealOpenMM a = particleI.thole < particleJ.thole ? particleI.thole : particleJ.thole;
-       RealOpenMM fdir2   = gfdr*yr + 0.5*usc5*rr5*(sci4*_inducedDipolePolar[iIndex][1]+scip4*_inducedDipole[iIndex][1] + sci3*_inducedDipolePolar[jIndex][1]+scip3*_inducedDipole[jIndex][1]);
+    RealOpenMM u = std::abs(dmp) > 1.0E-5 ? r/dmp : 1E10;
-       RealOpenMM fdir3   = gfdr*zr + 0.5*usc5*rr5*(sci4*_inducedDipolePolar[iIndex][2]+scip4*_inducedDipole[iIndex][2] + sci3*_inducedDipolePolar[jIndex][2]+scip3*_inducedDipole[jIndex][2]);
+    RealOpenMM au3 = a*u*u*u;
+    RealOpenMM expau3 = au3 < 50.0 ? EXP(-au3) : 0.0;
+    RealOpenMM a2u6 = au3*au3;
+    RealOpenMM a3u9 = a2u6*au3;
+    // Thole damping factors for energies
+    RealOpenMM thole_c  = 1.0 - expau3;
+    RealOpenMM thole_d0 = 1.0 - expau3*(1.0 + 1.5*au3);
+    RealOpenMM thole_d1 = 1.0 - expau3;
+    RealOpenMM thole_q0 = 1.0 - expau3*(1.0 + au3 + a2u6);
+    RealOpenMM thole_q1 = 1.0 - expau3*(1.0 + au3);
+    // Thole damping factors for derivatives
+    RealOpenMM dthole_c  = 1.0 - expau3*(1.0 + 1.5*au3);
+    RealOpenMM dthole_d0 = 1.0 - expau3*(1.0 + au3 + 1.5*a2u6);
+    RealOpenMM dthole_d1 = 1.0 - expau3*(1.0 + au3);
+    RealOpenMM dthole_q0 = 1.0 - expau3*(1.0 + au3 + 0.25*a2u6 + 0.75*a3u9);
+    RealOpenMM dthole_q1 = 1.0 - expau3*(1.0 + au3 + 0.75*a2u6);
+    // Now we compute the (attenuated) Coulomb operator and its derivatives, contracted with
+    // permanent moments and induced dipoles.  Note that the coefficient of the permanent force
+    // terms is half of the expected value; this is because we compute the interaction of I with
+    // the sum of induced and permanent moments on J, as well as the interaction of J with I's
+    // permanent and induced moments; doing so double counts the permanent-permanent interaction.
+    RealOpenMM ePermCoef, dPermCoef, eUIndCoef, dUIndCoef, eUInpCoef, dUInpCoef;
+    // C-C terms (m=0)
+    ePermCoef = rInvVec[1]*(mScale + bVec[2] - alphaRVec[1]*X);
+    dPermCoef = -0.5*(mScale + bVec[2])*rInvVec[2];
+    Vij[0]  = ePermCoef*qiQJ[0];
+    Vji[0]  = ePermCoef*qiQI[0];
+    VijR[0] = dPermCoef*qiQJ[0];
+    VjiR[0] = dPermCoef*qiQI[0];
+    // C-D and C-Uind terms (m=0)
+    ePermCoef = rInvVec[2]*(mScale + bVec[2]);
+    eUIndCoef = rInvVec[2]*(pScale*thole_c + bVec[2]);
+    eUInpCoef = rInvVec[2]*(dScale*thole_c + bVec[2]);
+    dPermCoef = -rInvVec[3]*(mScale + bVec[2] + alphaRVec[3]*X);
+    dUIndCoef = -2.0*rInvVec[3]*(pScale*dthole_c + bVec[2] + alphaRVec[3]*X);
+    dUInpCoef = -2.0*rInvVec[3]*(dScale*dthole_c + bVec[2] + alphaRVec[3]*X);
+    Vij[0]  += -(ePermCoef*qiQJ[1] + eUIndCoef*qiUindJ[0] + eUInpCoef*qiUinpJ[0]);
+    Vji[1]   = -(ePermCoef*qiQI[0]);
+    VijR[0] += -(dPermCoef*qiQJ[1] + dUIndCoef*qiUindJ[0] + dUInpCoef*qiUinpJ[0]);
+    VjiR[1]  = -(dPermCoef*qiQI[0]);
+    Vjip[0]  = -(eUInpCoef*qiQI[0]);
+    Vjid[0]  = -(eUIndCoef*qiQI[0]);
+    // D-C and Uind-C terms (m=0)
+    Vij[1]   = ePermCoef*qiQJ[0];
+    Vji[0]  += ePermCoef*qiQI[1] + eUIndCoef*qiUindI[0] + eUInpCoef*qiUinpI[0];
+    VijR[1]  = dPermCoef*qiQJ[0];
+    VjiR[0] += dPermCoef*qiQI[1] + dUIndCoef*qiUindI[0] + dUInpCoef*qiUinpI[0];
+    Vijp[0]  = eUInpCoef*qiQJ[0];
+    Vijd[0]  = eUIndCoef*qiQJ[0];
+    // D-D and D-Uind terms (m=0)
+    ePermCoef = -twoThirds*rInvVec[3]*(3.0*(mScale + bVec[3]) + alphaRVec[3]*X);
+    eUIndCoef = -twoThirds*rInvVec[3]*(3.0*(pScale*thole_d0 + bVec[3]) + alphaRVec[3]*X);
+    eUInpCoef = -twoThirds*rInvVec[3]*(3.0*(dScale*thole_d0 + bVec[3]) + alphaRVec[3]*X);
+    dPermCoef = rInvVec[4]*(3.0*(mScale + bVec[3]) + 2.*alphaRVec[5]*X);
+    dUIndCoef = rInvVec[4]*(6.0*(pScale*dthole_d0 + bVec[3]) + 4.0*alphaRVec[5]*X);
+    dUInpCoef = rInvVec[4]*(6.0*(dScale*dthole_d0 + bVec[3]) + 4.0*alphaRVec[5]*X);
+    Vij[1]  += ePermCoef*qiQJ[1] + eUIndCoef*qiUindJ[0] + eUInpCoef*qiUinpJ[0];
+    Vji[1]  += ePermCoef*qiQI[1] + eUIndCoef*qiUindI[0] + eUInpCoef*qiUinpI[0];
+    VijR[1] += dPermCoef*qiQJ[1] + dUIndCoef*qiUindJ[0] + dUInpCoef*qiUinpJ[0];
+    VjiR[1] += dPermCoef*qiQI[1] + dUIndCoef*qiUindI[0] + dUInpCoef*qiUinpI[0];
+    Vijp[0] += eUInpCoef*qiQJ[1];
+    Vijd[0] += eUIndCoef*qiQJ[1];
+    Vjip[0] += eUInpCoef*qiQI[1];
+    Vjid[0] += eUIndCoef*qiQI[1];
+    // D-D and D-Uind terms (m=1)
+    ePermCoef = rInvVec[3]*(mScale + bVec[3] - twoThirds*alphaRVec[3]*X);
+    eUIndCoef = rInvVec[3]*(pScale*thole_d1 + bVec[3] - twoThirds*alphaRVec[3]*X);
+    eUInpCoef = rInvVec[3]*(dScale*thole_d1 + bVec[3] - twoThirds*alphaRVec[3]*X);
+    dPermCoef = -1.5*rInvVec[4]*(mScale + bVec[3]);
+    dUIndCoef = -3.0*rInvVec[4]*(pScale*dthole_d1 + bVec[3]);
+    dUInpCoef = -3.0*rInvVec[4]*(dScale*dthole_d1 + bVec[3]);
+    Vij[2]  = ePermCoef*qiQJ[2] + eUIndCoef*qiUindJ[1] + eUInpCoef*qiUinpJ[1];
+    Vji[2]  = ePermCoef*qiQI[2] + eUIndCoef*qiUindI[1] + eUInpCoef*qiUinpI[1];
+    VijR[2] = dPermCoef*qiQJ[2] + dUIndCoef*qiUindJ[1] + dUInpCoef*qiUinpJ[1];
+    VjiR[2] = dPermCoef*qiQI[2] + dUIndCoef*qiUindI[1] + dUInpCoef*qiUinpI[1];
+    Vij[3]  = ePermCoef*qiQJ[3] + eUIndCoef*qiUindJ[2] + eUInpCoef*qiUinpJ[2];
+    Vji[3]  = ePermCoef*qiQI[3] + eUIndCoef*qiUindI[2] + eUInpCoef*qiUinpI[2];
+    VijR[3] = dPermCoef*qiQJ[3] + dUIndCoef*qiUindJ[2] + dUInpCoef*qiUinpJ[2];
+    VjiR[3] = dPermCoef*qiQI[3] + dUIndCoef*qiUindI[2] + dUInpCoef*qiUinpI[2];
+    Vijp[1] = eUInpCoef*qiQJ[2];
+    Vijd[1] = eUIndCoef*qiQJ[2];
+    Vjip[1] = eUInpCoef*qiQI[2];
+    Vjid[1] = eUIndCoef*qiQI[2];
+    Vijp[2] = eUInpCoef*qiQJ[3];
+    Vijd[2] = eUIndCoef*qiQJ[3];
+    Vjip[2] = eUInpCoef*qiQI[3];
+    Vjid[2] = eUIndCoef*qiQI[3];
+    // C-Q terms (m=0)
+    ePermCoef = (mScale + bVec[3])*rInvVec[3];
+    dPermCoef = -oneThird*rInvVec[4]*(4.5*(mScale + bVec[3]) + 2.0*alphaRVec[5]*X);
+    Vij[0]  += ePermCoef*qiQJ[4];
+    Vji[4]   = ePermCoef*qiQI[0];
+    VijR[0] += dPermCoef*qiQJ[4];
+    VjiR[4]  = dPermCoef*qiQI[0];
+    // Q-C terms (m=0)
+    Vij[4]   = ePermCoef*qiQJ[0];
+    Vji[0]  += ePermCoef*qiQI[4];
+    VijR[4]  = dPermCoef*qiQJ[0];
+    VjiR[0] += dPermCoef*qiQI[4];
+    // D-Q and Uind-Q terms (m=0)
+    ePermCoef = rInvVec[4]*(3.0*(mScale + bVec[3]) + fourThirds*alphaRVec[5]*X);
+    eUIndCoef = rInvVec[4]*(3.0*(pScale*thole_q0 + bVec[3]) + fourThirds*alphaRVec[5]*X);
+    eUInpCoef = rInvVec[4]*(3.0*(dScale*thole_q0 + bVec[3]) + fourThirds*alphaRVec[5]*X);
+    dPermCoef = -fourThirds*rInvVec[5]*(4.5*(mScale + bVec[3]) + (1.0 + alphaRVec[2])*alphaRVec[5]*X);
+    dUIndCoef = -fourThirds*rInvVec[5]*(9.0*(pScale*dthole_q0 + bVec[3]) + 2.0*(1.0 + alphaRVec[2])*alphaRVec[5]*X);
+    dUInpCoef = -fourThirds*rInvVec[5]*(9.0*(dScale*dthole_q0 + bVec[3]) + 2.0*(1.0 + alphaRVec[2])*alphaRVec[5]*X);
+    Vij[1]  += ePermCoef*qiQJ[4];
+    Vji[4]  += ePermCoef*qiQI[1] + eUIndCoef*qiUindI[0] + eUInpCoef*qiUinpI[0];
+    VijR[1] += dPermCoef*qiQJ[4];
+    VjiR[4] += dPermCoef*qiQI[1] + dUIndCoef*qiUindI[0] + dUInpCoef*qiUinpI[0];
+    Vijp[0] += eUInpCoef*qiQJ[4];
+    Vijd[0] += eUIndCoef*qiQJ[4];
+    // Q-D and Q-Uind terms (m=0)
+    Vij[4]  += -(ePermCoef*qiQJ[1] + eUIndCoef*qiUindJ[0] + eUInpCoef*qiUinpJ[0]);
+    Vji[1]  += -(ePermCoef*qiQI[4]);
+    VijR[4] += -(dPermCoef*qiQJ[1] + dUIndCoef*qiUindJ[0] + dUInpCoef*qiUinpJ[0]);
+    VjiR[1] += -(dPermCoef*qiQI[4]);
+    Vjip[0] += -(eUInpCoef*qiQI[4]);
+    Vjid[0] += -(eUIndCoef*qiQI[4]);
+    // D-Q and Uind-Q terms (m=1)
+    ePermCoef = -sqrtThree*rInvVec[4]*(mScale + bVec[3]);
+    eUIndCoef = -sqrtThree*rInvVec[4]*(pScale*thole_q1 + bVec[3]);
+    eUInpCoef = -sqrtThree*rInvVec[4]*(dScale*thole_q1 + bVec[3]);
+    dPermCoef = fourSqrtOneThird*rInvVec[5]*(1.5*(mScale + bVec[3]) + 0.5*alphaRVec[5]*X);
+    dUIndCoef = fourSqrtOneThird*rInvVec[5]*(3.0*(pScale*dthole_q1 + bVec[3]) + alphaRVec[5]*X);
+    dUInpCoef = fourSqrtOneThird*rInvVec[5]*(3.0*(dScale*dthole_q1 + bVec[3]) + alphaRVec[5]*X);
+    Vij[2]  += ePermCoef*qiQJ[5];
+    Vji[5]   = ePermCoef*qiQI[2] + eUIndCoef*qiUindI[1] + eUInpCoef*qiUinpI[1];
+    VijR[2] += dPermCoef*qiQJ[5];
+    VjiR[5]  = dPermCoef*qiQI[2] + dUIndCoef*qiUindI[1] + dUInpCoef*qiUinpI[1];
+    Vij[3]  += ePermCoef*qiQJ[6];
+    Vji[6]   = ePermCoef*qiQI[3] + eUIndCoef*qiUindI[2] + eUInpCoef*qiUinpI[2];
+    VijR[3] += dPermCoef*qiQJ[6];
+    VjiR[6]  = dPermCoef*qiQI[3] + dUIndCoef*qiUindI[2] + dUInpCoef*qiUinpI[2];
+    Vijp[1] += eUInpCoef*qiQJ[5];
+    Vijd[1] += eUIndCoef*qiQJ[5];
+    Vijp[2] += eUInpCoef*qiQJ[6];
+    Vijd[2] += eUIndCoef*qiQJ[6];
+    // D-Q and Uind-Q terms (m=1)
+    Vij[5]   = -(ePermCoef*qiQJ[2] + eUIndCoef*qiUindJ[1] + eUInpCoef*qiUinpJ[1]);
+    Vji[2]  += -(ePermCoef*qiQI[5]);
+    VijR[5]  = -(dPermCoef*qiQJ[2] + dUIndCoef*qiUindJ[1] + dUInpCoef*qiUinpJ[1]);
+    VjiR[2] += -(dPermCoef*qiQI[5]);
+    Vij[6]   = -(ePermCoef*qiQJ[3] + eUIndCoef*qiUindJ[2] + eUInpCoef*qiUinpJ[2]);
+    Vji[3]  += -(ePermCoef*qiQI[6]);
+    VijR[6]  = -(dPermCoef*qiQJ[3] + dUIndCoef*qiUindJ[2] + dUInpCoef*qiUinpJ[2]);
+    VjiR[3] += -(dPermCoef*qiQI[6]);
+    Vjip[1] += -(eUInpCoef*qiQI[5]);
+    Vjid[1] += -(eUIndCoef*qiQI[5]);
+    Vjip[2] += -(eUInpCoef*qiQI[6]);
+    Vjid[2] += -(eUIndCoef*qiQI[6]);
+    // Q-Q terms (m=0)
+    ePermCoef = rInvVec[5]*(6.0*(mScale + bVec[4]) + fourOverFortyFive*(-3.0 + 10.0*alphaRVec[2])*alphaRVec[5]*X);
+    dPermCoef = -oneNinth*rInvVec[6]*(135.0*(mScale + bVec[4]) + 4.0*(1.0 + 2.0*alphaRVec[2])*alphaRVec[7]*X);
+    Vij[4]  += ePermCoef*qiQJ[4];
+    Vji[4]  += ePermCoef*qiQI[4];
+    VijR[4] += dPermCoef*qiQJ[4];
+    VjiR[4] += dPermCoef*qiQI[4];
+    // Q-Q terms (m=1)
+    ePermCoef = -fourOverFifteen*rInvVec[5]*(15.0*(mScale + bVec[4]) + alphaRVec[5]*X);
+    dPermCoef = rInvVec[6]*(10.0*(mScale + bVec[4]) + fourThirds*alphaRVec[7]*X);
+    Vij[5]  += ePermCoef*qiQJ[5];
+    Vji[5]  += ePermCoef*qiQI[5];
+    VijR[5] += dPermCoef*qiQJ[5];
+    VjiR[5] += dPermCoef*qiQI[5];
+    Vij[6]  += ePermCoef*qiQJ[6];
+    Vji[6]  += ePermCoef*qiQI[6];
+    VijR[6] += dPermCoef*qiQJ[6];
+    VjiR[6] += dPermCoef*qiQI[6];
+    // Q-Q terms (m=2)
+    ePermCoef = rInvVec[5]*(mScale + bVec[4] - fourOverFifteen*alphaRVec[5]*X);
+    dPermCoef = -2.5*(mScale + bVec[4])*rInvVec[6];
+    Vij[7]  = ePermCoef*qiQJ[7];
+    Vji[7]  = ePermCoef*qiQI[7];
+    VijR[7] = dPermCoef*qiQJ[7];
+    VjiR[7] = dPermCoef*qiQI[7];
+    Vij[8]  = ePermCoef*qiQJ[8];
+    Vji[8]  = ePermCoef*qiQI[8];
+    VijR[8] = dPermCoef*qiQJ[8];
+    VjiR[8] = dPermCoef*qiQI[8];
+    // Evaluate the energies, forces and torques due to permanent+induced moments
+    // interacting with just the permanent moments.
+    energy = 0.5*(qiQI[0]*Vij[0] + qiQJ[0]*Vji[0]);
+    RealOpenMM fIZ = qiQI[0]*VijR[0];
+    RealOpenMM fJZ = qiQJ[0]*VjiR[0];
+    RealOpenMM EIX = 0.0, EIY = 0.0, EIZ = 0.0, EJX = 0.0, EJY = 0.0, EJZ = 0.0;
+    for(int i = 1; i < 9; ++i){
+        energy += 0.5*(qiQI[i]*Vij[i] + qiQJ[i]*Vji[i]);
+        fIZ += qiQI[i]*VijR[i];
+        fJZ += qiQJ[i]*VjiR[i];
+        EIX += qiQIX[i]*Vij[i];
+        EIY += qiQIY[i]*Vij[i];
+        EIZ += qiQIZ[i]*Vij[i];
+        EJX += qiQJX[i]*Vji[i];
+        EJY += qiQJY[i]*Vji[i];
+        EJZ += qiQJZ[i]*Vji[i];
+    }
-       ftm2i1       += fdir1 + findmp1;
+    // Define the torque intermediates for the induced dipoles. These are simply the induced dipole torque
-       ftm2i2       += fdir2 + findmp2;
+    // intermediates dotted with the field due to permanent moments only, at each center. We inline the
-       ftm2i3       += fdir3 + findmp3;
+    // induced dipole torque intermediates here, for simplicity. N.B. There are no torques on the dipoles
+    // themselves, so we accumulate the torque intermediates into separate variables to allow them to be
+    // used only in the force calculation.
+    //
+    // The torque about the x axis (needed to obtain the y force on the induced dipoles, below)
+    //    qiUindIx[0] = qiQUindI[2];    qiUindIx[1] = 0;    qiUindIx[2] = -qiQUindI[0]
+    RealOpenMM iEIX = qiUinpI[2]*Vijp[0] + qiUindI[2]*Vijd[0] - qiUinpI[0]*Vijp[2] - qiUindI[0]*Vijd[2];
+    RealOpenMM iEJX = qiUinpJ[2]*Vjip[0] + qiUindJ[2]*Vjid[0] - qiUinpJ[0]*Vjip[2] - qiUindJ[0]*Vjid[2];
+    // The torque about the y axis (needed to obtain the x force on the induced dipoles, below)
+    //    qiUindIy[0] = -qiQUindI[1];   qiUindIy[1] = qiQUindI[0];    qiUindIy[2] = 0
+    RealOpenMM iEIY = qiUinpI[0]*Vijp[1] + qiUindI[0]*Vijd[1] - qiUinpI[1]*Vijp[0] - qiUindI[1]*Vijd[0];
+    RealOpenMM iEJY = qiUinpJ[0]*Vjip[1] + qiUindJ[0]*Vjid[1] - qiUinpJ[1]*Vjip[0] - qiUindJ[1]*Vjid[0];
+    // Add in the induced-induced terms, if needed.
+    if(getPolarizationType() == AmoebaReferenceMultipoleForce::Mutual){
+        // Uind-Uind terms (m=0)
+        RealOpenMM eCoef = -fourThirds*rInvVec[3]*(3.0*(uScale*thole_d0 + bVec[3]) + alphaRVec[3]*X);
+        RealOpenMM dCoef = rInvVec[4]*(6.0*(uScale*dthole_d0 + bVec[3]) + 4.0*alphaRVec[5]*X);
+        iEIX += eCoef*(qiUinpI[2]*qiUindJ[0] + qiUindI[2]*qiUinpJ[0]);
+        iEJX += eCoef*(qiUinpJ[2]*qiUindI[0] + qiUindJ[2]*qiUinpI[0]);
+        iEIY -= eCoef*(qiUinpI[1]*qiUindJ[0] + qiUindI[1]*qiUinpJ[0]);
+        iEJY -= eCoef*(qiUinpJ[1]*qiUindI[0] + qiUindJ[1]*qiUinpI[0]);
+        fIZ += dCoef*(qiUinpI[0]*qiUindJ[0] + qiUindI[0]*qiUinpJ[0]);
+        fIZ += dCoef*(qiUinpJ[0]*qiUindI[0] + qiUindJ[0]*qiUinpI[0]);
+        // Uind-Uind terms (m=1)
+        eCoef = 2.0*rInvVec[3]*(uScale*thole_d1 + bVec[3] - twoThirds*alphaRVec[3]*X);
+        dCoef = -3.0*rInvVec[4]*(uScale*dthole_d1 + bVec[3]);
+        iEIX -= eCoef*(qiUinpI[0]*qiUindJ[2] + qiUindI[0]*qiUinpJ[2]);
+        iEJX -= eCoef*(qiUinpJ[0]*qiUindI[2] + qiUindJ[0]*qiUinpI[2]);
+        iEIY += eCoef*(qiUinpI[0]*qiUindJ[1] + qiUindI[0]*qiUinpJ[1]);
+        iEJY += eCoef*(qiUinpJ[0]*qiUindI[1] + qiUindJ[0]*qiUinpI[1]);
+        fIZ += dCoef*(qiUinpI[1]*qiUindJ[1] + qiUindI[1]*qiUinpJ[1] + qiUinpI[2]*qiUindJ[2] + qiUindI[2]*qiUinpJ[2]);
+        fIZ += dCoef*(qiUinpJ[1]*qiUindI[1] + qiUindJ[1]*qiUinpI[1] + qiUinpJ[2]*qiUindI[2] + qiUindJ[2]*qiUinpI[2]);
    }
-    // intermediate variables for induced torque terms
+    // The quasi-internal frame forces and torques.  Note that the induced torque intermediates are
+    // used in the force expression, but not in the torques; the induced dipoles are isotropic.
-    RealOpenMM gti2  = 0.5 * bn2 * (sci4+scip4);
+    RealOpenMM qiForce[3] = {rInv*(EIY+EJY+iEIY+iEJY), -rInv*(EIX+EJX+iEIX+iEJX), -(fJZ+fIZ)};
-    RealOpenMM gti3  = 0.5 * bn2 * (sci3+scip3);
+    RealOpenMM qiTorqueI[3] = {-EIX, -EIY, -EIZ};
-    RealOpenMM gti4  = gfi4;
+    RealOpenMM qiTorqueJ[3] = {-EJX, -EJY, -EJZ};
-    RealOpenMM gti5  = gfi5;
-    RealOpenMM gti6  = gfi6;
-    RealOpenMM gtri2 = 0.5 * rr5 * (sci4*psc5+scip4*dsc5);
-    RealOpenMM gtri3 = 0.5 * rr5 * (sci3*psc5+scip3*dsc5);
-    RealOpenMM gtri4 = gfri4;
-    RealOpenMM gtri5 = gfri5;
-    RealOpenMM gtri6 = gfri6;
-    // get the permanent torque with screening
-    RealOpenMM ttm21 = -bn1*dixdk1 + gf2*dixr1
-        + gf4*(dixqkr1+dkxqir1+rxqidk1-2.0*qixqk1)
-        - gf5*rxqir1 - gf7*(rxqikr1+qkrxqir1);
-    RealOpenMM ttm22 = -bn1*dixdk2 + gf2*dixr2
-        + gf4*(dixqkr2+dkxqir2+rxqidk2-2.0*qixqk2)
-        - gf5*rxqir2 - gf7*(rxqikr2+qkrxqir2);
-    RealOpenMM ttm23 = -bn1*dixdk3 + gf2*dixr3
-        + gf4*(dixqkr3+dkxqir3+rxqidk3-2.0*qixqk3)
-        - gf5*rxqir3 - gf7*(rxqikr3+qkrxqir3);
-    RealOpenMM ttm31 = bn1*dixdk1 + gf3*dkxr1
-        - gf4*(dixqkr1+dkxqir1+rxqkdi1-2.0*qixqk1)
-        - gf6*rxqkr1 - gf7*(rxqkir1-qkrxqir1);
-    RealOpenMM ttm32 = bn1*dixdk2 + gf3*dkxr2
-        - gf4*(dixqkr2+dkxqir2+rxqkdi2-2.0*qixqk2)
-        - gf6*rxqkr2 - gf7*(rxqkir2-qkrxqir2);
-    RealOpenMM ttm33 = bn1*dixdk3 + gf3*dkxr3
-        - gf4*(dixqkr3+dkxqir3+rxqkdi3-2.0*qixqk3)
-        - gf6*rxqkr3 - gf7*(rxqkir3-qkrxqir3);
-    // get the permanent torque without screening
-    RealOpenMM ttm2r1 = -rr3*dixdk1 + gfr2*dixr1-gfr5*rxqir1
-        + gfr4*(dixqkr1+dkxqir1+rxqidk1-2.0*qixqk1)
-        - gfr7*(rxqikr1+qkrxqir1);
-    RealOpenMM ttm2r2 = -rr3*dixdk2 + gfr2*dixr2-gfr5*rxqir2
-        + gfr4*(dixqkr2+dkxqir2+rxqidk2-2.0*qixqk2)
-        - gfr7*(rxqikr2+qkrxqir2);
-    RealOpenMM ttm2r3 = -rr3*dixdk3 + gfr2*dixr3-gfr5*rxqir3
-        + gfr4*(dixqkr3+dkxqir3+rxqidk3-2.0*qixqk3)
-        - gfr7*(rxqikr3+qkrxqir3);
-    RealOpenMM ttm3r1 = rr3*dixdk1 + gfr3*dkxr1 -gfr6*rxqkr1
-        - gfr4*(dixqkr1+dkxqir1+rxqkdi1-2.0*qixqk1)
-        - gfr7*(rxqkir1-qkrxqir1);
-    RealOpenMM ttm3r2 = rr3*dixdk2 + gfr3*dkxr2 -gfr6*rxqkr2
-        - gfr4*(dixqkr2+dkxqir2+rxqkdi2-2.0*qixqk2)
-        - gfr7*(rxqkir2-qkrxqir2);
-    RealOpenMM ttm3r3 = rr3*dixdk3 + gfr3*dkxr3 -gfr6*rxqkr3
-        - gfr4*(dixqkr3+dkxqir3+rxqkdi3-2.0*qixqk3)
-        - gfr7*(rxqkir3-qkrxqir3);
-    // get the induced torque with screening
-    RealOpenMM ttm2i1 = -bn1*(dixuk1+dixukp1)*0.5
-        + gti2*dixr1 + gti4*(ukxqir1+rxqiuk1
-        + ukxqirp1+rxqiukp1)*0.5 - gti5*rxqir1;
-    RealOpenMM ttm2i2 = -bn1*(dixuk2+dixukp2)*0.5
-        + gti2*dixr2 + gti4*(ukxqir2+rxqiuk2
-        + ukxqirp2+rxqiukp2)*0.5 - gti5*rxqir2;
-    RealOpenMM ttm2i3 = -bn1*(dixuk3+dixukp3)*0.5
-        + gti2*dixr3 + gti4*(ukxqir3+rxqiuk3
-        + ukxqirp3+rxqiukp3)*0.5 - gti5*rxqir3;
-    RealOpenMM ttm3i1 = -bn1*(dkxui1+dkxuip1)*0.5
-        + gti3*dkxr1 - gti4*(uixqkr1+rxqkui1
-        + uixqkrp1+rxqkuip1)*0.5 - gti6*rxqkr1;
-    RealOpenMM ttm3i2 = -bn1*(dkxui2+dkxuip2)*0.5
-        + gti3*dkxr2 - gti4*(uixqkr2+rxqkui2
-        + uixqkrp2+rxqkuip2)*0.5 - gti6*rxqkr2;
-    RealOpenMM ttm3i3 = -bn1*(dkxui3+dkxuip3)*0.5
-        + gti3*dkxr3 - gti4*(uixqkr3+rxqkui3
-        + uixqkrp3+rxqkuip3)*0.5 - gti6*rxqkr3;
-    // get the induced torque without screening
-    RealOpenMM ttm2ri1 = -rr3*(dixuk1*psc3+dixukp1*dsc3)*0.5
-        + gtri2*dixr1 + gtri4*((ukxqir1+rxqiuk1)*psc5
-        +(ukxqirp1+rxqiukp1)*dsc5)*0.5 - gtri5*rxqir1;
-    RealOpenMM ttm2ri2 = -rr3*(dixuk2*psc3+dixukp2*dsc3)*0.5
-        + gtri2*dixr2 + gtri4*((ukxqir2+rxqiuk2)*psc5
-        +(ukxqirp2+rxqiukp2)*dsc5)*0.5 - gtri5*rxqir2;
-    RealOpenMM ttm2ri3 = -rr3*(dixuk3*psc3+dixukp3*dsc3)*0.5
-        + gtri2*dixr3 + gtri4*((ukxqir3+rxqiuk3)*psc5
-        +(ukxqirp3+rxqiukp3)*dsc5)*0.5 - gtri5*rxqir3;
-    RealOpenMM ttm3ri1 = -rr3*(dkxui1*psc3+dkxuip1*dsc3)*0.5
-        + gtri3*dkxr1 - gtri4*((uixqkr1+rxqkui1)*psc5
-        +(uixqkrp1+rxqkuip1)*dsc5)*0.5 - gtri6*rxqkr1;
-    RealOpenMM ttm3ri2 = -rr3*(dkxui2*psc3+dkxuip2*dsc3)*0.5
-        + gtri3*dkxr2 - gtri4*((uixqkr2+rxqkui2)*psc5
-        +(uixqkrp2+rxqkuip2)*dsc5)*0.5 - gtri6*rxqkr2;
-    RealOpenMM ttm3ri3 = -rr3*(dkxui3*psc3+dkxuip3*dsc3)*0.5
-        + gtri3*dkxr3 - gtri4*((uixqkr3+rxqkui3)*psc5
-        +(uixqkrp3+rxqkuip3)*dsc5)*0.5 - gtri6*rxqkr3;
-    // handle the case where scaling is used
-    ftm21  = (ftm21-(1.0-scalingFactors[M_SCALE])*ftm2r1);
-    ftm2i1 = (ftm2i1-ftm2ri1);
-    ttm21  = (ttm21-(1.0-scalingFactors[M_SCALE])*ttm2r1);
-    ttm2i1 = (ttm2i1-ttm2ri1);
-    ttm31  = (ttm31-(1.0-scalingFactors[M_SCALE])*ttm3r1);
-    ttm3i1 = (ttm3i1-ttm3ri1);
-    ftm22  = (ftm22-(1.0-scalingFactors[M_SCALE])*ftm2r2);
-    ftm2i2 = (ftm2i2-ftm2ri2);
-    ttm22  = (ttm22-(1.0-scalingFactors[M_SCALE])*ttm2r2);
-    ttm2i2 = (ttm2i2-ttm2ri2);
-    ttm32  = (ttm32-(1.0-scalingFactors[M_SCALE])*ttm3r2);
-    ttm3i2 = (ttm3i2-ttm3ri2);
-    ftm23  = (ftm23-(1.0-scalingFactors[M_SCALE])*ftm2r3);
-    ftm2i3 = (ftm2i3-ftm2ri3);
-    ttm23  = (ttm23-(1.0-scalingFactors[M_SCALE])*ttm2r3);
-    ttm2i3 = (ttm2i3-ttm2ri3);
-    ttm33  = (ttm33-(1.0-scalingFactors[M_SCALE])*ttm3r3);
-    ttm3i3 = (ttm3i3-ttm3ri3);
-    // increment gradient due to force and torque on first site;
-    RealOpenMM conversionFactor  = (_electric/_dielectric);
-    energy                 *= conversionFactor;
-    forces[iIndex][0]      -= (ftm21 + ftm2i1)*conversionFactor;
-    forces[iIndex][1]      -= (ftm22 + ftm2i2)*conversionFactor;
-    forces[iIndex][2]      -= (ftm23 + ftm2i3)*conversionFactor;
-    forces[jIndex][0]      += (ftm21 + ftm2i1)*conversionFactor;
-    forces[jIndex][1]      += (ftm22 + ftm2i2)*conversionFactor;
-    forces[jIndex][2]      += (ftm23 + ftm2i3)*conversionFactor;
-    torques[iIndex][0]     += (ttm21 + ttm2i1)*conversionFactor;
-    torques[iIndex][1]     += (ttm22 + ttm2i2)*conversionFactor;
-    torques[iIndex][2]     += (ttm23 + ttm2i3)*conversionFactor;
-    torques[jIndex][0]     += (ttm31 + ttm3i1)*conversionFactor;
-    torques[jIndex][1]     += (ttm32 + ttm3i2)*conversionFactor;
-    torques[jIndex][2]     += (ttm33 + ttm3i3)*conversionFactor;
+    // Rotate the forces and torques back to the lab frame
+    for (int ii = 0; ii < 3; ii++) {
+        RealOpenMM forceVal = 0.0;
+        RealOpenMM torqueIVal = 0.0;
+        RealOpenMM torqueJVal = 0.0;
+        for (int jj = 0; jj < 3; jj++) {
+            forceVal   += forceRotationMatrix[ii][jj] * qiForce[jj];
+            torqueIVal += forceRotationMatrix[ii][jj] * qiTorqueI[jj];
+            torqueJVal += forceRotationMatrix[ii][jj] * qiTorqueJ[jj];
+        }
+        torques[iIndex][ii] += torqueIVal;
+        torques[jIndex][ii] += torqueJVal;
+        forces[iIndex][ii]  -= forceVal;
+        forces[jIndex][ii]  += forceVal;
+    }
    return energy;
 }
@@ -6616,6 +6508,6 @@ RealOpenMM AmoebaReferencePmeMultipoleForce::calculateElectrostatic(const vector
    energy += computeReciprocalSpaceInducedDipoleForceAndEnergy(getPolarizationType(), particleData, forces, torques);
    energy += computeReciprocalSpaceFixedMultipoleForceAndEnergy(particleData, forces, torques);
    energy += calculatePmeSelfEnergy(particleData);
    return energy;
 }
--- a/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.h
+++ b/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.h
@@ -37,6 +37,20 @@ typedef std::map< unsigned int, RealOpenMM> MapIntRealOpenMM;
 typedef MapIntRealOpenMM::iterator MapIntRealOpenMMI;
 typedef MapIntRealOpenMM::const_iterator MapIntRealOpenMMCI;
+// A few useful constants for the spherical harmonic multipole code.
+const RealOpenMM oneThird = 1.0/3.0;
+const RealOpenMM twoThirds = 2.0/3.0;
+const RealOpenMM fourThirds = 4.0/3.0;
+const RealOpenMM fourSqrtOneThird = 4.0/sqrt(3.0);
+const RealOpenMM sqrtFourThirds = 2.0/sqrt(3.0);
+const RealOpenMM sqrtOneThird = 1.0/sqrt(3.0);
+const RealOpenMM sqrtThree = sqrt(3.0);
+const RealOpenMM oneNinth = 1.0/9.0;
+const RealOpenMM fourOverFortyFive = 4.0/45.0;
+const RealOpenMM fourOverFifteen = 4.0/15.0;
 /**
 * 2-dimensional int vector
 */
@@ -589,6 +603,8 @@ protected:
            RealOpenMM charge;
            RealVec dipole;
            RealOpenMM quadrupole[6];
+            RealVec sphericalDipole;
+            RealOpenMM sphericalQuadrupole[5];
            RealOpenMM thole;
            RealOpenMM dampingFactor;
            RealOpenMM polarity;
@@ -795,6 +811,41 @@ protected:
                                       const MultipoleParticleData& particleX,
                                             MultipoleParticleData* particleY, int axisType) const;
+    /**
+     * Forms the rotation matrix for the quasi-internal coordinate system,
+     * which is the rotation matrix that describes the orientation of the
+     * internuclear vector for a given pair (I,J) in lab frame.
+     *
+     * @param particleI             particleI position
+     * @param particleJ             particleJ position
+     * @param deltaR                the internuclear vector, corrected for periodic boundary conditions
+     * @param r                     the bond length between atoms I and J
+     * @param rotationmatrix        the output rotation matrix for a 3-vector
+     */
+    void formQIRotationMatrix(const RealVec& iPosition,
+                              const RealVec& jPosition,
+                              const RealVec &deltaR,
+                              RealOpenMM r,
+                              RealOpenMM (&rotationMatrix)[3][3]) const;
+    /**
+     * Constructs a rotation matrix for spherical harmonic quadrupoles, using the dipole rotation matrix.
+     *
+     * @param D1                    The input spherical harmonic dipole rotation matrix
+     * @param D2                    The output spherical harmonic quadrupole rotation matrix
+     */
+     void buildSphericalQuadrupoleRotationMatrix(const RealOpenMM (&D1)[3][3], RealOpenMM (&D2)[5][5]) const;
+     /**
+      * Constructs a rotation matrix for spherical harmonic quadrupoles, using the dipole rotation matrix.
+      * Only the m={0,1c,1s} terms are constructed; these are the only terms needed to evaluate the field.
+      *
+      * @param D1                    The input spherical harmonic dipole rotation matrix
+      * @param D2                    The output spherical harmonic quadrupole rotation matrix
+      */
+      void buildPartialSphericalQuadrupoleRotationMatrix(const RealOpenMM (&D1)[3][3], RealOpenMM (&D2)[3][5]) const;
    /**
     * Apply rotation matrix to molecular dipole/quadrupoles to get corresponding lab frame values.
     * 

--- a/plugins/amoeba/platforms/reference/tests/CMakeLists.txt
+++ b/plugins/amoeba/platforms/reference/tests/CMakeLists.txt
@@ -12,7 +12,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    # Link with shared library
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_AMOEBA_TARGET} ${SHARED_TARGET})
-    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
 ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/plugins/amoeba/platforms/reference/tests/TestReferenceAmoebaGeneralizedKirkwoodForce.cpp
+++ b/plugins/amoeba/platforms/reference/tests/TestReferenceAmoebaGeneralizedKirkwoodForce.cpp
@@ -6983,7 +6983,7 @@ static void compareForcesEnergy(std::string& testName, double expectedEnergy, do
                                const std::vector<Vec3>& forces, double tolerance) {
    for (unsigned int ii = 0; ii < forces.size(); ii++) {
-        ASSERT_EQUAL_VEC_MOD(expectedForces[ii], forces[ii], tolerance, testName);
+        ASSERT_EQUAL_VEC(expectedForces[ii], forces[ii], tolerance);
    }
    ASSERT_EQUAL_TOL_MOD(expectedEnergy, energy, tolerance, testName);
 }

--- a/plugins/amoeba/serialization/tests/CMakeLists.txt
+++ b/plugins/amoeba/serialization/tests/CMakeLists.txt
@@ -12,7 +12,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    # All tests use shared libraries
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_AMOEBA_TARGET})
-    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
 ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/plugins/cpupme/CMakeLists.txt
+++ b/plugins/cpupme/CMakeLists.txt
@@ -75,7 +75,7 @@ IF (OPENMM_BUILD_SHARED_LIB)
    IF (FFTW_THREADS_LIBRARY)
        TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${FFTW_THREADS_LIBRARY})
    ENDIF (FFTW_THREADS_LIBRARY)
-    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_PME_BUILDING_SHARED_LIBRARY")
+    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_PME_BUILDING_SHARED_LIBRARY")
    INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
 ENDIF (OPENMM_BUILD_SHARED_LIB)
@@ -88,7 +88,7 @@ IF(OPENMM_BUILD_STATIC_LIB)
    IF (FFTW_THREADS_LIBRARY)
        TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${FFTW_THREADS_LIBRARY})
    ENDIF (FFTW_THREADS_LIBRARY)
-    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_PME_BUILDING_STATIC_LIBRARY")
+    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_PME_BUILDING_STATIC_LIBRARY")
    INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
 ENDIF(OPENMM_BUILD_STATIC_LIB)

--- a/plugins/cpupme/src/CpuPmeKernels.cpp
+++ b/plugins/cpupme/src/CpuPmeKernels.cpp
@@ -335,21 +335,19 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo
    }
 }
-class CpuCalcPmeReciprocalForceKernel::ThreadData {
+class CpuCalcPmeReciprocalForceKernel::ComputeTask : public ThreadPool::Task {
 public:
-    CpuCalcPmeReciprocalForceKernel& owner;
+    ComputeTask(CpuCalcPmeReciprocalForceKernel& owner) : owner(owner) {
-    int index;
+    }
-    float* tempGrid;
+    void execute(ThreadPool& threads, int threadIndex) {
-    ThreadData(CpuCalcPmeReciprocalForceKernel& owner, int index) : owner(owner), index(index), tempGrid(NULL) {
+        owner.runWorkerThread(threads, threadIndex);
    }
+    CpuCalcPmeReciprocalForceKernel& owner;
 };
 static void* threadBody(void* args) {
-    CpuCalcPmeReciprocalForceKernel::ThreadData& data = *reinterpret_cast<CpuCalcPmeReciprocalForceKernel::ThreadData*>(args);
+    CpuCalcPmeReciprocalForceKernel& owner = *reinterpret_cast<CpuCalcPmeReciprocalForceKernel*>(args);
-    data.owner.runThread(data.index);
+    owner.runMainThread();
-    if (data.tempGrid != NULL)
-        fftwf_free(data.tempGrid);
-    delete &data;
    return 0;
 }
@@ -362,6 +360,7 @@ void CpuCalcPmeReciprocalForceKernel::initialize(int xsize, int ysize, int zsize
        fftwf_init_threads();
        hasInitializedThreads = true;
    }
+    threadEnergy.resize(numThreads);
    gridx = findFFTDimension(xsize, false);
    gridy = findFFTDimension(ysize, false);
    gridz = findFFTDimension(zsize, true);
@@ -372,23 +371,24 @@ void CpuCalcPmeReciprocalForceKernel::initialize(int xsize, int ysize, int zsize
    // Initialize threads.
+    isFinished = false;
    pthread_cond_init(&startCondition, NULL);
    pthread_cond_init(&endCondition, NULL);
-    pthread_cond_init(&mainThreadStartCondition, NULL);
-    pthread_cond_init(&mainThreadEndCondition, NULL);
    pthread_mutex_init(&lock, NULL);
-    thread.resize(numThreads);
+    pthread_create(&mainThread, NULL, threadBody, this);
-    for (int i = 0; i < numThreads; i++) {
-        ThreadData* data = new ThreadData(*this, i);
+    // Wait until the main thread is up and running.
-        threadData.push_back(data);
-        pthread_create(&thread[i], NULL, threadBody, data);
+    pthread_mutex_lock(&lock);
-        data->tempGrid = (float*) fftwf_malloc(sizeof(float)*(gridx*gridy*gridz+3));
+    while (!isFinished)
-    }
+        pthread_cond_wait(&endCondition, &lock);
-    pthread_create(&mainThread, NULL, threadBody, new ThreadData(*this, -1));
+    pthread_mutex_unlock(&lock);
    // Initialize FFTW.
-    realGrid = threadData[0]->tempGrid;
+    for (int i = 0; i < numThreads; i++)
+        tempGrid.push_back((float*) fftwf_malloc(sizeof(float)*(gridx*gridy*gridz+3)));
+    realGrid = tempGrid[0];
    complexGrid = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*gridx*gridy*(gridz/2+1));
    fftwf_plan_with_nthreads(numThreads);
    forwardFFT = fftwf_plan_dft_r2c_3d(gridx, gridy, gridz, realGrid, complexGrid, FFTW_MEASURE);
@@ -455,16 +455,13 @@ CpuCalcPmeReciprocalForceKernel::~CpuCalcPmeReciprocalForceKernel() {
    isDeleted = true;
    pthread_mutex_lock(&lock);
    pthread_cond_broadcast(&startCondition);
-    pthread_cond_broadcast(&mainThreadStartCondition);
    pthread_mutex_unlock(&lock);
-    for (int i = 0; i < (int) thread.size(); i++)
-        pthread_join(thread[i], NULL);
    pthread_join(mainThread, NULL);
    pthread_mutex_destroy(&lock);
    pthread_cond_destroy(&startCondition);
    pthread_cond_destroy(&endCondition);
-    pthread_cond_destroy(&mainThreadStartCondition);
+    for (int i = 0; i < (int) tempGrid.size(); i++)
-    pthread_cond_destroy(&mainThreadEndCondition);
+        fftwf_free(tempGrid[i]);
    if (complexGrid != NULL)
        fftwf_free(complexGrid);
    if (hasCreatedPlan) {
@@ -473,92 +470,79 @@ CpuCalcPmeReciprocalForceKernel::~CpuCalcPmeReciprocalForceKernel() {
    }
 }
-void CpuCalcPmeReciprocalForceKernel::runThread(int index) {
+void CpuCalcPmeReciprocalForceKernel::runMainThread() {
-    if (index == -1) {
+    // This is the main thread that coordinates all the other ones.
-        // This is the main thread that coordinates all the other ones.
-        pthread_mutex_lock(&lock);
-        while (true) {
-            // Wait for the signal to start.
-            pthread_cond_wait(&mainThreadStartCondition, &lock);
-            if (isDeleted)
-                break;
-            posq = io->getPosq();
-            advanceThreads(); // Signal threads to perform charge spreading.
-            advanceThreads(); // Signal threads to sum the charge grids.
-            fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid);
-            if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2])
-                advanceThreads(); // Signal threads to compute the reciprocal scale factors.
-            if (includeEnergy)
-                advanceThreads(); // Signal threads to compute energy.
-            advanceThreads(); // Signal threads to perform reciprocal convolution.
-            fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
-            advanceThreads(); // Signal threads to interpolate forces.
-            isFinished = true;
-            lastBoxVectors[0] = periodicBoxVectors[0];
-            lastBoxVectors[1] = periodicBoxVectors[1];
-            lastBoxVectors[2] = periodicBoxVectors[2];
-            pthread_cond_signal(&mainThreadEndCondition);
-        }
-        pthread_mutex_unlock(&lock);
-    }
-    else {
-        // This is a worker thread.
-        int particleStart = (index*numParticles)/numThreads;
-        int particleEnd = ((index+1)*numParticles)/numThreads;
-        int gridxStart = (index*gridx)/numThreads;
-        int gridxEnd = ((index+1)*gridx)/numThreads;
-        int gridSize = (gridx*gridy*gridz+3)/4;
-        int gridStart = 4*((index*gridSize)/numThreads);
-        int gridEnd = 4*(((index+1)*gridSize)/numThreads);
-        while (true) {
-            threadWait();
-            if (isDeleted)
-                break;
-            spreadCharge(particleStart, particleEnd, posq, threadData[index]->tempGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors);
-            threadWait();
-            int numGrids = threadData.size();
-            for (int i = gridStart; i < gridEnd; i += 4) {
-                fvec4 sum(&realGrid[i]);
-                for (int j = 1; j < numGrids; j++)
-                    sum += fvec4(&threadData[j]->tempGrid[i]);
-                sum.store(&realGrid[i]);
-            }
-            threadWait();
-            if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
-                computeReciprocalEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
-                threadWait();
-            }
-            if (includeEnergy) {
-                double threadEnergy = reciprocalEnergy(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
-                pthread_mutex_lock(&lock);
-                energy += threadEnergy;
-                pthread_mutex_unlock(&lock);
-                threadWait();
-            }
-            reciprocalConvolution(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, recipEterm);
-            threadWait();
-            interpolateForces(particleStart, particleEnd, posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors);
-        }
-    }
-}
-void CpuCalcPmeReciprocalForceKernel::threadWait() {
    pthread_mutex_lock(&lock);
-    waitCount++;
+    isFinished = true;
    pthread_cond_signal(&endCondition);
-    pthread_cond_wait(&startCondition, &lock);
+    ThreadPool threads(numThreads);
+    while (true) {
+        // Wait for the signal to start.
+        pthread_cond_wait(&startCondition, &lock);
+        if (isDeleted)
+            break;
+        posq = io->getPosq();
+        ComputeTask task(*this);
+        threads.execute(task); // Signal threads to perform charge spreading.
+        threads.waitForThreads();
+        threads.resumeThreads(); // Signal threads to sum the charge grids.
+        threads.waitForThreads();
+        fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid);
+        if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
+            threads.resumeThreads(); // Signal threads to compute the reciprocal scale factors.
+            threads.waitForThreads();
+        }
+        if (includeEnergy) {
+            threads.resumeThreads(); // Signal threads to compute energy.
+            threads.waitForThreads();
+            for (int i = 0; i < (int) threadEnergy.size(); i++)
+                energy += threadEnergy[i];
+        }
+        threads.resumeThreads(); // Signal threads to perform reciprocal convolution.
+        threads.waitForThreads();
+        fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
+        threads.resumeThreads(); // Signal threads to interpolate forces.
+        threads.waitForThreads();
+        isFinished = true;
+        lastBoxVectors[0] = periodicBoxVectors[0];
+        lastBoxVectors[1] = periodicBoxVectors[1];
+        lastBoxVectors[2] = periodicBoxVectors[2];
+        pthread_cond_signal(&endCondition);
+    }
    pthread_mutex_unlock(&lock);
 }
-void CpuCalcPmeReciprocalForceKernel::advanceThreads() {
+void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int index) {
-    waitCount = 0;
+    int particleStart = (index*numParticles)/numThreads;
-    pthread_cond_broadcast(&startCondition);
+    int particleEnd = ((index+1)*numParticles)/numThreads;
-    while (waitCount < numThreads) {
+    int gridxStart = (index*gridx)/numThreads;
-        pthread_cond_wait(&endCondition, &lock);
+    int gridxEnd = ((index+1)*gridx)/numThreads;
+    int gridSize = (gridx*gridy*gridz+3)/4;
+    int gridStart = 4*((index*gridSize)/numThreads);
+    int gridEnd = 4*(((index+1)*gridSize)/numThreads);
+    spreadCharge(particleStart, particleEnd, posq, tempGrid[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors);
+    threads.syncThreads();
+    int numGrids = tempGrid.size();
+    for (int i = gridStart; i < gridEnd; i += 4) {
+        fvec4 sum(&realGrid[i]);
+        for (int j = 1; j < numGrids; j++)
+            sum += fvec4(&tempGrid[j][i]);
+        sum.store(&realGrid[i]);
    }
+    threads.syncThreads();
+    if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
+        computeReciprocalEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
+        threads.syncThreads();
+    }
+    if (includeEnergy) {
+        threadEnergy[index] = reciprocalEnergy(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
+        threads.syncThreads();
+    }
+    reciprocalConvolution(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, recipEterm);
+    threads.syncThreads();
+    interpolateForces(particleStart, particleEnd, posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors);
 }
 void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
@@ -581,14 +565,14 @@ void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* perio
    pthread_mutex_lock(&lock);
    isFinished = false;
-    pthread_cond_signal(&mainThreadStartCondition);
+    pthread_cond_signal(&startCondition);
    pthread_mutex_unlock(&lock);
 }
 double CpuCalcPmeReciprocalForceKernel::finishComputation(IO& io) {
    pthread_mutex_lock(&lock);
    while (!isFinished) {
-        pthread_cond_wait(&mainThreadEndCondition, &lock);
+        pthread_cond_wait(&endCondition, &lock);
    }
    pthread_mutex_unlock(&lock);
    io.setForce(&force[0]);

--- a/plugins/cpupme/src/CpuPmeKernels.h
+++ b/plugins/cpupme/src/CpuPmeKernels.h
@@ -36,6 +36,7 @@
 #include "internal/windowsExportPme.h"
 #include "openmm/kernels.h"
 #include "openmm/Vec3.h"
+#include "openmm/internal/ThreadPool.h"
 #include <fftw3.h>
 #include <pthread.h>
 #include <vector>
@@ -49,7 +50,6 @@ namespace OpenMM {
 class OPENMM_EXPORT_PME CpuCalcPmeReciprocalForceKernel : public CalcPmeReciprocalForceKernel {
 public:
-    class ThreadData;
    CpuCalcPmeReciprocalForceKernel(std::string name, const Platform& platform) : CalcPmeReciprocalForceKernel(name, platform),
            hasCreatedPlan(false), isDeleted(false), realGrid(NULL), complexGrid(NULL) {
    }
@@ -80,22 +80,19 @@ public:
     */
    double finishComputation(IO& io);
    /**
-     * This routine contains the code executed by each thread.
+     * This routine contains the code executed by the main thread.
     */
-    void runThread(int index);
+    void runMainThread();
+    /**
+     * This routine contains the code executed by each worker thread.
+     */
+    void runWorkerThread(ThreadPool& threads, int index);
    /**
     * Get whether the current CPU supports all features needed by this kernel.
     */
    static bool isProcessorSupported();
 private:
-    /**
+    class ComputeTask;
-     * This is called by the worker threads to wait until the master thread instructs them to advance.
-     */
-    void threadWait();
-    /**
-     * This is called by the master thread to instruct all the worker threads to advance.
-     */
-    void advanceThreads();
    /**
     * Select a size for one grid dimension that FFTW can handle efficiently.
     */
@@ -109,16 +106,15 @@ private:
    std::vector<float> bsplineModuli[3];
    std::vector<float> recipEterm;
    Vec3 lastBoxVectors[3];
+    std::vector<float> threadEnergy;
+    std::vector<float*> tempGrid;
    float* realGrid;
    fftwf_complex* complexGrid;
    fftwf_plan forwardFFT, backwardFFT;
    int waitCount;
    pthread_cond_t startCondition, endCondition;
-    pthread_cond_t mainThreadStartCondition, mainThreadEndCondition;
    pthread_mutex_t lock;
    pthread_t mainThread;
-    std::vector<pthread_t> thread;
-    std::vector<ThreadData*> threadData;
    // The following variables are used to store information about the calculation currently being performed.
    IO* io;
    float energy;

--- a/plugins/cpupme/tests/CMakeLists.txt
+++ b/plugins/cpupme/tests/CMakeLists.txt
@@ -13,6 +13,6 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    ELSE (OPENMM_BUILD_SHARED_LIB)
        TARGET_LINK_LIBRARIES(${TEST_ROOT} ${STATIC_TARGET} ${OPENMM_LIBRARY_NAME}_static)
    ENDIF (OPENMM_BUILD_SHARED_LIB)
-    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
 ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/plugins/cudacompiler/CMakeLists.txt
+++ b/plugins/cudacompiler/CMakeLists.txt
@@ -74,7 +74,7 @@ IF (OPENMM_BUILD_SHARED_LIB)
    IF (APPLE)
        SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
    ELSE (APPLE)
-        SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}")
+        SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
    ENDIF (APPLE)
    INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
@@ -91,7 +91,7 @@ IF(OPENMM_BUILD_STATIC_LIB)
    IF (APPLE)
        SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
    ELSE (APPLE)
-        SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}")
+        SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
    ENDIF (APPLE)
    INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})

--- a/plugins/cudacompiler/tests/CMakeLists.txt
+++ b/plugins/cudacompiler/tests/CMakeLists.txt
@@ -17,7 +17,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    IF (APPLE)
        SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ELSE (APPLE)
-        SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+        SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ENDIF (APPLE)
    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})

--- a/plugins/drude/CMakeLists.txt
+++ b/plugins/drude/CMakeLists.txt
@@ -67,14 +67,14 @@ ENDFOREACH(subdir)
 INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
 ADD_LIBRARY(${SHARED_DRUDE_TARGET} SHARED ${SOURCE_DRUDE_FILES} ${SOURCE_DRUDE_INCLUDE_FILES} ${API_DRUDE_ABS_INCLUDE_FILES})
-SET_TARGET_PROPERTIES(${SHARED_DRUDE_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_DRUDE_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY")
+SET_TARGET_PROPERTIES(${SHARED_DRUDE_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_DRUDE_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY")
 FILE(GLOB serialization_files  ${CMAKE_CUURENT_SOURCE_DIR}/serialization/src/*.cpp)
 SET_SOURCE_FILES_PROPERTIES(${serialization_files} PROPERTIES COMPILE_FLAGS "-DOPENMM_DRUDE_BUILDING_SHARED_LIBRARY -DTIXML_USE_STL")
 IF(OPENMM_BUILD_STATIC_LIB)
  ADD_LIBRARY(${STATIC_DRUDE_TARGET} STATIC ${SOURCE_DRUDE_FILES} ${SOURCE_DRUDE_INCLUDE_FILES} ${API_DRUDE_ABS_INCLUDE_FILES})
-  SET_TARGET_PROPERTIES(${STATIC_DRUDE_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_DRUDE_BUILDING_STATIC_LIBRARY -DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY")
+  SET_TARGET_PROPERTIES(${STATIC_DRUDE_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_DRUDE_BUILDING_STATIC_LIBRARY -DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY")
 ENDIF(OPENMM_BUILD_STATIC_LIB)
 # ----------------------------------------------------------------------------

--- a/plugins/drude/platforms/cuda/CMakeLists.txt
+++ b/plugins/drude/platforms/cuda/CMakeLists.txt
@@ -93,7 +93,7 @@ SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE
 IF (APPLE)
    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
 ELSE (APPLE)
-    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
 ENDIF (APPLE)
 INSTALL(TARGETS ${SHARED_TARGET} DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/plugins)

--- a/plugins/drude/platforms/cuda/tests/CMakeLists.txt
+++ b/plugins/drude/platforms/cuda/tests/CMakeLists.txt
@@ -17,7 +17,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    IF (APPLE)
        SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ELSE (APPLE)
-        SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+        SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ENDIF (APPLE)
    ADD_TEST(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
    IF (OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS)

--- a/plugins/drude/platforms/opencl/CMakeLists.txt
+++ b/plugins/drude/platforms/opencl/CMakeLists.txt
@@ -89,7 +89,7 @@ ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${AP
 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME}  ${OPENCL_LIBRARIES} ${PTHREADS_LIB})
 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME}OpenCL)
 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${SHARED_DRUDE_TARGET})
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY")
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY")
 INSTALL(TARGETS ${SHARED_TARGET} DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/plugins)
 # Ensure that links to the main OpenCL library will be resolved.

--- a/plugins/drude/platforms/opencl/tests/CMakeLists.txt
+++ b/plugins/drude/platforms/opencl/tests/CMakeLists.txt
@@ -14,7 +14,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    # Link with shared library
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_DRUDE_TARGET} ${SHARED_TARGET})
-    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ADD_TEST(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
    IF (OPENMM_BUILD_OPENCL_DOUBLE_PRECISION_TESTS)
        ADD_TEST(${TEST_ROOT}Mixed ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} mixed)