Python 2/3 compatibility in a single code base, plus Python 3 testing on Travis.

b7088b74 · peastman · Robert McGibbon · 4c00b312 · b7088b74 · b7088b74
Commit b7088b74, authored Aug 10, 2015 by peastman; committed by Robert McGibbon on Aug 27, 2015.
20 changed files
--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
@@ -220,15 +220,17 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
                params.push_back(mm_float2(dist13, dist12));
            }
            else
-                throw OpenMMException("Two of the three distances constrained with SETTLE must be the same.");
+                continue; // We can't handle this with SETTLE
            isShakeAtom[atom1] = true;
            isShakeAtom[atom2] = true;
            isShakeAtom[atom3] = true;
        }
-        settleAtoms = OpenCLArray::create<mm_int4>(context, atoms.size(), "settleAtoms");
+        if (atoms.size() > 0) {
-        settleParams = OpenCLArray::create<mm_float2>(context, params.size(), "settleParams");
+            settleAtoms = OpenCLArray::create<mm_int4>(context, atoms.size(), "settleAtoms");
-        settleAtoms->upload(atoms);
+            settleParams = OpenCLArray::create<mm_float2>(context, params.size(), "settleParams");
-        settleParams->upload(params);
+            settleAtoms->upload(atoms);
+            settleParams->upload(params);
+        }
    }
    // Find clusters consisting of a central atom with up to three peripheral atoms.

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -186,7 +186,7 @@ static bool compareUshort2(mm_ushort2 a, mm_ushort2 b) {
 void OpenCLNonbondedUtilities::initialize(const System& system) {
    if (atomExclusions.size() == 0) {
        // No exclusions were specifically requested, so just mark every atom as not interacting with itself.
        atomExclusions.resize(context.getNumAtoms());
        for (int i = 0; i < (int) atomExclusions.size(); i++)
            atomExclusions[i].push_back(i);
@@ -199,7 +199,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
    setAtomBlockRange(context.getContextIndex()/(double) numContexts, (context.getContextIndex()+1)/(double) numContexts);
    // Build a list of tiles that contain exclusions.
    set<pair<int, int> > tilesWithExclusions;
    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
        int x = atom1/OpenCLContext::TileSize;
@@ -341,14 +341,19 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {
    if (lastCutoff != kernels.cutoffDistance)
        forceRebuildNeighborList = true;
-    setPeriodicBoxArgs(context, kernels.findBlockBoundsKernel, 1);
+    bool rebuild = false;
-    context.executeKernel(kernels.findBlockBoundsKernel, context.getNumAtoms());
+    do {
-    blockSorter->sort(*sortedBlocks);
+        setPeriodicBoxArgs(context, kernels.findBlockBoundsKernel, 1);
-    kernels.sortBoxDataKernel.setArg<cl_int>(9, forceRebuildNeighborList);
+        context.executeKernel(kernels.findBlockBoundsKernel, context.getNumAtoms());
-    context.executeKernel(kernels.sortBoxDataKernel, context.getNumAtoms());
+        blockSorter->sort(*sortedBlocks);
-    setPeriodicBoxArgs(context, kernels.findInteractingBlocksKernel, 0);
+        kernels.sortBoxDataKernel.setArg<cl_int>(9, forceRebuildNeighborList);
-    context.executeKernel(kernels.findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
+        context.executeKernel(kernels.sortBoxDataKernel, context.getNumAtoms());
-    forceRebuildNeighborList = false;
+        setPeriodicBoxArgs(context, kernels.findInteractingBlocksKernel, 0);
+        context.executeKernel(kernels.findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
+        forceRebuildNeighborList = false;
+        if (context.getComputeForceCount() == 1)
+            rebuild = updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
+    } while (rebuild);
    lastCutoff = kernels.cutoffDistance;
 }
@@ -360,18 +365,16 @@ void OpenCLNonbondedUtilities::computeInteractions(int forceGroups) {
        if (useCutoff)
            setPeriodicBoxArgs(context, kernels.forceKernel, 9);
        context.executeKernel(kernels.forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
-        if (context.getComputeForceCount() == 1)
-            updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
    }
 }
-void OpenCLNonbondedUtilities::updateNeighborListSize() {
+bool OpenCLNonbondedUtilities::updateNeighborListSize() {
    if (!useCutoff)
-        return;
+        return false;
    unsigned int* pinnedInteractionCount = (unsigned int*) context.getPinnedBuffer();
    interactionCount->download(pinnedInteractionCount);
    if (pinnedInteractionCount[0] <= (unsigned int) interactingTiles->getSize())
-        return;
+        return false;
    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
    // this from happening in the future.
@@ -395,6 +398,7 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
        iter->second.findInteractingBlocksKernel.setArg<cl_uint>(9, maxTiles);
    }
    forceRebuildNeighborList = true;
+    return true;
 }
 void OpenCLNonbondedUtilities::setUsePadding(bool padding) {
@@ -410,7 +414,7 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en
    numTiles = (int) (endFraction*totalTiles)-startTileIndex;
    if (useCutoff) {
        // We are using a cutoff, and the kernels have already been created.
        for (map<int, KernelSet>::iterator iter = groupKernels.begin(); iter != groupKernels.end(); ++iter) {
            iter->second.forceKernel.setArg<cl_uint>(5, startTileIndex);
            iter->second.forceKernel.setArg<cl_uint>(6, numTiles);
@@ -491,7 +495,7 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(18, rebuildNeighborList->getDeviceBuffer());
            if (kernels.findInteractingBlocksKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()) < groupSize) {
                // The device can't handle this block size, so reduce it.
                groupSize -= 32;
                if (groupSize < 32)
                    throw OpenMMException("Failed to create findInteractingBlocks kernel");

--- a/platforms/opencl/src/kernels/nonbonded.cl
+++ b/platforms/opencl/src/kernels/nonbonded.cl
@@ -25,7 +25,7 @@ __kernel void computeNonbonded(
        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
        __global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
 #ifdef USE_CUTOFF
-        , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+        , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
 #endif
@@ -38,7 +38,7 @@ __kernel void computeNonbonded(
    __local AtomData localData[FORCE_WORK_GROUP_SIZE];
    // First loop: process tiles that contain exclusions.
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
@@ -100,7 +100,7 @@ __kernel void computeNonbonded(
        }
        else {
            // This is an off-diagonal tile.
            const unsigned int localAtomIndex = get_local_id(0);
            unsigned int j = y*TILE_SIZE + tgx;
            real4 tempPosq = posq[j];
@@ -126,7 +126,7 @@ __kernel void computeNonbonded(
 #endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef PRUNE_BY_CUTOFF
-                if (r2 < CUTOFF_SQUARED) {
+                if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                    real invR = RSQRT(r2);
                    real r = r2*invR;
@@ -213,7 +213,7 @@ __kernel void computeNonbonded(
        bool includeTile = true;
        // Extract the coordinates of this tile.
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
@@ -245,7 +245,7 @@ __kernel void computeNonbonded(
                }
                else
                    skipTiles[get_local_id(0)] = end;
-                skipBase += TILE_SIZE;            
+                skipBase += TILE_SIZE;
                currentSkipIndex = tbx;
                SYNC_WARPS;
            }
@@ -300,7 +300,7 @@ __kernel void computeNonbonded(
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef PRUNE_BY_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
+                    if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;
@@ -352,7 +352,7 @@ __kernel void computeNonbonded(
 #endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef PRUNE_BY_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
+                    if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;

--- a/platforms/opencl/src/kernels/nonbonded_cpu.cl
+++ b/platforms/opencl/src/kernels/nonbonded_cpu.cl
@@ -22,7 +22,7 @@ __kernel void computeNonbonded(
        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
        __global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
 #ifdef USE_CUTOFF
-        , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+        , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
 #endif
@@ -31,7 +31,7 @@ __kernel void computeNonbonded(
    __local AtomData localData[TILE_SIZE];
    // First loop: process tiles that contain exclusions.
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
@@ -70,7 +70,7 @@ __kernel void computeNonbonded(
 #endif
                    real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
+                    if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;
@@ -138,7 +138,7 @@ __kernel void computeNonbonded(
 #endif
                    real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
+                    if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;
@@ -228,9 +228,9 @@ __kernel void computeNonbonded(
    while (pos < end) {
        const bool hasExclusions = false;
        bool includeTile = true;
        // Extract the coordinates of this tile.
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
@@ -304,7 +304,7 @@ __kernel void computeNonbonded(
                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
                        real r2 = dot(delta.xyz, delta.xyz);
-                        if (r2 < CUTOFF_SQUARED) {
+                        if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
                            real invR = RSQRT(r2);
                            real r = r2*invR;
                            unsigned int atom2 = j;
@@ -367,7 +367,7 @@ __kernel void computeNonbonded(
 #endif
                        real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
-                        if (r2 < CUTOFF_SQUARED) {
+                        if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                            real invR = RSQRT(r2);
                            real r = r2*invR;

--- a/platforms/opencl/staticTarget/CMakeLists.txt
+++ b/platforms/opencl/staticTarget/CMakeLists.txt
@@ -15,6 +15,6 @@ ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${AP
 TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME}  ${OPENCL_LIBRARIES} ${PTHREADS_LIB_STATIC})
 #-DPTW32_STATIC_LIB only works for the windows pthreads.
-SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_OPENCL_BUILDING_STATIC_LIBRARY -DPTW32_STATIC_LIB")
+SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_OPENCL_BUILDING_STATIC_LIBRARY -DPTW32_STATIC_LIB")
 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
--- a/platforms/opencl/tests/CMakeLists.txt
+++ b/platforms/opencl/tests/CMakeLists.txt
@@ -25,7 +25,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    # Link with shared library
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET})
-    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ADD_TEST(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
    IF (OPENMM_BUILD_OPENCL_DOUBLE_PRECISION_TESTS)

--- a/platforms/reference/src/SimTKReference/ReferenceConstraints.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceConstraints.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -98,14 +98,13 @@ ReferenceConstraints::ReferenceConstraints(const System& system) : ccma(NULL), s
    // Record the SETTLE clusters.
    vector<bool> isSettleAtom(numParticles, false);
-    int numSETTLE = settleClusters.size();
+    if (settleClusters.size() > 0) {
-    if (numSETTLE > 0) {
+        vector<int> atom1;
-        vector<int> atom1(numSETTLE);
+        vector<int> atom2;
-        vector<int> atom2(numSETTLE);
+        vector<int> atom3;
-        vector<int> atom3(numSETTLE);
+        vector<RealOpenMM> distance1;
-        vector<RealOpenMM> distance1(numSETTLE);
+        vector<RealOpenMM> distance2;
-        vector<RealOpenMM> distance2(numSETTLE);
+        for (int i = 0; i < settleClusters.size(); i++) {
-        for (int i = 0; i < numSETTLE; i++) {
            int p1 = settleClusters[i];
            int p2 = settleConstraints[p1].begin()->first;
            int p3 = (++settleConstraints[p1].begin())->first;
@@ -114,35 +113,36 @@ ReferenceConstraints::ReferenceConstraints(const System& system) : ccma(NULL), s
            float dist23 = settleConstraints[p2].find(p3)->second;
            if (dist12 == dist13) {
                // p1 is the central atom
-                atom1[i] = p1;
+                atom1.push_back(p1);
-                atom2[i] = p2;
+                atom2.push_back(p2);
-                atom3[i] = p3;
+                atom3.push_back(p3);
-                distance1[i] = dist12;
+                distance1.push_back(dist12);
-                distance2[i] = dist23;
+                distance2.push_back(dist23);
            }
            else if (dist12 == dist23) {
                // p2 is the central atom
-                atom1[i] = p2;
+                atom1.push_back(p2);
-                atom2[i] = p1;
+                atom2.push_back(p1);
-                atom3[i] = p3;
+                atom3.push_back(p3);
-                distance1[i] = dist12;
+                distance1.push_back(dist12);
-                distance2[i] = dist13;
+                distance2.push_back(dist13);
            }
            else if (dist13 == dist23) {
                // p3 is the central atom
-                atom1[i] = p3;
+                atom1.push_back(p3);
-                atom2[i] = p1;
+                atom2.push_back(p1);
-                atom3[i] = p2;
+                atom3.push_back(p2);
-                distance1[i] = dist13;
+                distance1.push_back(dist13);
-                distance2[i] = dist12;
+                distance2.push_back(dist12);
            }
            else
-                throw OpenMMException("Two of the three distances constrained with SETTLE must be the same.");
+                continue; // We can't handle this with SETTLE
            isSettleAtom[p1] = true;
            isSettleAtom[p2] = true;
            isSettleAtom[p3] = true;
        }
-        settle = new ReferenceSETTLEAlgorithm(atom1, atom2, atom3, distance1, distance2, masses);
+        if (atom1.size() > 0)
+            settle = new ReferenceSETTLEAlgorithm(atom1, atom2, atom3, distance1, distance2, masses);
    }
    // All other constraints are handled with CCMA.

--- a/platforms/reference/tests/CMakeLists.txt
+++ b/platforms/reference/tests/CMakeLists.txt
@@ -15,7 +15,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    ELSE (OPENMM_BUILD_SHARED_LIB)
        TARGET_LINK_LIBRARIES(${TEST_ROOT} ${STATIC_TARGET})
    ENDIF (OPENMM_BUILD_SHARED_LIB)
-    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
 ENDFOREACH(TEST_PROG ${TEST_PROGS})

--- a/plugins/amoeba/CMakeLists.txt
+++ b/plugins/amoeba/CMakeLists.txt
@@ -86,14 +86,14 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
 INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
 ADD_LIBRARY(${SHARED_AMOEBA_TARGET} SHARED ${SOURCE_AMOEBA_FILES} ${SOURCE_AMOEBA_INCLUDE_FILES} ${API_AMOEBA_ABS_INCLUDE_FILES})
-SET_TARGET_PROPERTIES(${SHARED_AMOEBA_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_AMOEBA_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY")
+SET_TARGET_PROPERTIES(${SHARED_AMOEBA_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_AMOEBA_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY")
 FILE(GLOB serialization_files  ${CMAKE_CUURENT_SOURCE_DIR}/serialization/src/*.cpp)
 SET_SOURCE_FILES_PROPERTIES(${serialization_files} PROPERTIES COMPILE_FLAGS "-DOPENMM_AMOEBA_BUILDING_SHARED_LIBRARY -DTIXML_USE_STL")
 IF(OPENMM_BUILD_STATIC_LIB)
  ADD_LIBRARY(${STATIC_AMOEBA_TARGET} STATIC ${SOURCE_AMOEBA_FILES} ${SOURCE_AMOEBA_INCLUDE_FILES} ${API_AMOEBA_ABS_INCLUDE_FILES})
-  SET_TARGET_PROPERTIES(${STATIC_AMOEBA_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_AMOEBA_BUILDING_STATIC_LIBRARY -DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY")
+  SET_TARGET_PROPERTIES(${STATIC_AMOEBA_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_AMOEBA_BUILDING_STATIC_LIBRARY -DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY")
 ENDIF(OPENMM_BUILD_STATIC_LIB)
 IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)

--- a/plugins/amoeba/platforms/cuda/CMakeLists.txt
+++ b/plugins/amoeba/platforms/cuda/CMakeLists.txt
@@ -94,7 +94,7 @@ SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE
 IF (APPLE)
    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
 ELSE (APPLE)
-    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
 ENDIF (APPLE)
 INSTALL(TARGETS ${SHARED_TARGET} DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/plugins)

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -806,8 +806,8 @@ private:
 CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : 
        CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false),
-        multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL), fracDipoles(NULL),
+        multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL), sphericalDipoles(NULL), sphericalQuadrupoles(NULL),
-        fracQuadrupoles(NULL), field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), inducedDipole(NULL),
+        fracDipoles(NULL), fracQuadrupoles(NULL), field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), inducedDipole(NULL),
        diisCoefficients(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), prevDipoles(NULL), prevDipolesPolar(NULL), prevDipolesGk(NULL),
        prevDipolesGkPolar(NULL), prevErrors(NULL), diisMatrix(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
        pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL),
@@ -826,6 +826,10 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
        delete labFrameDipoles;
    if (labFrameQuadrupoles != NULL)
        delete labFrameQuadrupoles;
+    if (sphericalDipoles != NULL)
+        delete sphericalDipoles;
+    if (sphericalQuadrupoles != NULL)
+        delete sphericalQuadrupoles;
    if (fracDipoles != NULL)
        delete fracDipoles;
    if (fracQuadrupoles != NULL)
@@ -985,6 +989,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
    labFrameDipoles = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "labFrameDipoles");
    labFrameQuadrupoles = new CudaArray(cu, 5*paddedNumAtoms, elementSize, "labFrameQuadrupoles");
+    sphericalDipoles = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "sphericalDipoles");
+    sphericalQuadrupoles = new CudaArray(cu, 5*paddedNumAtoms, elementSize, "sphericalQuadrupoles");
    fracDipoles = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "fracDipoles");
    fracQuadrupoles = new CudaArray(cu, 6*paddedNumAtoms, elementSize, "fracQuadrupoles");
    field = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "field");
@@ -1144,6 +1150,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    if (maxInducedIterations > 0) {
        defines["THREAD_BLOCK_SIZE"] = cu.intToString(inducedFieldThreads);
        defines["MAX_PREV_DIIS_DIPOLES"] = cu.intToString(MaxPrevDIISDipoles);
+        defines["USE_MUTUAL_POLARIZATION"] = "1";
        module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleInducedField, defines);
        computeInducedFieldKernel = cu.getKernel(module, "computeInducedField");
        updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldByDIIS");
@@ -1151,33 +1158,13 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        buildMatrixKernel = cu.getKernel(module, "computeDIISMatrix");
    }
    stringstream electrostaticsSource;
-    if (usePME) {
+    electrostaticsSource << CudaKernelSources::vectorOps;
-        electrostaticsSource << CudaKernelSources::vectorOps;
+    electrostaticsSource << CudaAmoebaKernelSources::sphericalMultipoles;
+    if (usePME)
        electrostaticsSource << CudaAmoebaKernelSources::pmeMultipoleElectrostatics;
-        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::pmeElectrostaticPairForce : CudaAmoebaKernelSources::pmeElectrostaticPairForceNoQuadrupoles);
+    else
-        electrostaticsSource << "#define APPLY_SCALE\n";
-        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::pmeElectrostaticPairForce : CudaAmoebaKernelSources::pmeElectrostaticPairForceNoQuadrupoles);
-        electrostaticsThreadMemory = 24*elementSize+3*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
-        if (!useShuffle)
-            electrostaticsThreadMemory += 3*elementSize;
-    }
-    else {
-        electrostaticsSource << CudaKernelSources::vectorOps;
        electrostaticsSource << CudaAmoebaKernelSources::multipoleElectrostatics;
-        electrostaticsSource << "#define F1\n";
+    electrostaticsThreadMemory = 24*elementSize+3*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
-        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::electrostaticPairForce : CudaAmoebaKernelSources::electrostaticPairForceNoQuadrupoles);
-        electrostaticsSource << "#undef F1\n";
-        electrostaticsSource << "#define T1\n";
-        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::electrostaticPairForce : CudaAmoebaKernelSources::electrostaticPairForceNoQuadrupoles);
-        electrostaticsSource << "#undef T1\n";
-        electrostaticsSource << "#define T3\n";
-        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::electrostaticPairForce : CudaAmoebaKernelSources::electrostaticPairForceNoQuadrupoles);
-        electrostaticsThreadMemory = 21*elementSize+2*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
-        if (!useShuffle)
-            electrostaticsThreadMemory += 3*elementSize;
-        if (gk != NULL)
-            electrostaticsThreadMemory += 4*elementSize;
-    }
    electrostaticsThreads = min(maxThreads, cu.computeThreadBlockSize(electrostaticsThreadMemory));
    defines["THREAD_BLOCK_SIZE"] = cu.intToString(electrostaticsThreads);
    module = cu.createModule(electrostaticsSource.str(), defines);
@@ -1433,7 +1420,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
    void* computeMomentsArgs[] = {&cu.getPosq().getDevicePointer(), &multipoleParticles->getDevicePointer(),
        &molecularDipoles->getDevicePointer(), &molecularQuadrupoles->getDevicePointer(),
-        &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer()};
+        &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
+        &sphericalDipoles->getDevicePointer(), &sphericalQuadrupoles->getDevicePointer()};
    cu.executeKernel(computeMomentsKernel, computeMomentsArgs, cu.getNumAtoms());
    int startTileIndex = nb.getStartTileIndex();
    int numTileIndices = nb.getNumTiles();
@@ -1497,8 +1485,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        void* electrostaticsArgs[] = {&cu.getForce().getDevicePointer(), &torque->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
            &cu.getPosq().getDevicePointer(), &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(),
            &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
-            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
+            &sphericalDipoles->getDevicePointer(), &sphericalQuadrupoles->getDevicePointer(),
-            &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
+            &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
        cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
        if (gkKernel != NULL)
            gkKernel->finishComputation(*torque, *labFrameDipoles, *labFrameQuadrupoles, *inducedDipole, *inducedDipolePolar, *dampingAndThole, *covalentFlags, *polarizationGroupFlags);
@@ -1652,8 +1640,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(),
            cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
            &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
-            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
+            &sphericalDipoles->getDevicePointer(), &sphericalQuadrupoles->getDevicePointer(),
-            &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
+            &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
        cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
        void* pmeTransformInducedPotentialArgs[] = {&pmePhidp->getDevicePointer(), &pmeCphi->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeTransformPotentialKernel, pmeTransformInducedPotentialArgs, cu.getNumAtoms());

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -392,6 +392,8 @@ private:
    CudaArray* molecularQuadrupoles;
    CudaArray* labFrameDipoles;
    CudaArray* labFrameQuadrupoles;
+    CudaArray* sphericalDipoles;
+    CudaArray* sphericalQuadrupoles;
    CudaArray* fracDipoles;
    CudaArray* fracQuadrupoles;
    CudaArray* field;

--- a/plugins/amoeba/platforms/cuda/src/kernels/electrostaticPairForce.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/electrostaticPairForce.cu
-/**
- * This defines three different closely related functions, depending on which constant (F1, T1, or T3) is defined.
- *
- * All three variants evaluate the same pairwise multipole interaction between atom1 and atom2
- * (monopole term posq.w, dipole, quadrupole, inducedDipole and inducedDipolePolar), and differ
- * only in which result is written:
- *
- *   - F1: computeOneInteractionF1 stores em+ei in `energy` and -(ftm2+ftm2i) in `outputForce`.
- *   - T1: computeOneInteractionT1 stores ttm2+ttm2i in `outputForce`.
- *   - T3: computeOneInteractionT3 stores ttm3+ttm3i in `outputForce`.
- *
- * NOTE(review): the T3 signature is selected by the plain #else branch, but the T3 terms in the
- * body are guarded by `defined T3`. Presumably the including file always defines exactly one of
- * F1, T1 or T3; confirm at the include site.
- * NOTE(review): the names ftm and ttm suggest force and torque terms respectively, with ttm2
- * belonging to atom1 and ttm3 to atom2; this is inferred from naming only, verify against the caller.
- *
- * @param atom1        first atom of the pair; posq, dipole, quadrupole*, inducedDipole,
- *                     inducedDipolePolar, damp and thole are read
- * @param atom2        second atom of the pair; the same fields are read
- * @param dScale       factor applied to the damped rr3/rr5/rr7 terms that multiply inducedDipolePolar quantities (dsc3, dsc5, dsc7)
- * @param pScale       factor applied to the damped rr3/rr5/rr7 terms that multiply inducedDipole quantities (psc3, psc5, psc7)
- * @param mScale       factor applied to the terms that contain no induced dipoles (em, ftm2, ttm2, ttm3)
- * @param energy       (F1 only) output: receives em+ei
- * @param outputForce  output: receives the variant-specific vector described above
- */
-#if defined F1
-__device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real& energy, real3& outputForce) {
-#elif defined T1
-__device__ void computeOneInteractionT1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce) {
-#else
-__device__ void computeOneInteractionT3(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce) {
-#endif
-#ifdef F1
-    const float uScale = 1;
-    real ddsc3_0 = 0;
-    real ddsc3_1 = 0;
-    real ddsc3_2 = 0;
-    real ddsc5_0 = 0;
-    real ddsc5_1 = 0;
-    real ddsc5_2 = 0;
-    real ddsc7_0 = 0;
-    real ddsc7_1 = 0;
-    real ddsc7_2 = 0;
-#endif
-    real xr = atom2.posq.x - atom1.posq.x; // (xr, yr, zr) is the displacement from atom1 to atom2
-    real yr = atom2.posq.y - atom1.posq.y;
-    real zr = atom2.posq.z - atom1.posq.z;
-    real r2 = xr*xr + yr*yr + zr*zr;
-    real r = SQRT(r2);
-    real rr1 = RECIP(r);
-    real rr2 = rr1*rr1;
-    real rr3 = rr1*rr2;
-    real rr5 = 3*rr3*rr2; // 3/r^5
-    real rr7 = 5*rr5*rr2; // 15/r^7
-    real rr9 = 7*rr7*rr2; // 105/r^9
-#ifdef F1
-    real rr11 = 9*rr9*rr2; // 945/r^11
-#endif
-    real scale3 = 1;
-    real scale5 = 1;
-    real scale7 = 1;
-    real pdamp = atom1.damp*atom2.damp;
-    if (pdamp != 0) { // when either damp is zero the scale factors stay at 1 and the ddsc* gradients stay at 0
-        real ratio = r/pdamp;
-        float pGamma = atom2.thole > atom1.thole ? atom1.thole : atom2.thole; // smaller of the two thole values
-        real damp = ratio*ratio*ratio*pGamma;
-        real dampExp = EXP(-damp);
-        real damp1 = damp + 1;
-        real damp2 = damp*damp;
-        scale3 = 1 - dampExp;
-        scale5 = 1 - damp1*dampExp;
-        scale7 = 1 - (damp1 + 0.6f*damp2)*dampExp;
-#ifdef F1
-        real factor = 3*damp*dampExp*rr2;
-        real factor7 = -0.2f + 0.6f*damp;
-        ddsc3_0 = factor*xr;
-        ddsc5_0 = ddsc3_0*damp;
-        ddsc7_0 = ddsc5_0*factor7;
-        ddsc3_1 = factor*yr;
-        ddsc5_1 = ddsc3_1*damp;
-        ddsc7_1 = ddsc5_1*factor7;
-        ddsc3_2 = factor*zr;
-        ddsc5_2 = ddsc3_2*damp;
-        ddsc7_2 = ddsc5_2*factor7;
-#endif
-    }
-#if defined F1
-    real scale3i = rr3*scale3*uScale;
-    real scale5i = rr5*scale5*uScale;
-#endif
-    real dsc3 = rr3*scale3*dScale;
-    real psc3 = rr3*scale3*pScale;
-    real dsc5 = rr5*scale5*dScale;
-    real psc5 = rr5*scale5*pScale;
-    real dsc7 = rr7*scale7*dScale;
-    real psc7 = rr7*scale7*pScale;
-    real atom2quadrupoleZZ = -(atom2.quadrupoleXX+atom2.quadrupoleYY); // ZZ is derived as -(XX+YY), i.e. the tensor is treated as traceless
-    real qJr_0 = atom2.quadrupoleXX*xr + atom2.quadrupoleXY*yr + atom2.quadrupoleXZ*zr;
-    real qJr_1 = atom2.quadrupoleXY*xr + atom2.quadrupoleYY*yr + atom2.quadrupoleYZ*zr;
-    real qJr_2 = atom2.quadrupoleXZ*xr + atom2.quadrupoleYZ*yr + atom2quadrupoleZZ*zr;
-    real atom1quadrupoleZZ = -(atom1.quadrupoleXX+atom1.quadrupoleYY); // same traceless reconstruction for atom1
-    real qIr_0 = atom1.quadrupoleXX*xr + atom1.quadrupoleXY*yr + atom1.quadrupoleXZ*zr;
-    real qIr_1 = atom1.quadrupoleXY*xr + atom1.quadrupoleYY*yr + atom1.quadrupoleYZ*zr;
-    real qIr_2 = atom1.quadrupoleXZ*xr + atom1.quadrupoleYZ*yr + atom1quadrupoleZZ*zr;
-#if defined F1
-    real sc2 = atom1.dipole.x*atom2.dipole.x + atom1.dipole.y*atom2.dipole.y + atom1.dipole.z*atom2.dipole.z;
-#endif
-#if defined F1 || defined T1
-    real sc4 = atom2.dipole.x*xr + atom2.dipole.y*yr + atom2.dipole.z*zr;
-    real sc6 = qJr_0*xr + qJr_1*yr + qJr_2*zr;
-#endif
-#if defined F1 || defined T3
-    real sc3 = atom1.dipole.x*xr + atom1.dipole.y*yr + atom1.dipole.z*zr;
-    real sc5 = qIr_0*xr + qIr_1*yr + qIr_2*zr;
-#endif
-#if defined F1
-    real sc7 = qIr_0*atom2.dipole.x + qIr_1*atom2.dipole.y + qIr_2*atom2.dipole.z;
-    real sc8 = qJr_0*atom1.dipole.x + qJr_1*atom1.dipole.y + qJr_2*atom1.dipole.z;
-    real sc9 = qIr_0*qJr_0 + qIr_1*qJr_1 + qIr_2*qJr_2;
-    real sc10 = atom1.quadrupoleXX*atom2.quadrupoleXX + atom1.quadrupoleXY*atom2.quadrupoleXY + atom1.quadrupoleXZ*atom2.quadrupoleXZ +
-                atom1.quadrupoleXY*atom2.quadrupoleXY + atom1.quadrupoleYY*atom2.quadrupoleYY + atom1.quadrupoleYZ*atom2.quadrupoleYZ +
-                atom1.quadrupoleXZ*atom2.quadrupoleXZ + atom1.quadrupoleYZ*atom2.quadrupoleYZ + atom1quadrupoleZZ*atom2quadrupoleZZ;
-    real sci1 = atom1.inducedDipole.x*atom2.dipole.x + atom1.inducedDipole.y*atom2.dipole.y + atom1.inducedDipole.z*atom2.dipole.z +
-                atom2.inducedDipole.x*atom1.dipole.x + atom2.inducedDipole.y*atom1.dipole.y + atom2.inducedDipole.z*atom1.dipole.z;
-#endif
-#if defined F1 || defined T3
-    real sci3 = atom1.inducedDipole.x*xr + atom1.inducedDipole.y*yr + atom1.inducedDipole.z*zr;
-#endif
-#if defined F1
-    real sci7 = qIr_0*atom2.inducedDipole.x + qIr_1*atom2.inducedDipole.y + qIr_2*atom2.inducedDipole.z;
-    real sci8 = qJr_0*atom1.inducedDipole.x + qJr_1*atom1.inducedDipole.y + qJr_2*atom1.inducedDipole.z;
-#endif
-#if defined F1 || defined T1
-    real sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
-#endif
-#if defined F1
-    real scip1 = atom1.inducedDipolePolar.x*atom2.dipole.x + atom1.inducedDipolePolar.y*atom2.dipole.y + atom1.inducedDipolePolar.z*atom2.dipole.z +
-                 atom2.inducedDipolePolar.x*atom1.dipole.x + atom2.inducedDipolePolar.y*atom1.dipole.y + atom2.inducedDipolePolar.z*atom1.dipole.z;
-    real scip2 = atom1.inducedDipole.x*atom2.inducedDipolePolar.x + atom1.inducedDipole.y*atom2.inducedDipolePolar.y + atom1.inducedDipole.z*atom2.inducedDipolePolar.z +
-                 atom2.inducedDipole.x*atom1.inducedDipolePolar.x + atom2.inducedDipole.y*atom1.inducedDipolePolar.y + atom2.inducedDipole.z*atom1.inducedDipolePolar.z;
-#endif
-#if defined F1 || defined T3
-    real scip3 = ((atom1.inducedDipolePolar.x)*(xr) + (atom1.inducedDipolePolar.y)*(yr) + (atom1.inducedDipolePolar.z)*(zr));
-#endif
-#if defined F1 || defined T1
-    real scip4 = ((atom2.inducedDipolePolar.x)*(xr) + (atom2.inducedDipolePolar.y)*(yr) + (atom2.inducedDipolePolar.z)*(zr));
-#endif
-#ifdef F1
-    real scip7 = ((qIr_0)*(atom2.inducedDipolePolar.x) + (qIr_1)*(atom2.inducedDipolePolar.y) + (qIr_2)*(atom2.inducedDipolePolar.z));
-    real scip8 = ((qJr_0)*(atom1.inducedDipolePolar.x) + (qJr_1)*(atom1.inducedDipolePolar.y) + (qJr_2)*(atom1.inducedDipolePolar.z));
-    real gli1 = atom2.posq.w*sci3 - atom1.posq.w*sci4;
-    real gli6 = sci1;
-    real glip1 = atom2.posq.w*scip3 - atom1.posq.w*scip4;
-    real glip6 = scip1;
-    real gli2 = -sc3*sci4 - sci3*sc4;
-    real gli3 = sci3*sc6 - sci4*sc5;
-    real gli7 = 2*(sci7-sci8);
-    real glip2 = -sc3*scip4 - scip3*sc4;
-    real glip3 = scip3*sc6 - scip4*sc5;
-    real glip7 = 2*(scip7-scip8);
-    real factor3 = rr3*((gli1  +  gli6)*pScale + (glip1  + glip6)*dScale);
-    real factor5 = rr5*((gli2  +  gli7)*pScale + (glip2  + glip7)*dScale);
-    real factor7 = rr7*(gli3*pScale + glip3*dScale);
-    real ftm2i_0 = -0.5f*(factor3*ddsc3_0 + factor5*ddsc5_0 + factor7*ddsc7_0);
-    real ftm2i_1 = -0.5f*(factor3*ddsc3_1 + factor5*ddsc5_1 + factor7*ddsc7_1);
-    real ftm2i_2 = -0.5f*(factor3*ddsc3_2 + factor5*ddsc5_2 + factor7*ddsc7_2);
-    real gl0 = atom1.posq.w*atom2.posq.w;
-    real gl1 = atom2.posq.w*sc3 - atom1.posq.w*sc4;
-    real gl2 = atom1.posq.w*sc6 + atom2.posq.w*sc5 - sc3*sc4;
-    real gl3 = sc3*sc6 - sc4*sc5;
-    real gl4 = sc5*sc6;
-    real gl6 = sc2;
-    real gl7 = 2*(sc7-sc8);
-    real gl8 = 2*sc10;
-    real gl5 = -4*sc9;
-    real gf1 = rr3*gl0 + rr5*(gl1+gl6) + rr7*(gl2+gl7+gl8) + rr9*(gl3+gl5) + rr11*gl4;
-#endif
-#if defined F1 || defined T1
-    real gf2 = -atom2.posq.w*rr3 + sc4*rr5 - sc6*rr7;
-    real gf5 = 2*(-atom2.posq.w*rr5+sc4*rr7-sc6*rr9);
-#endif
-#if defined F1 || defined T3
-    real gf3 =  atom1.posq.w*rr3 + sc3*rr5 + sc5*rr7;
-    real gf6 = 2*(-atom1.posq.w*rr5-sc3*rr7-sc5*rr9);
-#endif
-#ifdef F1
-    real em = mScale*(rr1*gl0 + rr3*(gl1+gl6) + rr5*(gl2+gl7+gl8) + rr7*(gl3+gl5) + rr9*gl4);
-    real ei = 0.5f*((gli1+gli6)*psc3 + (gli2+gli7)*psc5 + gli3*psc7);
-    energy = em+ei; // em holds the terms without induced dipoles (scaled by mScale); ei holds the inducedDipole terms (scaled through psc3/psc5/psc7)
-#endif
-#if defined F1 || defined T1
-    real qIdJ_0 = atom1.quadrupoleXX*atom2.dipole.x + atom1.quadrupoleXY*atom2.dipole.y + atom1.quadrupoleXZ*atom2.dipole.z;
-    real qIdJ_1 = atom1.quadrupoleXY*atom2.dipole.x + atom1.quadrupoleYY*atom2.dipole.y + atom1.quadrupoleYZ*atom2.dipole.z;
-    real qIdJ_2 = atom1.quadrupoleXZ*atom2.dipole.x + atom1.quadrupoleYZ*atom2.dipole.y + atom1quadrupoleZZ*atom2.dipole.z;
-    real qIqJr_0 = atom1.quadrupoleXX*qJr_0 + atom1.quadrupoleXY*qJr_1 + atom1.quadrupoleXZ*qJr_2;
-    real qIqJr_1 = atom1.quadrupoleXY*qJr_0 + atom1.quadrupoleYY*qJr_1 + atom1.quadrupoleYZ*qJr_2;
-    real qIqJr_2 = atom1.quadrupoleXZ*qJr_0 + atom1.quadrupoleYZ*qJr_1 + atom1quadrupoleZZ*qJr_2;
-#endif
-#ifdef F1
-    real qkqir_0 = atom2.quadrupoleXX*qIr_0 + atom2.quadrupoleXY*qIr_1 + atom2.quadrupoleXZ*qIr_2;
-    real qkqir_1 = atom2.quadrupoleXY*qIr_0 + atom2.quadrupoleYY*qIr_1 + atom2.quadrupoleYZ*qIr_2;
-    real qkqir_2 = atom2.quadrupoleXZ*qIr_0 + atom2.quadrupoleYZ*qIr_1 + atom2quadrupoleZZ*qIr_2;
-    real qkdi_0 = atom2.quadrupoleXX*atom1.dipole.x + atom2.quadrupoleXY*atom1.dipole.y + atom2.quadrupoleXZ*atom1.dipole.z;
-    real qkdi_1 = atom2.quadrupoleXY*atom1.dipole.x + atom2.quadrupoleYY*atom1.dipole.y + atom2.quadrupoleYZ*atom1.dipole.z;
-    real qkdi_2 = atom2.quadrupoleXZ*atom1.dipole.x + atom2.quadrupoleYZ*atom1.dipole.y + atom2quadrupoleZZ*atom1.dipole.z;
-    real ftm2_0 = mScale*(gf1*xr + gf2*atom1.dipole.x + gf3*atom2.dipole.x + 2*rr5*(qkdi_0 - qIdJ_0) + gf5*qIr_0 + gf6*qJr_0 + 4*rr7*(qIqJr_0 + qkqir_0));
-    real ftm2_1 = mScale*(gf1*yr + gf2*atom1.dipole.y + gf3*atom2.dipole.y + 2*rr5*(qkdi_1 - qIdJ_1) + gf5*qIr_1 + gf6*qJr_1 + 4*rr7*(qIqJr_1 + qkqir_1));
-    real ftm2_2 = mScale*(gf1*zr + gf2*atom1.dipole.z + gf3*atom2.dipole.z + 2*rr5*(qkdi_2 - qIdJ_2) + gf5*qIr_2 + gf6*qJr_2 + 4*rr7*(qIqJr_2 + qkqir_2));
-    real gfi1 = rr2*(1.5f*((gli1+gli6)*psc3 + (glip1+glip6)*dsc3 + scip2*scale3i) + 2.5f*((gli7+gli2)*psc5 + (glip7+glip2)*dsc5 - (sci3*scip4+scip3*sci4)*scale5i) + 3.5f*(gli3*psc7+glip3*dsc7));
-    ftm2i_0 += gfi1*xr;
-    ftm2i_1 += gfi1*yr;
-    ftm2i_2 += gfi1*zr;
-#endif
-#if defined F1 || defined T1
-    real gfi5 = (sci4*psc7 + scip4*dsc7);
-#endif
-#if defined F1 || defined T3
-    real gfi6 = -(sci3*psc7 + scip3*dsc7);
-#endif
-#if defined F1 || defined T1
-    real qIuJ_0 = atom1.quadrupoleXX*atom2.inducedDipole.x   + atom1.quadrupoleXY*atom2.inducedDipole.y  + atom1.quadrupoleXZ*atom2.inducedDipole.z;
-    real qIuJ_1 = atom1.quadrupoleXY*atom2.inducedDipole.x   + atom1.quadrupoleYY*atom2.inducedDipole.y  + atom1.quadrupoleYZ*atom2.inducedDipole.z;
-    real qIuJ_2 = atom1.quadrupoleXZ*atom2.inducedDipole.x   + atom1.quadrupoleYZ*atom2.inducedDipole.y  + atom1quadrupoleZZ*atom2.inducedDipole.z;
-    real qIuJp_0 = atom1.quadrupoleXX*atom2.inducedDipolePolar.x + atom1.quadrupoleXY*atom2.inducedDipolePolar.y + atom1.quadrupoleXZ*atom2.inducedDipolePolar.z;
-    real qIuJp_1 = atom1.quadrupoleXY*atom2.inducedDipolePolar.x + atom1.quadrupoleYY*atom2.inducedDipolePolar.y + atom1.quadrupoleYZ*atom2.inducedDipolePolar.z;
-    real qIuJp_2 = atom1.quadrupoleXZ*atom2.inducedDipolePolar.x + atom1.quadrupoleYZ*atom2.inducedDipolePolar.y + atom1quadrupoleZZ*atom2.inducedDipolePolar.z;
-#endif
-#if defined T3
-    real qJuIp_0 = atom2.quadrupoleXX*atom1.inducedDipolePolar.x + atom2.quadrupoleXY*atom1.inducedDipolePolar.y + atom2.quadrupoleXZ*atom1.inducedDipolePolar.z;
-    real qJuIp_1 = atom2.quadrupoleXY*atom1.inducedDipolePolar.x + atom2.quadrupoleYY*atom1.inducedDipolePolar.y + atom2.quadrupoleYZ*atom1.inducedDipolePolar.z;
-    real qJuIp_2 = atom2.quadrupoleXZ*atom1.inducedDipolePolar.x + atom2.quadrupoleYZ*atom1.inducedDipolePolar.y + atom2quadrupoleZZ*atom1.inducedDipolePolar.z;
-     real qJuI_0 = atom2.quadrupoleXX*atom1.inducedDipole.x + atom2.quadrupoleXY*atom1.inducedDipole.y + atom2.quadrupoleXZ*atom1.inducedDipole.z;
-     real qJuI_1 = atom2.quadrupoleXY*atom1.inducedDipole.x + atom2.quadrupoleYY*atom1.inducedDipole.y + atom2.quadrupoleYZ*atom1.inducedDipole.z;
-     real qJuI_2 = atom2.quadrupoleXZ*atom1.inducedDipole.x + atom2.quadrupoleYZ*atom1.inducedDipole.y + atom2quadrupoleZZ*atom1.inducedDipole.z;
-#endif
-#ifdef F1
-    real qkui_0 = atom2.quadrupoleXX*atom1.inducedDipole.x + atom2.quadrupoleXY*atom1.inducedDipole.y + atom2.quadrupoleXZ*atom1.inducedDipole.z;
-    real qkui_1 = atom2.quadrupoleXY*atom1.inducedDipole.x + atom2.quadrupoleYY*atom1.inducedDipole.y + atom2.quadrupoleYZ*atom1.inducedDipole.z;
-    real qkui_2 = atom2.quadrupoleXZ*atom1.inducedDipole.x + atom2.quadrupoleYZ*atom1.inducedDipole.y + atom2quadrupoleZZ*atom1.inducedDipole.z;
-    real qkuip_0 = atom2.quadrupoleXX*atom1.inducedDipolePolar.x + atom2.quadrupoleXY*atom1.inducedDipolePolar.y + atom2.quadrupoleXZ*atom1.inducedDipolePolar.z;
-    real qkuip_1 = atom2.quadrupoleXY*atom1.inducedDipolePolar.x + atom2.quadrupoleYY*atom1.inducedDipolePolar.y + atom2.quadrupoleYZ*atom1.inducedDipolePolar.z;
-    real qkuip_2 = atom2.quadrupoleXZ*atom1.inducedDipolePolar.x + atom2.quadrupoleYZ*atom1.inducedDipolePolar.y + atom2quadrupoleZZ*atom1.inducedDipolePolar.z;
-    ftm2i_0 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.x*psc3 + atom1.inducedDipolePolar.x*dsc3) +
-               sc4*(atom1.inducedDipole.x*psc5 + atom1.inducedDipolePolar.x*dsc5) -
-               sc6*(atom1.inducedDipole.x*psc7 + atom1.inducedDipolePolar.x*dsc7)) +
-               0.5f*(atom1.posq.w*(atom2.inducedDipole.x*psc3+atom2.inducedDipolePolar.x*dsc3) +
-               sc3*(atom2.inducedDipole.x*psc5 +atom2.inducedDipolePolar.x*dsc5) +
-               sc5*(atom2.inducedDipole.x*psc7 +atom2.inducedDipolePolar.x*dsc7)) +
-               scale5i*(sci4*atom1.inducedDipolePolar.x+scip4*atom1.inducedDipole.x +
-                        sci3*atom2.inducedDipolePolar.x+scip3*atom2.inducedDipole.x)*0.5f +
-               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.x +
-               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.x +
-               ((qkui_0-qIuJ_0)*psc5 + (qkuip_0-qIuJp_0)*dsc5) +
-               gfi5*qIr_0 + gfi6*qJr_0;
-    ftm2i_1 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.y*psc3 + atom1.inducedDipolePolar.y*dsc3) +
-               sc4*(atom1.inducedDipole.y*psc5 + atom1.inducedDipolePolar.y*dsc5) -
-               sc6*(atom1.inducedDipole.y*psc7 + atom1.inducedDipolePolar.y*dsc7)) +
-               (atom1.posq.w*(atom2.inducedDipole.y*psc3+atom2.inducedDipolePolar.y*dsc3) +
-                    sc3*(atom2.inducedDipole.y*psc5+atom2.inducedDipolePolar.y*dsc5) +
-                    sc5*(atom2.inducedDipole.y*psc7+atom2.inducedDipolePolar.y*dsc7))*0.5f +
-                    scale5i*(sci4*atom1.inducedDipolePolar.y+scip4*atom1.inducedDipole.y + sci3*atom2.inducedDipolePolar.y+scip3*atom2.inducedDipole.y)*0.5f +
-               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.y +
-               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.y +
-               ((qkui_1-qIuJ_1)*psc5 + (qkuip_1-qIuJp_1)*dsc5) +
-               gfi5*qIr_1 + gfi6*qJr_1;
-    ftm2i_2 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.z*psc3 + atom1.inducedDipolePolar.z*dsc3) +
-               sc4*(atom1.inducedDipole.z*psc5 + atom1.inducedDipolePolar.z*dsc5) -
-               sc6*(atom1.inducedDipole.z*psc7 + atom1.inducedDipolePolar.z*dsc7)) +
-               (atom1.posq.w*(atom2.inducedDipole.z*psc3+atom2.inducedDipolePolar.z*dsc3) +
-                    sc3*(atom2.inducedDipole.z*psc5+atom2.inducedDipolePolar.z*dsc5) +
-                    sc5*(atom2.inducedDipole.z*psc7+atom2.inducedDipolePolar.z*dsc7))*0.5f +
-                    scale5i*(sci4*atom1.inducedDipolePolar.z+scip4*atom1.inducedDipole.z +
-                    sci3*atom2.inducedDipolePolar.z+scip3*atom2.inducedDipole.z)*0.5f +
-               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.z +
-               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.z +
-               ((qkui_2-qIuJ_2)*psc5 + (qkuip_2-qIuJp_2)*dsc5) +
-               gfi5*qIr_2 + gfi6*qJr_2;
-#ifdef DIRECT_POLARIZATION
-    real gfd = 0.5*(3*rr2*scip2*scale3i - 5*rr2*(scip3*sci4+sci3*scip4)*scale5i); // NOTE(review): 0.5 is a double literal here, unlike the 0.5f used elsewhere in this function
-    real temp5 = 0.5*scale5i; // NOTE(review): double literal 0.5 again
-    real fdir_0 = gfd*xr + temp5*(sci4*atom1.inducedDipolePolar.x + scip4*atom1.inducedDipole.x + sci3*atom2.inducedDipolePolar.x + scip3*atom2.inducedDipole.x);
-    real fdir_1 = gfd*yr + temp5*(sci4*atom1.inducedDipolePolar.y + scip4*atom1.inducedDipole.y + sci3*atom2.inducedDipolePolar.y + scip3*atom2.inducedDipole.y);
-    real fdir_2 = gfd*zr + temp5*(sci4*atom1.inducedDipolePolar.z + scip4*atom1.inducedDipole.z + sci3*atom2.inducedDipolePolar.z + scip3*atom2.inducedDipole.z);
-    ftm2i_0 -= fdir_0;
-    ftm2i_1 -= fdir_1;
-    ftm2i_2 -= fdir_2;
-#else
-    real scaleF = 0.5f*uScale;
-    real inducedFactor3 = scip2*rr3*scaleF;
-    real inducedFactor5 = (sci3*scip4+scip3*sci4)*rr5*scaleF;
-    real findmp_0 = inducedFactor3*ddsc3_0 - inducedFactor5*ddsc5_0;
-    real findmp_1 = inducedFactor3*ddsc3_1 - inducedFactor5*ddsc5_1;
-    real findmp_2 = inducedFactor3*ddsc3_2 - inducedFactor5*ddsc5_2;
-    ftm2i_0 -= findmp_0;
-    ftm2i_1 -= findmp_1;
-    ftm2i_2 -= findmp_2;
-#endif
-#endif
-#if defined T1
-    real gti2 = 0.5f*(sci4*psc5+scip4*dsc5);
-    real gti5 = gfi5;
-#endif
-#if defined T3
-    real gti3 = 0.5f*(sci3*psc5+scip3*dsc5);
-    real gti6 = gfi6;
-#endif
-#if defined T1 || defined T3
-    real dixdk_0 = atom1.dipole.y*atom2.dipole.z - atom1.dipole.z*atom2.dipole.y;
-    real dixdk_1 = atom1.dipole.z*atom2.dipole.x - atom1.dipole.x*atom2.dipole.z;
-    real dixdk_2 = atom1.dipole.x*atom2.dipole.y - atom1.dipole.y*atom2.dipole.x;
-#if defined T1
-    real dixuk_0 = atom1.dipole.y*atom2.inducedDipole.z - atom1.dipole.z*atom2.inducedDipole.y;
-    real dixuk_1 = atom1.dipole.z*atom2.inducedDipole.x - atom1.dipole.x*atom2.inducedDipole.z;
-    real dixuk_2 = atom1.dipole.x*atom2.inducedDipole.y - atom1.dipole.y*atom2.inducedDipole.x;
-#endif
-#endif
-#ifdef T1
-    real dixukp_0 = atom1.dipole.y*atom2.inducedDipolePolar.z - atom1.dipole.z*atom2.inducedDipolePolar.y;
-    real dixukp_1 = atom1.dipole.z*atom2.inducedDipolePolar.x - atom1.dipole.x*atom2.inducedDipolePolar.z;
-    real dixukp_2 = atom1.dipole.x*atom2.inducedDipolePolar.y - atom1.dipole.y*atom2.inducedDipolePolar.x;
-#endif
-#ifdef T1
-    real dixr_0 = atom1.dipole.y*zr - atom1.dipole.z*yr;
-    real dixr_1 = atom1.dipole.z*xr - atom1.dipole.x*zr;
-    real dixr_2 = atom1.dipole.x*yr - atom1.dipole.y*xr;
-#endif
-#ifdef T1
-    real rxqiukp_0 = yr*qIuJp_2 - zr*qIuJp_1;
-    real rxqiukp_1 = zr*qIuJp_0 - xr*qIuJp_2;
-    real rxqiukp_2 = xr*qIuJp_1 - yr*qIuJp_0;
-    real rxqir_0 = yr*qIr_2 - zr*qIr_1;
-    real rxqir_1 = zr*qIr_0 - xr*qIr_2;
-    real rxqir_2 = xr*qIr_1 - yr*qIr_0;
-    real rxqiuk_0 = yr*qIuJ_2 - zr*qIuJ_1;
-    real rxqiuk_1 = zr*qIuJ_0 - xr*qIuJ_2;
-    real rxqiuk_2 = xr*qIuJ_1 - yr*qIuJ_0;
-    real ukxqir_0 = atom2.inducedDipole.y*qIr_2 - atom2.inducedDipole.z*qIr_1;
-    real ukxqir_1 = atom2.inducedDipole.z*qIr_0 - atom2.inducedDipole.x*qIr_2;
-    real ukxqir_2 = atom2.inducedDipole.x*qIr_1 - atom2.inducedDipole.y*qIr_0;
-    real ukxqirp_0 = atom2.inducedDipolePolar.y*qIr_2 - atom2.inducedDipolePolar.z*qIr_1;
-    real ukxqirp_1 = atom2.inducedDipolePolar.z*qIr_0 - atom2.inducedDipolePolar.x*qIr_2;
-    real ukxqirp_2 = atom2.inducedDipolePolar.x*qIr_1 - atom2.inducedDipolePolar.y*qIr_0;
-    real dixqkr_0 = atom1.dipole.y*qJr_2 - atom1.dipole.z*qJr_1;
-    real dixqkr_1 = atom1.dipole.z*qJr_0 - atom1.dipole.x*qJr_2;
-    real dixqkr_2 = atom1.dipole.x*qJr_1 - atom1.dipole.y*qJr_0;
-    real dkxqir_0 = atom2.dipole.y*qIr_2 - atom2.dipole.z*qIr_1;
-    real dkxqir_1 = atom2.dipole.z*qIr_0 - atom2.dipole.x*qIr_2;
-    real dkxqir_2 = atom2.dipole.x*qIr_1 - atom2.dipole.y*qIr_0;
-    real rxqikr_0 = yr*qIqJr_2 - zr*qIqJr_1;
-    real rxqikr_1 = zr*qIqJr_0 - xr*qIqJr_2;
-    real rxqikr_2 = xr*qIqJr_1 - yr*qIqJr_0;
-    real rxqidk_0 = yr*qIdJ_2 - zr*qIdJ_1;
-    real rxqidk_1 = zr*qIdJ_0 - xr*qIdJ_2;
-    real rxqidk_2 = xr*qIdJ_1 - yr*qIdJ_0;
-    real qkrxqir_0 = qJr_1*qIr_2 - qJr_2*qIr_1;
-    real qkrxqir_1 = qJr_2*qIr_0 - qJr_0*qIr_2;
-    real qkrxqir_2 = qJr_0*qIr_1 - qJr_1*qIr_0;
-#endif
-#if defined T1 || defined T3
-    real qixqk_0 = atom1.quadrupoleXY*atom2.quadrupoleXZ + atom1.quadrupoleYY*atom2.quadrupoleYZ + atom1.quadrupoleYZ*atom2quadrupoleZZ -
-                   atom1.quadrupoleXZ*atom2.quadrupoleXY - atom1.quadrupoleYZ*atom2.quadrupoleYY - atom1quadrupoleZZ*atom2.quadrupoleYZ;
-    real qixqk_1 = atom1.quadrupoleXZ*atom2.quadrupoleXX + atom1.quadrupoleYZ*atom2.quadrupoleXY + atom1quadrupoleZZ*atom2.quadrupoleXZ -
-                   atom1.quadrupoleXX*atom2.quadrupoleXZ - atom1.quadrupoleXY*atom2.quadrupoleYZ - atom1.quadrupoleXZ*atom2quadrupoleZZ;
-    real qixqk_2 = atom1.quadrupoleXX*atom2.quadrupoleXY + atom1.quadrupoleXY*atom2.quadrupoleYY + atom1.quadrupoleXZ*atom2.quadrupoleYZ -
-                   atom1.quadrupoleXY*atom2.quadrupoleXX - atom1.quadrupoleYY*atom2.quadrupoleXY - atom1.quadrupoleYZ*atom2.quadrupoleXZ;
-#endif
-#ifdef T1
-    real ttm2_0 = -rr3*dixdk_0 + gf2*dixr_0-gf5*rxqir_0 + 2*rr5*(dixqkr_0 + dkxqir_0 + rxqidk_0-2*qixqk_0) - 4*rr7*(rxqikr_0 + qkrxqir_0);
-    real ttm2_1 = -rr3*dixdk_1 + gf2*dixr_1-gf5*rxqir_1 + 2*rr5*(dixqkr_1 + dkxqir_1 + rxqidk_1-2*qixqk_1) - 4*rr7*(rxqikr_1 + qkrxqir_1);
-    real ttm2_2 = -rr3*dixdk_2 + gf2*dixr_2-gf5*rxqir_2 + 2*rr5*(dixqkr_2 + dkxqir_2 + rxqidk_2-2*qixqk_2) - 4*rr7*(rxqikr_2 + qkrxqir_2);
-    real ttm2i_0 = -(dixuk_0*psc3+dixukp_0*dsc3)*0.5f + gti2*dixr_0 + ((ukxqir_0+ rxqiuk_0)*psc5 + (ukxqirp_0 + rxqiukp_0)*dsc5) - gti5*rxqir_0;
-    real ttm2i_1 = -(dixuk_1*psc3+dixukp_1*dsc3)*0.5f + gti2*dixr_1 + ((ukxqir_1+ rxqiuk_1)*psc5 + (ukxqirp_1 + rxqiukp_1)*dsc5) - gti5*rxqir_1;
-    real ttm2i_2 = -(dixuk_2*psc3+dixukp_2*dsc3)*0.5f + gti2*dixr_2 + ((ukxqir_2+ rxqiuk_2)*psc5 + (ukxqirp_2 + rxqiukp_2)*dsc5) - gti5*rxqir_2;
-#endif
-#ifdef T3
-    real qJqIr_0 = atom2.quadrupoleXX*qIr_0 + atom2.quadrupoleXY*qIr_1 + atom2.quadrupoleXZ*qIr_2;
-    real qJqIr_1 = atom2.quadrupoleXY*qIr_0 + atom2.quadrupoleYY*qIr_1 + atom2.quadrupoleYZ*qIr_2;
-    real qJqIr_2 = atom2.quadrupoleXZ*qIr_0 + atom2.quadrupoleYZ*qIr_1 + atom2quadrupoleZZ*qIr_2;
-    real qJdI_0 = atom2.quadrupoleXX*atom1.dipole.x + atom2.quadrupoleXY*atom1.dipole.y + atom2.quadrupoleXZ*atom1.dipole.z;
-    real qJdI_1 = atom2.quadrupoleXY*atom1.dipole.x + atom2.quadrupoleYY*atom1.dipole.y + atom2.quadrupoleYZ*atom1.dipole.z;
-    real qJdI_2 = atom2.quadrupoleXZ*atom1.dipole.x + atom2.quadrupoleYZ*atom1.dipole.y + atom2quadrupoleZZ*atom1.dipole.z;
-    real dkxr_0 = atom2.dipole.y*zr - atom2.dipole.z*yr;
-    real dkxr_1 = atom2.dipole.z*xr - atom2.dipole.x*zr;
-    real dkxr_2 = atom2.dipole.x*yr - atom2.dipole.y*xr;
-    real rxqkr_0 = yr*qJr_2 - zr*qJr_1;
-    real rxqkr_1 = zr*qJr_0 - xr*qJr_2;
-    real rxqkr_2 = xr*qJr_1 - yr*qJr_0;
-    real dixqkr_0 = atom1.dipole.y*qJr_2 - atom1.dipole.z*qJr_1;
-    real dixqkr_1 = atom1.dipole.z*qJr_0 - atom1.dipole.x*qJr_2;
-    real dixqkr_2 = atom1.dipole.x*qJr_1 - atom1.dipole.y*qJr_0;
-    real dkxqir_0 = atom2.dipole.y*qIr_2 - atom2.dipole.z*qIr_1;
-    real dkxqir_1 = atom2.dipole.z*qIr_0 - atom2.dipole.x*qIr_2;
-    real dkxqir_2 = atom2.dipole.x*qIr_1 - atom2.dipole.y*qIr_0;
-    real rxqkdi_0 = yr*qJdI_2 - zr*qJdI_1;
-    real rxqkdi_1 = zr*qJdI_0 - xr*qJdI_2;
-    real rxqkdi_2 = xr*qJdI_1 - yr*qJdI_0;
-    real rxqkir_0 = yr*qJqIr_2 - zr*qJqIr_1;
-    real rxqkir_1 = zr*qJqIr_0 - xr*qJqIr_2;
-    real rxqkir_2 = xr*qJqIr_1 - yr*qJqIr_0;
-    real qkrxqir_0 = qJr_1*qIr_2 - qJr_2*qIr_1;
-    real qkrxqir_1 = qJr_2*qIr_0 - qJr_0*qIr_2;
-    real qkrxqir_2 = qJr_0*qIr_1 - qJr_1*qIr_0;
-    real dkxui_0 = atom2.dipole.y*atom1.inducedDipole.z - atom2.dipole.z*atom1.inducedDipole.y;
-    real dkxui_1 = atom2.dipole.z*atom1.inducedDipole.x - atom2.dipole.x*atom1.inducedDipole.z; 
-    real dkxui_2 = atom2.dipole.x*atom1.inducedDipole.y - atom2.dipole.y*atom1.inducedDipole.x;
-    real dkxuip_0 = atom2.dipole.y*atom1.inducedDipolePolar.z - atom2.dipole.z*atom1.inducedDipolePolar.y;
-    real dkxuip_1 = atom2.dipole.z*atom1.inducedDipolePolar.x - atom2.dipole.x*atom1.inducedDipolePolar.z;
-    real dkxuip_2 = atom2.dipole.x*atom1.inducedDipolePolar.y - atom2.dipole.y*atom1.inducedDipolePolar.x;
-    real uixqkrp_0 = atom1.inducedDipolePolar.y*qJr_2 - atom1.inducedDipolePolar.z*qJr_1;
-    real uixqkrp_1 = atom1.inducedDipolePolar.z*qJr_0 - atom1.inducedDipolePolar.x*qJr_2;
-    real uixqkrp_2 = atom1.inducedDipolePolar.x*qJr_1 - atom1.inducedDipolePolar.y*qJr_0;
-    real uixqkr_0 = atom1.inducedDipole.y*qJr_2 - atom1.inducedDipole.z*qJr_1;
-    real uixqkr_1 = atom1.inducedDipole.z*qJr_0 - atom1.inducedDipole.x*qJr_2;
-    real uixqkr_2 = atom1.inducedDipole.x*qJr_1 - atom1.inducedDipole.y*qJr_0;
-    real rxqkuip_0 = yr*qJuIp_2 - zr*qJuIp_1;
-    real rxqkuip_1 = zr*qJuIp_0 - xr*qJuIp_2;
-    real rxqkuip_2 = xr*qJuIp_1 - yr*qJuIp_0;
-    real rxqkui_0 = yr*qJuI_2 - zr*qJuI_1;
-    real rxqkui_1 = zr*qJuI_0 - xr*qJuI_2;
-    real rxqkui_2 = xr*qJuI_1 - yr*qJuI_0;
-    real ttm3_0 =  rr3*dixdk_0 + gf3*dkxr_0 - gf6*rxqkr_0 - 2*rr5*(dixqkr_0 + dkxqir_0 + rxqkdi_0 - 2*qixqk_0) - 4*rr7*(rxqkir_0 - qkrxqir_0);
-    real ttm3_1 =  rr3*dixdk_1 + gf3*dkxr_1 - gf6*rxqkr_1 - 2*rr5*(dixqkr_1 + dkxqir_1 + rxqkdi_1 - 2*qixqk_1) - 4*rr7*(rxqkir_1 - qkrxqir_1);
-    real ttm3_2 =  rr3*dixdk_2 + gf3*dkxr_2 - gf6*rxqkr_2 - 2*rr5*(dixqkr_2 + dkxqir_2 + rxqkdi_2 - 2*qixqk_2) - 4*rr7*(rxqkir_2 - qkrxqir_2);
-    real ttm3i_0 = -(dkxui_0*psc3+ dkxuip_0*dsc3)*0.5f + gti3*dkxr_0 - ((uixqkr_0 + rxqkui_0)*psc5 + (uixqkrp_0 + rxqkuip_0)*dsc5) - gti6*rxqkr_0;
-    real ttm3i_1 = -(dkxui_1*psc3+ dkxuip_1*dsc3)*0.5f + gti3*dkxr_1 - ((uixqkr_1 + rxqkui_1)*psc5 + (uixqkrp_1 + rxqkuip_1)*dsc5) - gti6*rxqkr_1;
-    real ttm3i_2 = -(dkxui_2*psc3+ dkxuip_2*dsc3)*0.5f + gti3*dkxr_2 - ((uixqkr_2 + rxqkui_2)*psc5 + (uixqkrp_2 + rxqkuip_2)*dsc5) - gti6*rxqkr_2;
-#endif
-    if (mScale < 1) { // only ttm2/ttm3 are scaled by mScale; the ttm2i/ttm3i terms are left unscaled
-#ifdef T1
-        ttm2_0 *= mScale;
-        ttm2_1 *= mScale;
-        ttm2_2 *= mScale;
-#endif
-#ifdef T3
-        ttm3_0 *= mScale;
-        ttm3_1 *= mScale;
-        ttm3_2 *= mScale;
-#endif
-    }
-#ifdef F1
-    outputForce.x = -(ftm2_0+ftm2i_0);
-    outputForce.y = -(ftm2_1+ftm2i_1);
-    outputForce.z = -(ftm2_2+ftm2i_2);
-#endif
-#ifdef T1
-    outputForce.x = (ttm2_0 + ttm2i_0);
-    outputForce.y = (ttm2_1 + ttm2i_1);
-    outputForce.z = (ttm2_2 + ttm2i_2);
-#endif
-#ifdef T3
-    outputForce.x = (ttm3_0 + ttm3i_0);
-    outputForce.y = (ttm3_1 + ttm3i_1);
-    outputForce.z = (ttm3_2 + ttm3i_2);
-#endif
-}
--- a/plugins/amoeba/platforms/cuda/src/kernels/electrostaticPairForceNoQuadrupoles.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/electrostaticPairForceNoQuadrupoles.cu
-/**
- * This defines three different closely related functions, depending on which constant (F1, T1, or T3) is defined.
- */
-#if defined F1
-__device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real& energy, real3& outputForce) {
-#elif defined T1
-__device__ void computeOneInteractionT1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce) {
-#else
-__device__ void computeOneInteractionT3(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce) {
-#endif
-#ifdef F1
-    const float uScale = 1;
-    real ddsc3_0 = 0;
-    real ddsc3_1 = 0;
-    real ddsc3_2 = 0;
-    real ddsc5_0 = 0;
-    real ddsc5_1 = 0;
-    real ddsc5_2 = 0;
-    real ddsc7_0 = 0;
-    real ddsc7_1 = 0;
-    real ddsc7_2 = 0;
-#endif
-    real xr = atom2.posq.x - atom1.posq.x;
-    real yr = atom2.posq.y - atom1.posq.y;
-    real zr = atom2.posq.z - atom1.posq.z;
-    real r2 = xr*xr + yr*yr + zr*zr;
-    real r = SQRT(r2);
-    real rr1 = RECIP(r);
-    real rr2 = rr1*rr1;
-    real rr3 = rr1*rr2;
-    real rr5 = 3*rr3*rr2;
-    real rr7 = 5*rr5*rr2;
-    real rr9 = 7*rr7*rr2;
-#ifdef F1
-    real rr11 = 9*rr9*rr2;
-#endif
-    real scale3 = 1;
-    real scale5 = 1;
-    real scale7 = 1;
-    real pdamp = atom1.damp*atom2.damp;
-    if (pdamp != 0) {
-        real ratio = r/pdamp;
-        float pGamma = atom2.thole > atom1.thole ? atom1.thole : atom2.thole;
-        real damp = ratio*ratio*ratio*pGamma;
-        real dampExp = EXP(-damp);
-        real damp1 = damp + 1;
-        real damp2 = damp*damp;
-        scale3 = 1 - dampExp;
-        scale5 = 1 - damp1*dampExp;
-        scale7 = 1 - (damp1 + 0.6f*damp2)*dampExp;
-#ifdef F1
-        real factor = 3*damp*dampExp*rr2;
-        real factor7 = -0.2f + 0.6f*damp;
-        ddsc3_0 = factor*xr;
-        ddsc5_0 = ddsc3_0*damp;
-        ddsc7_0 = ddsc5_0*factor7;
-        ddsc3_1 = factor*yr;
-        ddsc5_1 = ddsc3_1*damp;
-        ddsc7_1 = ddsc5_1*factor7;
-        ddsc3_2 = factor*zr;
-        ddsc5_2 = ddsc3_2*damp;
-        ddsc7_2 = ddsc5_2*factor7;
-#endif
-    }
-#if defined F1
-    real scale3i = rr3*scale3*uScale;
-    real scale5i = rr5*scale5*uScale;
-#endif
-    real dsc3 = rr3*scale3*dScale;
-    real psc3 = rr3*scale3*pScale;
-    real dsc5 = rr5*scale5*dScale;
-    real psc5 = rr5*scale5*pScale;
-    real dsc7 = rr7*scale7*dScale;
-    real psc7 = rr7*scale7*pScale;
-#if defined F1
-    real sc2 = atom1.dipole.x*atom2.dipole.x + atom1.dipole.y*atom2.dipole.y + atom1.dipole.z*atom2.dipole.z;
-#endif
-#if defined F1 || defined T1
-    real sc4 = atom2.dipole.x*xr + atom2.dipole.y*yr + atom2.dipole.z*zr;
-#endif
-#if defined F1 || defined T3
-    real sc3 = atom1.dipole.x*xr + atom1.dipole.y*yr + atom1.dipole.z*zr;
-#endif
-#if defined F1
-    real sci1 = atom1.inducedDipole.x*atom2.dipole.x + atom1.inducedDipole.y*atom2.dipole.y + atom1.inducedDipole.z*atom2.dipole.z +
-                atom2.inducedDipole.x*atom1.dipole.x + atom2.inducedDipole.y*atom1.dipole.y + atom2.inducedDipole.z*atom1.dipole.z;
-#endif
-#if defined F1 || defined T3
-    real sci3 = atom1.inducedDipole.x*xr + atom1.inducedDipole.y*yr + atom1.inducedDipole.z*zr;
-#endif
-#if defined F1 || defined T1
-    real sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
-#endif
-#if defined F1
-    real scip1 = atom1.inducedDipolePolar.x*atom2.dipole.x + atom1.inducedDipolePolar.y*atom2.dipole.y + atom1.inducedDipolePolar.z*atom2.dipole.z +
-                 atom2.inducedDipolePolar.x*atom1.dipole.x + atom2.inducedDipolePolar.y*atom1.dipole.y + atom2.inducedDipolePolar.z*atom1.dipole.z;
-    real scip2 = atom1.inducedDipole.x*atom2.inducedDipolePolar.x + atom1.inducedDipole.y*atom2.inducedDipolePolar.y + atom1.inducedDipole.z*atom2.inducedDipolePolar.z +
-                 atom2.inducedDipole.x*atom1.inducedDipolePolar.x + atom2.inducedDipole.y*atom1.inducedDipolePolar.y + atom2.inducedDipole.z*atom1.inducedDipolePolar.z;
-#endif
-#if defined F1 || defined T3
-    real scip3 = ((atom1.inducedDipolePolar.x)*(xr) + (atom1.inducedDipolePolar.y)*(yr) + (atom1.inducedDipolePolar.z)*(zr));
-#endif
-#if defined F1 || defined T1
-    real scip4 = ((atom2.inducedDipolePolar.x)*(xr) + (atom2.inducedDipolePolar.y)*(yr) + (atom2.inducedDipolePolar.z)*(zr));
-#endif
-#ifdef F1
-    real gli1 = atom2.posq.w*sci3 - atom1.posq.w*sci4;
-    real gli6 = sci1;
-    real glip1 = atom2.posq.w*scip3 - atom1.posq.w*scip4;
-    real glip6 = scip1;
-    real gli2 = -sc3*sci4 - sci3*sc4;
-    real glip2 = -sc3*scip4 - scip3*sc4;
-    real factor3 = rr3*((gli1  +  gli6)*pScale + (glip1  + glip6)*dScale);
-    real factor5 = rr5*(gli2*pScale + glip2*dScale);
-    real ftm2i_0 = -0.5f*(factor3*ddsc3_0 + factor5*ddsc5_0);
-    real ftm2i_1 = -0.5f*(factor3*ddsc3_1 + factor5*ddsc5_1);
-    real ftm2i_2 = -0.5f*(factor3*ddsc3_2 + factor5*ddsc5_2);
-    real gl0 = atom1.posq.w*atom2.posq.w;
-    real gl1 = atom2.posq.w*sc3 - atom1.posq.w*sc4;
-    real gl2 = -sc3*sc4;
-    real gl6 = sc2;
-    real gf1 = rr3*gl0 + rr5*(gl1+gl6) + rr7*gl2;
-#endif
-#if defined F1 || defined T1
-    real gf2 = -atom2.posq.w*rr3 + sc4*rr5;
-    real gf5 = 2*(-atom2.posq.w*rr5+sc4*rr7);
-#endif
-#if defined F1 || defined T3
-    real gf3 =  atom1.posq.w*rr3 + sc3*rr5;
-    real gf6 = 2*(-atom1.posq.w*rr5-sc3*rr7);
-#endif
-#ifdef F1
-    real em = mScale*(rr1*gl0 + rr3*(gl1+gl6) + rr5*gl2);
-    real ei = 0.5f*((gli1+gli6)*psc3 + gli2*psc5);
-    energy = em+ei;
-#endif
-#ifdef F1
-    real ftm2_0 = mScale*(gf1*xr + gf2*atom1.dipole.x + gf3*atom2.dipole.x);
-    real ftm2_1 = mScale*(gf1*yr + gf2*atom1.dipole.y + gf3*atom2.dipole.y);
-    real ftm2_2 = mScale*(gf1*zr + gf2*atom1.dipole.z + gf3*atom2.dipole.z);
-    real gfi1 = rr2*(1.5f*((gli1+gli6)*psc3 + (glip1+glip6)*dsc3 + scip2*scale3i) + 2.5f*(gli2*psc5 + glip2*dsc5 - (sci3*scip4+scip3*sci4)*scale5i));
-    ftm2i_0 += gfi1*xr;
-    ftm2i_1 += gfi1*yr;
-    ftm2i_2 += gfi1*zr;
-#endif
-#if defined F1 || defined T1
-    real gfi5 = (sci4*psc7 + scip4*dsc7);
-#endif
-#if defined F1 || defined T3
-    real gfi6 = -(sci3*psc7 + scip3*dsc7);
-#endif
-#ifdef F1
-    ftm2i_0 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.x*psc3 + atom1.inducedDipolePolar.x*dsc3) +
-               sc4*(atom1.inducedDipole.x*psc5 + atom1.inducedDipolePolar.x*dsc5)) +
-               0.5f*(atom1.posq.w*(atom2.inducedDipole.x*psc3+atom2.inducedDipolePolar.x*dsc3) +
-               sc3*(atom2.inducedDipole.x*psc5 +atom2.inducedDipolePolar.x*dsc5)) +
-               scale5i*(sci4*atom1.inducedDipolePolar.x+scip4*atom1.inducedDipole.x +
-                        sci3*atom2.inducedDipolePolar.x+scip3*atom2.inducedDipole.x)*0.5f +
-               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.x +
-               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.x;
-    ftm2i_1 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.y*psc3 + atom1.inducedDipolePolar.y*dsc3) +
-               sc4*(atom1.inducedDipole.y*psc5 + atom1.inducedDipolePolar.y*dsc5)) +
-               (atom1.posq.w*(atom2.inducedDipole.y*psc3+atom2.inducedDipolePolar.y*dsc3) +
-                    sc3*(atom2.inducedDipole.y*psc5+atom2.inducedDipolePolar.y*dsc5))*0.5f +
-                    scale5i*(sci4*atom1.inducedDipolePolar.y+scip4*atom1.inducedDipole.y + sci3*atom2.inducedDipolePolar.y+scip3*atom2.inducedDipole.y)*0.5f +
-               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.y +
-               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.y;
-    ftm2i_2 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.z*psc3 + atom1.inducedDipolePolar.z*dsc3) +
-               sc4*(atom1.inducedDipole.z*psc5 + atom1.inducedDipolePolar.z*dsc5)) +
-               (atom1.posq.w*(atom2.inducedDipole.z*psc3+atom2.inducedDipolePolar.z*dsc3) +
-                    sc3*(atom2.inducedDipole.z*psc5+atom2.inducedDipolePolar.z*dsc5))*0.5f +
-                    scale5i*(sci4*atom1.inducedDipolePolar.z+scip4*atom1.inducedDipole.z +
-                    sci3*atom2.inducedDipolePolar.z+scip3*atom2.inducedDipole.z)*0.5f +
-               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.z +
-               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.z;
-#ifdef DIRECT_POLARIZATION
-    real gfd = 0.5*(3*rr2*scip2*scale3i - 5*rr2*(scip3*sci4+sci3*scip4)*scale5i);
-    real temp5 = 0.5*scale5i;
-    real fdir_0 = gfd*xr + temp5*(sci4*atom1.inducedDipolePolar.x + scip4*atom1.inducedDipole.x + sci3*atom2.inducedDipolePolar.x + scip3*atom2.inducedDipole.x);
-    real fdir_1 = gfd*yr + temp5*(sci4*atom1.inducedDipolePolar.y + scip4*atom1.inducedDipole.y + sci3*atom2.inducedDipolePolar.y + scip3*atom2.inducedDipole.y);
-    real fdir_2 = gfd*zr + temp5*(sci4*atom1.inducedDipolePolar.z + scip4*atom1.inducedDipole.z + sci3*atom2.inducedDipolePolar.z + scip3*atom2.inducedDipole.z);
-    ftm2i_0 -= fdir_0;
-    ftm2i_1 -= fdir_1;
-    ftm2i_2 -= fdir_2;
-#else
-    real scaleF = 0.5f*uScale;
-    real inducedFactor3 = scip2*rr3*scaleF;
-    real inducedFactor5 = (sci3*scip4+scip3*sci4)*rr5*scaleF;
-    real findmp_0 = inducedFactor3*ddsc3_0 - inducedFactor5*ddsc5_0;
-    real findmp_1 = inducedFactor3*ddsc3_1 - inducedFactor5*ddsc5_1;
-    real findmp_2 = inducedFactor3*ddsc3_2 - inducedFactor5*ddsc5_2;
-    ftm2i_0 -= findmp_0;
-    ftm2i_1 -= findmp_1;
-    ftm2i_2 -= findmp_2;
-#endif
-#endif
-#if defined T1
-    real gti2 = 0.5f*(sci4*psc5+scip4*dsc5);
-    real gti5 = gfi5;
-#endif
-#if defined T3
-    real gti3 = 0.5f*(sci3*psc5+scip3*dsc5);
-    real gti6 = gfi6;
-#endif
-#if defined T1 || defined T3
-    real dixdk_0 = atom1.dipole.y*atom2.dipole.z - atom1.dipole.z*atom2.dipole.y;
-    real dixdk_1 = atom1.dipole.z*atom2.dipole.x - atom1.dipole.x*atom2.dipole.z;
-    real dixdk_2 = atom1.dipole.x*atom2.dipole.y - atom1.dipole.y*atom2.dipole.x;
-#if defined T1
-    real dixuk_0 = atom1.dipole.y*atom2.inducedDipole.z - atom1.dipole.z*atom2.inducedDipole.y;
-    real dixuk_1 = atom1.dipole.z*atom2.inducedDipole.x - atom1.dipole.x*atom2.inducedDipole.z;
-    real dixuk_2 = atom1.dipole.x*atom2.inducedDipole.y - atom1.dipole.y*atom2.inducedDipole.x;
-#endif
-#endif
-#ifdef T1
-    real dixukp_0 = atom1.dipole.y*atom2.inducedDipolePolar.z - atom1.dipole.z*atom2.inducedDipolePolar.y;
-    real dixukp_1 = atom1.dipole.z*atom2.inducedDipolePolar.x - atom1.dipole.x*atom2.inducedDipolePolar.z;
-    real dixukp_2 = atom1.dipole.x*atom2.inducedDipolePolar.y - atom1.dipole.y*atom2.inducedDipolePolar.x;
-#endif
-#ifdef T1
-    real dixr_0 = atom1.dipole.y*zr - atom1.dipole.z*yr;
-    real dixr_1 = atom1.dipole.z*xr - atom1.dipole.x*zr;
-    real dixr_2 = atom1.dipole.x*yr - atom1.dipole.y*xr;
-#endif
-#ifdef T1
-    real ttm2_0 = -rr3*dixdk_0 + gf2*dixr_0;
-    real ttm2_1 = -rr3*dixdk_1 + gf2*dixr_1;
-    real ttm2_2 = -rr3*dixdk_2 + gf2*dixr_2;
-    real ttm2i_0 = -(dixuk_0*psc3+dixukp_0*dsc3)*0.5f + gti2*dixr_0;
-    real ttm2i_1 = -(dixuk_1*psc3+dixukp_1*dsc3)*0.5f + gti2*dixr_1;
-    real ttm2i_2 = -(dixuk_2*psc3+dixukp_2*dsc3)*0.5f + gti2*dixr_2;
-#endif
-#ifdef T3
-    real dkxr_0 = atom2.dipole.y*zr - atom2.dipole.z*yr;
-    real dkxr_1 = atom2.dipole.z*xr - atom2.dipole.x*zr;
-    real dkxr_2 = atom2.dipole.x*yr - atom2.dipole.y*xr;
-    real dkxui_0 = atom2.dipole.y*atom1.inducedDipole.z - atom2.dipole.z*atom1.inducedDipole.y;
-    real dkxui_1 = atom2.dipole.z*atom1.inducedDipole.x - atom2.dipole.x*atom1.inducedDipole.z; 
-    real dkxui_2 = atom2.dipole.x*atom1.inducedDipole.y - atom2.dipole.y*atom1.inducedDipole.x;
-    real dkxuip_0 = atom2.dipole.y*atom1.inducedDipolePolar.z - atom2.dipole.z*atom1.inducedDipolePolar.y;
-    real dkxuip_1 = atom2.dipole.z*atom1.inducedDipolePolar.x - atom2.dipole.x*atom1.inducedDipolePolar.z;
-    real dkxuip_2 = atom2.dipole.x*atom1.inducedDipolePolar.y - atom2.dipole.y*atom1.inducedDipolePolar.x;
-    real ttm3_0 =  rr3*dixdk_0 + gf3*dkxr_0;
-    real ttm3_1 =  rr3*dixdk_1 + gf3*dkxr_1;
-    real ttm3_2 =  rr3*dixdk_2 + gf3*dkxr_2;
-    real ttm3i_0 = -(dkxui_0*psc3+ dkxuip_0*dsc3)*0.5f + gti3*dkxr_0;
-    real ttm3i_1 = -(dkxui_1*psc3+ dkxuip_1*dsc3)*0.5f + gti3*dkxr_1;
-    real ttm3i_2 = -(dkxui_2*psc3+ dkxuip_2*dsc3)*0.5f + gti3*dkxr_2;
-#endif
-    if (mScale < 1) {
-#ifdef T1
-        ttm2_0 *= mScale;
-        ttm2_1 *= mScale;
-        ttm2_2 *= mScale;
-#endif
-#ifdef T3
-        ttm3_0 *= mScale;
-        ttm3_1 *= mScale;
-        ttm3_2 *= mScale;
-#endif
-    }
-#ifdef F1
-    outputForce.x = -(ftm2_0+ftm2i_0);
-    outputForce.y = -(ftm2_1+ftm2i_1);
-    outputForce.z = -(ftm2_2+ftm2i_2);
-#endif
-#ifdef T1
-    outputForce.x = (ttm2_0 + ttm2i_0);
-    outputForce.y = (ttm2_1 + ttm2i_1);
-    outputForce.z = (ttm2_2 + ttm2i_2);
-#endif
-#ifdef T3
-    outputForce.x = (ttm3_0 + ttm3i_0);
-    outputForce.y = (ttm3_1 + ttm3i_1);
-    outputForce.z = (ttm3_2 + ttm3i_2);
-#endif
-}
--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
 #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
 typedef struct {
-    real4 posq;
+    real3 pos, force, torque, inducedDipole, inducedDipolePolar, sphericalDipole;
-    real3 force, dipole, inducedDipole, inducedDipolePolar;
+    real q;
+    float thole, damp;
 #ifdef INCLUDE_QUADRUPOLES
-    real quadrupoleXX, quadrupoleXY, quadrupoleXZ;
+    real sphericalQuadrupole[5];
-    real quadrupoleYY, quadrupoleYZ;
 #endif
-    float thole, damp;
 } AtomData;
-__device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real& energy, real3& outputForce);
+inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ sphericalDipole,
-__device__ void computeOneInteractionT1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce);
+        const real* __restrict__ sphericalQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
-__device__ void computeOneInteractionT3(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce);
+    real4 atomPosq = posq[atom];
+    data.pos = make_real3(atomPosq.x, atomPosq.y, atomPosq.z);
-inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ labFrameDipole,
+    data.q = atomPosq.w;
-        const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
+    data.sphericalDipole.x = sphericalDipole[atom*3];
-    data.posq = posq[atom];
+    data.sphericalDipole.y = sphericalDipole[atom*3+1];
-    data.dipole.x = labFrameDipole[atom*3];
+    data.sphericalDipole.z = sphericalDipole[atom*3+2];
-    data.dipole.y = labFrameDipole[atom*3+1];
-    data.dipole.z = labFrameDipole[atom*3+2];
 #ifdef INCLUDE_QUADRUPOLES
-    data.quadrupoleXX = labFrameQuadrupole[atom*5];
+    data.sphericalQuadrupole[0] = sphericalQuadrupole[atom*5];
-    data.quadrupoleXY = labFrameQuadrupole[atom*5+1];
+    data.sphericalQuadrupole[1] = sphericalQuadrupole[atom*5+1];
-    data.quadrupoleXZ = labFrameQuadrupole[atom*5+2];
+    data.sphericalQuadrupole[2] = sphericalQuadrupole[atom*5+2];
-    data.quadrupoleYY = labFrameQuadrupole[atom*5+3];
+    data.sphericalQuadrupole[3] = sphericalQuadrupole[atom*5+3];
-    data.quadrupoleYZ = labFrameQuadrupole[atom*5+4];
+    data.sphericalQuadrupole[4] = sphericalQuadrupole[atom*5+4];
 #endif
    data.inducedDipole.x = inducedDipole[atom*3];
    data.inducedDipole.y = inducedDipole[atom*3+1];
@@ -57,6 +54,322 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
    return (x && y ? 0.0f : (x && p ? 0.5f : 1.0f));
 }
+/**
+ * Compute the electrostatic interaction between one pair of atoms and accumulate the result.
+ *
+ * The permanent dipoles/quadrupoles and both sets of induced dipoles are first rotated into the
+ * frame produced by buildQIRotationMatrix() from the interatomic displacement (the "quasi-internal"
+ * frame mentioned in the comments below).  In that frame the Thole-damped Coulomb operator and its
+ * radial derivative are contracted with the moments, giving an energy, a force and two torques.
+ * The force and torques are then mapped back through qiRotationMatrix before being accumulated.
+ *
+ * @param atom1          first atom; its force and torque members are always incremented
+ * @param atom2          second atom; its force and torque members are updated only if forceFactor == 1
+ * @param hasExclusions  currently not referenced by this function
+ * @param dScale         scale factor applied to terms involving inducedDipolePolar
+ * @param pScale         scale factor applied to terms involving inducedDipole
+ * @param mScale         scale factor applied to permanent-moment terms
+ * @param forceFactor    multiplies the energy contribution; exactly 1 also enables the atom2 update
+ * @param energy         accumulator that the interaction energy is added to
+ */
+__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool hasExclusions, float dScale, float pScale, float mScale, float forceFactor, real& energy) {
+    // Compute the displacement.
+    real3 delta;
+    delta.x = atom2.pos.x - atom1.pos.x;
+    delta.y = atom2.pos.y - atom1.pos.y;
+    delta.z = atom2.pos.z - atom1.pos.z;
+    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+    real rInv = RSQRT(r2);
+    // r = r2/sqrt(r2); reuses rInv rather than taking a second square root.
+    real r = r2*rInv;
+    // Rotate the various dipoles and quadrupoles.  The induced dipoles are rotated inline: the matrix
+    // columns are paired with the (x, y, z) components in (1, 2, 0) order, and a factor of 0.5 is
+    // folded into the rotated values.
+    real qiRotationMatrix[3][3];
+    buildQIRotationMatrix(delta, rInv, qiRotationMatrix);
+    real3 qiUindI = 0.5f*make_real3(qiRotationMatrix[0][1]*atom1.inducedDipole.x + qiRotationMatrix[0][2]*atom1.inducedDipole.y + qiRotationMatrix[0][0]*atom1.inducedDipole.z,
+                                    qiRotationMatrix[1][1]*atom1.inducedDipole.x + qiRotationMatrix[1][2]*atom1.inducedDipole.y + qiRotationMatrix[1][0]*atom1.inducedDipole.z,
+                                    qiRotationMatrix[2][1]*atom1.inducedDipole.x + qiRotationMatrix[2][2]*atom1.inducedDipole.y + qiRotationMatrix[2][0]*atom1.inducedDipole.z);
+    real3 qiUindJ = 0.5f*make_real3(qiRotationMatrix[0][1]*atom2.inducedDipole.x + qiRotationMatrix[0][2]*atom2.inducedDipole.y + qiRotationMatrix[0][0]*atom2.inducedDipole.z,
+                                    qiRotationMatrix[1][1]*atom2.inducedDipole.x + qiRotationMatrix[1][2]*atom2.inducedDipole.y + qiRotationMatrix[1][0]*atom2.inducedDipole.z,
+                                    qiRotationMatrix[2][1]*atom2.inducedDipole.x + qiRotationMatrix[2][2]*atom2.inducedDipole.y + qiRotationMatrix[2][0]*atom2.inducedDipole.z);
+    real3 qiUinpI = 0.5f*make_real3(qiRotationMatrix[0][1]*atom1.inducedDipolePolar.x + qiRotationMatrix[0][2]*atom1.inducedDipolePolar.y + qiRotationMatrix[0][0]*atom1.inducedDipolePolar.z,
+                                    qiRotationMatrix[1][1]*atom1.inducedDipolePolar.x + qiRotationMatrix[1][2]*atom1.inducedDipolePolar.y + qiRotationMatrix[1][0]*atom1.inducedDipolePolar.z,
+                                    qiRotationMatrix[2][1]*atom1.inducedDipolePolar.x + qiRotationMatrix[2][2]*atom1.inducedDipolePolar.y + qiRotationMatrix[2][0]*atom1.inducedDipolePolar.z);
+    real3 qiUinpJ = 0.5f*make_real3(qiRotationMatrix[0][1]*atom2.inducedDipolePolar.x + qiRotationMatrix[0][2]*atom2.inducedDipolePolar.y + qiRotationMatrix[0][0]*atom2.inducedDipolePolar.z,
+                                    qiRotationMatrix[1][1]*atom2.inducedDipolePolar.x + qiRotationMatrix[1][2]*atom2.inducedDipolePolar.y + qiRotationMatrix[1][0]*atom2.inducedDipolePolar.z,
+                                    qiRotationMatrix[2][1]*atom2.inducedDipolePolar.x + qiRotationMatrix[2][2]*atom2.inducedDipolePolar.y + qiRotationMatrix[2][0]*atom2.inducedDipolePolar.z);
+    real3 rotatedDipole1 = rotateDipole(atom1.sphericalDipole, qiRotationMatrix);
+    real3 rotatedDipole2 = rotateDipole(atom2.sphericalDipole, qiRotationMatrix);
+    // The rotated quadrupoles stay zero unless INCLUDE_QUADRUPOLES is defined, so every quadrupole
+    // term below then contributes nothing.
+    real rotatedQuadrupole1[] = {0, 0, 0, 0, 0};
+    real rotatedQuadrupole2[] = {0, 0, 0, 0, 0};
+#ifdef INCLUDE_QUADRUPOLES
+    rotateQuadupoles(qiRotationMatrix, atom1.sphericalQuadrupole, atom2.sphericalQuadrupole, rotatedQuadrupole1, rotatedQuadrupole2);
+#endif
+    // The field derivatives at I due to permanent and induced moments on J, and vice-versa.
+    // Also, their derivatives w.r.t. R, which are needed for force calculations.
+    // Index 0 pairs with the charge, 1-3 with the rotated dipole (x, y, z) and 4-8 with the
+    // five rotated quadrupole components (see the energy expression further down).
+    real Vij[9], Vji[9], VjiR[9], VijR[9];
+    // The field derivatives at I due to only permanent moments on J, and vice-versa.
+    // The "p" arrays carry dScale and are contracted with the inducedDipolePolar-derived vectors;
+    // the "d" arrays carry pScale and are contracted with the inducedDipole-derived vectors.
+    real Vijp[3], Vijd[3], Vjip[3], Vjid[3];
+    real rInvVec[7];
+    // The rInvVec array is defined such that the ith element is R^-i (element 0 is never written
+    // or read).  NOTE(review): no dielectric/unit prefactor is applied to these powers here;
+    // presumably the result is scaled elsewhere - confirm.
+    rInvVec[1] = rInv;
+    for (int i = 2; i < 7; ++i)
+        rInvVec[i] = rInvVec[i-1] * rInv;
+    // Thole damping.  If the damping product is (nearly) zero, u is set to a huge value so that,
+    // for a > 0, the exponential is treated as zero and every damping factor below equals 1.
+    real dmp = atom1.damp*atom2.damp;
+    real a = min(atom1.thole, atom2.thole);
+    real u = fabs(dmp) > 1.0e-5f ? r/dmp : 1e10f;
+    real au3 = a*u*u*u;
+    // The exponential is taken to be exactly zero once au3 >= 50.  In that regime the higher powers
+    // of au3 are forced to zero too: with u = 1e10 they overflow to infinity in single precision,
+    // and 0*inf would turn the factors below into NaN instead of 1.
+    const bool tholeActive = au3 < 50;
+    real expau3 = tholeActive ? EXP(-au3) : 0;
+    real a2u6 = tholeActive ? au3*au3 : 0;
+    real a3u9 = a2u6*au3;
+    // Thole damping factors for energies
+    real thole_c  = 1 - expau3;
+    real thole_d0 = 1 - expau3*(1 + 1.5f*au3);
+    real thole_d1 = 1 - expau3;
+    real thole_q0 = 1 - expau3*(1 + au3 + a2u6);
+    real thole_q1 = 1 - expau3*(1 + au3);
+    // Thole damping factors for derivatives
+    real dthole_c  = 1 - expau3*(1 + 1.5f*au3);
+    real dthole_d0 = 1 - expau3*(1 + au3 + 1.5f*a2u6);
+    real dthole_d1 = 1 - expau3*(1 + au3);
+    real dthole_q0 = 1 - expau3*(1 + au3 + 0.25f*a2u6 + 0.75f*a3u9);
+    real dthole_q1 = 1 - expau3*(1 + au3 + 0.75f*a2u6);
+    // Now we compute the (attenuated) Coulomb operator and its derivatives, contracted with
+    // permanent moments and induced dipoles.  Note that the coefficient of the permanent force
+    // terms is half of the expected value; this is because we compute the interaction of I with
+    // the sum of induced and permanent moments on J, as well as the interaction of J with I's
+    // permanent and induced moments; doing so double counts the permanent-permanent interaction.
+    real ePermCoef, dPermCoef, eUIndCoef, dUIndCoef, eUInpCoef, dUInpCoef;
+    // C-C terms (m=0)
+    ePermCoef = rInvVec[1]*mScale;
+    dPermCoef = -0.5f*mScale*rInvVec[2];
+    Vij[0]  = ePermCoef*atom2.q;
+    Vji[0]  = ePermCoef*atom1.q;
+    VijR[0] = dPermCoef*atom2.q;
+    VjiR[0] = dPermCoef*atom1.q;
+    // C-D and C-Uind terms (m=0)
+    ePermCoef = rInvVec[2]*mScale;
+    eUIndCoef = rInvVec[2]*pScale*thole_c;
+    eUInpCoef = rInvVec[2]*dScale*thole_c;
+    dPermCoef = -rInvVec[3]*mScale;
+    dUIndCoef = -2*rInvVec[3]*pScale*dthole_c;
+    dUInpCoef = -2*rInvVec[3]*dScale*dthole_c;
+    Vij[0]  += -(ePermCoef*rotatedDipole2.x + eUIndCoef*qiUindJ.x + eUInpCoef*qiUinpJ.x);
+    Vji[1]   = -(ePermCoef*atom1.q);
+    VijR[0] += -(dPermCoef*rotatedDipole2.x + dUIndCoef*qiUindJ.x + dUInpCoef*qiUinpJ.x);
+    VjiR[1]  = -(dPermCoef*atom1.q);
+    Vjip[0]  = -(eUInpCoef*atom1.q);
+    Vjid[0]  = -(eUIndCoef*atom1.q);
+    // D-C and Uind-C terms (m=0)
+    Vij[1]   = ePermCoef*atom2.q;
+    Vji[0]  += ePermCoef*rotatedDipole1.x + eUIndCoef*qiUindI.x + eUInpCoef*qiUinpI.x;
+    VijR[1]  = dPermCoef*atom2.q;
+    VjiR[0] += dPermCoef*rotatedDipole1.x + dUIndCoef*qiUindI.x + dUInpCoef*qiUinpI.x;
+    Vijp[0]  = eUInpCoef*atom2.q;
+    Vijd[0]  = eUIndCoef*atom2.q;
+    // D-D and D-Uind terms (m=0)
+    ePermCoef = -2*rInvVec[3]*mScale;
+    eUIndCoef = -2*rInvVec[3]*pScale*thole_d0;
+    eUInpCoef = -2*rInvVec[3]*dScale*thole_d0;
+    dPermCoef = 3*rInvVec[4]*mScale;
+    dUIndCoef = 6*rInvVec[4]*pScale*dthole_d0;
+    dUInpCoef = 6*rInvVec[4]*dScale*dthole_d0;
+    Vij[1]  += ePermCoef*rotatedDipole2.x + eUIndCoef*qiUindJ.x + eUInpCoef*qiUinpJ.x;
+    Vji[1]  += ePermCoef*rotatedDipole1.x + eUIndCoef*qiUindI.x + eUInpCoef*qiUinpI.x;
+    VijR[1] += dPermCoef*rotatedDipole2.x + dUIndCoef*qiUindJ.x + dUInpCoef*qiUinpJ.x;
+    VjiR[1] += dPermCoef*rotatedDipole1.x + dUIndCoef*qiUindI.x + dUInpCoef*qiUinpI.x;
+    Vijp[0] += eUInpCoef*rotatedDipole2.x;
+    Vijd[0] += eUIndCoef*rotatedDipole2.x;
+    Vjip[0] += eUInpCoef*rotatedDipole1.x;
+    Vjid[0] += eUIndCoef*rotatedDipole1.x;
+    // D-D and D-Uind terms (m=1)
+    ePermCoef = rInvVec[3]*mScale;
+    eUIndCoef = rInvVec[3]*pScale*thole_d1;
+    eUInpCoef = rInvVec[3]*dScale*thole_d1;
+    dPermCoef = -1.5f*rInvVec[4]*mScale;
+    dUIndCoef = -3*rInvVec[4]*pScale*dthole_d1;
+    dUInpCoef = -3*rInvVec[4]*dScale*dthole_d1;
+    Vij[2]  = ePermCoef*rotatedDipole2.y + eUIndCoef*qiUindJ.y + eUInpCoef*qiUinpJ.y;
+    Vji[2]  = ePermCoef*rotatedDipole1.y + eUIndCoef*qiUindI.y + eUInpCoef*qiUinpI.y;
+    VijR[2] = dPermCoef*rotatedDipole2.y + dUIndCoef*qiUindJ.y + dUInpCoef*qiUinpJ.y;
+    VjiR[2] = dPermCoef*rotatedDipole1.y + dUIndCoef*qiUindI.y + dUInpCoef*qiUinpI.y;
+    Vij[3]  = ePermCoef*rotatedDipole2.z + eUIndCoef*qiUindJ.z + eUInpCoef*qiUinpJ.z;
+    Vji[3]  = ePermCoef*rotatedDipole1.z + eUIndCoef*qiUindI.z + eUInpCoef*qiUinpI.z;
+    VijR[3] = dPermCoef*rotatedDipole2.z + dUIndCoef*qiUindJ.z + dUInpCoef*qiUinpJ.z;
+    VjiR[3] = dPermCoef*rotatedDipole1.z + dUIndCoef*qiUindI.z + dUInpCoef*qiUinpI.z;
+    Vijp[1] = eUInpCoef*rotatedDipole2.y;
+    Vijd[1] = eUIndCoef*rotatedDipole2.y;
+    Vjip[1] = eUInpCoef*rotatedDipole1.y;
+    Vjid[1] = eUIndCoef*rotatedDipole1.y;
+    Vijp[2] = eUInpCoef*rotatedDipole2.z;
+    Vijd[2] = eUIndCoef*rotatedDipole2.z;
+    Vjip[2] = eUInpCoef*rotatedDipole1.z;
+    Vjid[2] = eUIndCoef*rotatedDipole1.z;
+    // C-Q terms (m=0)
+    ePermCoef = mScale*rInvVec[3];
+    dPermCoef = -1.5f*rInvVec[4]*mScale;
+    Vij[0]  += ePermCoef*rotatedQuadrupole2[0];
+    Vji[4]   = ePermCoef*atom1.q;
+    VijR[0] += dPermCoef*rotatedQuadrupole2[0];
+    VjiR[4]  = dPermCoef*atom1.q;
+    // Q-C terms (m=0)
+    Vij[4]   = ePermCoef*atom2.q;
+    Vji[0]  += ePermCoef*rotatedQuadrupole1[0];
+    VijR[4]  = dPermCoef*atom2.q;
+    VjiR[0] += dPermCoef*rotatedQuadrupole1[0];
+    // D-Q and Uind-Q terms (m=0)
+    // Integer literal (not 3.0) so the products are not promoted to double when real is float.
+    ePermCoef = rInvVec[4]*3*mScale;
+    eUIndCoef = rInvVec[4]*3*pScale*thole_q0;
+    eUInpCoef = rInvVec[4]*3*dScale*thole_q0;
+    dPermCoef = -6*rInvVec[5]*mScale;
+    dUIndCoef = -12*rInvVec[5]*pScale*dthole_q0;
+    dUInpCoef = -12*rInvVec[5]*dScale*dthole_q0;
+    Vij[1]  += ePermCoef*rotatedQuadrupole2[0];
+    Vji[4]  += ePermCoef*rotatedDipole1.x + eUIndCoef*qiUindI.x + eUInpCoef*qiUinpI.x;
+    VijR[1] += dPermCoef*rotatedQuadrupole2[0];
+    VjiR[4] += dPermCoef*rotatedDipole1.x + dUIndCoef*qiUindI.x + dUInpCoef*qiUinpI.x;
+    Vijp[0] += eUInpCoef*rotatedQuadrupole2[0];
+    Vijd[0] += eUIndCoef*rotatedQuadrupole2[0];
+    // Q-D and Q-Uind terms (m=0)
+    Vij[4]  += -(ePermCoef*rotatedDipole2.x + eUIndCoef*qiUindJ.x + eUInpCoef*qiUinpJ.x);
+    Vji[1]  += -(ePermCoef*rotatedQuadrupole1[0]);
+    VijR[4] += -(dPermCoef*rotatedDipole2.x + dUIndCoef*qiUindJ.x + dUInpCoef*qiUinpJ.x);
+    VjiR[1] += -(dPermCoef*rotatedQuadrupole1[0]);
+    Vjip[0] += -(eUInpCoef*rotatedQuadrupole1[0]);
+    Vjid[0] += -(eUIndCoef*rotatedQuadrupole1[0]);
+    // D-Q and Uind-Q terms (m=1)
+    const real sqrtThree = SQRT((real) 3);
+    ePermCoef = -sqrtThree*rInvVec[4]*mScale;
+    eUIndCoef = -sqrtThree*rInvVec[4]*pScale*thole_q1;
+    eUInpCoef = -sqrtThree*rInvVec[4]*dScale*thole_q1;
+    dPermCoef = 2*sqrtThree*rInvVec[5]*mScale;
+    dUIndCoef = 4*sqrtThree*rInvVec[5]*pScale*dthole_q1;
+    dUInpCoef = 4*sqrtThree*rInvVec[5]*dScale*dthole_q1;
+    Vij[2]  += ePermCoef*rotatedQuadrupole2[1];
+    Vji[5]   = ePermCoef*rotatedDipole1.y + eUIndCoef*qiUindI.y + eUInpCoef*qiUinpI.y;
+    VijR[2] += dPermCoef*rotatedQuadrupole2[1];
+    VjiR[5]  = dPermCoef*rotatedDipole1.y + dUIndCoef*qiUindI.y + dUInpCoef*qiUinpI.y;
+    Vij[3]  += ePermCoef*rotatedQuadrupole2[2];
+    Vji[6]   = ePermCoef*rotatedDipole1.z + eUIndCoef*qiUindI.z + eUInpCoef*qiUinpI.z;
+    VijR[3] += dPermCoef*rotatedQuadrupole2[2];
+    VjiR[6]  = dPermCoef*rotatedDipole1.z + dUIndCoef*qiUindI.z + dUInpCoef*qiUinpI.z;
+    Vijp[1] += eUInpCoef*rotatedQuadrupole2[1];
+    Vijd[1] += eUIndCoef*rotatedQuadrupole2[1];
+    Vijp[2] += eUInpCoef*rotatedQuadrupole2[2];
+    Vijd[2] += eUIndCoef*rotatedQuadrupole2[2];
+    // Q-D and Q-Uind terms (m=1)
+    Vij[5]   = -(ePermCoef*rotatedDipole2.y + eUIndCoef*qiUindJ.y + eUInpCoef*qiUinpJ.y);
+    Vji[2]  += -(ePermCoef*rotatedQuadrupole1[1]);
+    VijR[5]  = -(dPermCoef*rotatedDipole2.y + dUIndCoef*qiUindJ.y + dUInpCoef*qiUinpJ.y);
+    VjiR[2] += -(dPermCoef*rotatedQuadrupole1[1]);
+    Vij[6]   = -(ePermCoef*rotatedDipole2.z + eUIndCoef*qiUindJ.z + eUInpCoef*qiUinpJ.z);
+    Vji[3]  += -(ePermCoef*rotatedQuadrupole1[2]);
+    VijR[6]  = -(dPermCoef*rotatedDipole2.z + dUIndCoef*qiUindJ.z + dUInpCoef*qiUinpJ.z);
+    VjiR[3] += -(dPermCoef*rotatedQuadrupole1[2]);
+    Vjip[1] += -(eUInpCoef*rotatedQuadrupole1[1]);
+    Vjid[1] += -(eUIndCoef*rotatedQuadrupole1[1]);
+    Vjip[2] += -(eUInpCoef*rotatedQuadrupole1[2]);
+    Vjid[2] += -(eUIndCoef*rotatedQuadrupole1[2]);
+    // Q-Q terms (m=0)
+    ePermCoef = 6*rInvVec[5]*mScale;
+    dPermCoef = -15*rInvVec[6]*mScale;
+    Vij[4]  += ePermCoef*rotatedQuadrupole2[0];
+    Vji[4]  += ePermCoef*rotatedQuadrupole1[0];
+    VijR[4] += dPermCoef*rotatedQuadrupole2[0];
+    VjiR[4] += dPermCoef*rotatedQuadrupole1[0];
+    // Q-Q terms (m=1)
+    ePermCoef = -4*rInvVec[5]*mScale;
+    dPermCoef = 10*rInvVec[6]*mScale;
+    Vij[5]  += ePermCoef*rotatedQuadrupole2[1];
+    Vji[5]  += ePermCoef*rotatedQuadrupole1[1];
+    VijR[5] += dPermCoef*rotatedQuadrupole2[1];
+    VjiR[5] += dPermCoef*rotatedQuadrupole1[1];
+    Vij[6]  += ePermCoef*rotatedQuadrupole2[2];
+    Vji[6]  += ePermCoef*rotatedQuadrupole1[2];
+    VijR[6] += dPermCoef*rotatedQuadrupole2[2];
+    VjiR[6] += dPermCoef*rotatedQuadrupole1[2];
+    // Q-Q terms (m=2)
+    ePermCoef = rInvVec[5]*mScale;
+    dPermCoef = -2.5f*rInvVec[6]*mScale;
+    Vij[7]  = ePermCoef*rotatedQuadrupole2[3];
+    Vji[7]  = ePermCoef*rotatedQuadrupole1[3];
+    VijR[7] = dPermCoef*rotatedQuadrupole2[3];
+    VjiR[7] = dPermCoef*rotatedQuadrupole1[3];
+    Vij[8]  = ePermCoef*rotatedQuadrupole2[4];
+    Vji[8]  = ePermCoef*rotatedQuadrupole1[4];
+    VijR[8] = dPermCoef*rotatedQuadrupole2[4];
+    VjiR[8] = dPermCoef*rotatedQuadrupole1[4];
+    // Evaluate the energies, forces and torques due to permanent+induced moments
+    // interacting with just the permanent moments.
+    energy += forceFactor*0.5f*(
+        atom1.q*Vij[0] + rotatedDipole1.x*Vij[1] + rotatedDipole1.y*Vij[2] + rotatedDipole1.z*Vij[3] + rotatedQuadrupole1[0]*Vij[4] + rotatedQuadrupole1[1]*Vij[5] + rotatedQuadrupole1[2]*Vij[6] + rotatedQuadrupole1[3]*Vij[7] + rotatedQuadrupole1[4]*Vij[8] +
+        atom2.q*Vji[0] + rotatedDipole2.x*Vji[1] + rotatedDipole2.y*Vji[2] + rotatedDipole2.z*Vji[3] + rotatedQuadrupole2[0]*Vji[4] + rotatedQuadrupole2[1]*Vji[5] + rotatedQuadrupole2[2]*Vji[6] + rotatedQuadrupole2[3]*Vji[7] + rotatedQuadrupole2[4]*Vji[8]);
+    real fIZ = atom1.q*VijR[0] + rotatedDipole1.x*VijR[1] + rotatedDipole1.y*VijR[2] + rotatedDipole1.z*VijR[3] + rotatedQuadrupole1[0]*VijR[4] + rotatedQuadrupole1[1]*VijR[5] + rotatedQuadrupole1[2]*VijR[6] + rotatedQuadrupole1[3]*VijR[7] + rotatedQuadrupole1[4]*VijR[8];
+    real fJZ = atom2.q*VjiR[0] + rotatedDipole2.x*VjiR[1] + rotatedDipole2.y*VjiR[2] + rotatedDipole2.z*VjiR[3] + rotatedQuadrupole2[0]*VjiR[4] + rotatedQuadrupole2[1]*VjiR[5] + rotatedQuadrupole2[2]*VjiR[6] + rotatedQuadrupole2[3]*VjiR[7] + rotatedQuadrupole2[4]*VjiR[8];
+    real EIX = rotatedDipole1.z*Vij[1] - rotatedDipole1.x*Vij[3] + sqrtThree*rotatedQuadrupole1[2]*Vij[4] + rotatedQuadrupole1[4]*Vij[5] - (sqrtThree*rotatedQuadrupole1[0]+rotatedQuadrupole1[3])*Vij[6] + rotatedQuadrupole1[2]*Vij[7] - rotatedQuadrupole1[1]*Vij[8];
+    real EIY = -rotatedDipole1.y*Vij[1] + rotatedDipole1.x*Vij[2] - sqrtThree*rotatedQuadrupole1[1]*Vij[4] + (sqrtThree*rotatedQuadrupole1[0]-rotatedQuadrupole1[3])*Vij[5] - rotatedQuadrupole1[4]*Vij[6] + rotatedQuadrupole1[1]*Vij[7] + rotatedQuadrupole1[2]*Vij[8];
+    real EIZ = -rotatedDipole1.z*Vij[2] + rotatedDipole1.y*Vij[3] - rotatedQuadrupole1[2]*Vij[5] + rotatedQuadrupole1[1]*Vij[6] - 2*rotatedQuadrupole1[4]*Vij[7] + 2*rotatedQuadrupole1[3]*Vij[8];
+    real EJX = rotatedDipole2.z*Vji[1] - rotatedDipole2.x*Vji[3] + sqrtThree*rotatedQuadrupole2[2]*Vji[4] + rotatedQuadrupole2[4]*Vji[5] - (sqrtThree*rotatedQuadrupole2[0]+rotatedQuadrupole2[3])*Vji[6] + rotatedQuadrupole2[2]*Vji[7] - rotatedQuadrupole2[1]*Vji[8];
+    real EJY = -rotatedDipole2.y*Vji[1] + rotatedDipole2.x*Vji[2] - sqrtThree*rotatedQuadrupole2[1]*Vji[4] + (sqrtThree*rotatedQuadrupole2[0]-rotatedQuadrupole2[3])*Vji[5] - rotatedQuadrupole2[4]*Vji[6] + rotatedQuadrupole2[1]*Vji[7] + rotatedQuadrupole2[2]*Vji[8];
+    real EJZ = -rotatedDipole2.z*Vji[2] + rotatedDipole2.y*Vji[3] - rotatedQuadrupole2[2]*Vji[5] + rotatedQuadrupole2[1]*Vji[6] - 2*rotatedQuadrupole2[4]*Vji[7] + 2*rotatedQuadrupole2[3]*Vji[8];
+    // Define the torque intermediates for the induced dipoles. These are simply the induced dipole torque
+    // intermediates dotted with the field due to permanent moments only, at each center. We inline the
+    // induced dipole torque intermediates here, for simplicity. N.B. There are no torques on the dipoles
+    // themselves, so we accumulate the torque intermediates into separate variables to allow them to be
+    // used only in the force calculation.
+    //
+    // The torque about the x axis (needed to obtain the y force on the induced dipoles, below)
+    //    qiUindIx[0] = qiQUindI[2];    qiUindIx[1] = 0;    qiUindIx[2] = -qiQUindI[0]
+    real iEIX = qiUinpI.z*Vijp[0] + qiUindI.z*Vijd[0] - qiUinpI.x*Vijp[2] - qiUindI.x*Vijd[2];
+    real iEJX = qiUinpJ.z*Vjip[0] + qiUindJ.z*Vjid[0] - qiUinpJ.x*Vjip[2] - qiUindJ.x*Vjid[2];
+    // The torque about the y axis (needed to obtain the x force on the induced dipoles, below)
+    //    qiUindIy[0] = -qiQUindI[1];   qiUindIy[1] = qiQUindI[0];    qiUindIy[2] = 0
+    real iEIY = qiUinpI.x*Vijp[1] + qiUindI.x*Vijd[1] - qiUinpI.y*Vijp[0] - qiUindI.y*Vijd[0];
+    real iEJY = qiUinpJ.x*Vjip[1] + qiUindJ.x*Vjid[1] - qiUinpJ.y*Vjip[0] - qiUindJ.y*Vjid[0];
+#ifdef USE_MUTUAL_POLARIZATION
+    // Uind-Uind terms (m=0)
+    // Both radial contributions are added to fIZ; only the sum fJZ+fIZ is used below, so it does
+    // not matter which of the two accumulators receives them.
+    real eCoef = -4*rInvVec[3]*thole_d0;
+    real dCoef = 6*rInvVec[4]*dthole_d0;
+    iEIX += eCoef*(qiUinpI.z*qiUindJ.x + qiUindI.z*qiUinpJ.x);
+    iEJX += eCoef*(qiUinpJ.z*qiUindI.x + qiUindJ.z*qiUinpI.x);
+    iEIY -= eCoef*(qiUinpI.y*qiUindJ.x + qiUindI.y*qiUinpJ.x);
+    iEJY -= eCoef*(qiUinpJ.y*qiUindI.x + qiUindJ.y*qiUinpI.x);
+    fIZ += dCoef*(qiUinpI.x*qiUindJ.x + qiUindI.x*qiUinpJ.x);
+    fIZ += dCoef*(qiUinpJ.x*qiUindI.x + qiUindJ.x*qiUinpI.x);
+    // Uind-Uind terms (m=1)
+    eCoef = 2*rInvVec[3]*thole_d1;
+    dCoef = -3*rInvVec[4]*dthole_d1;
+    iEIX -= eCoef*(qiUinpI.x*qiUindJ.z + qiUindI.x*qiUinpJ.z);
+    iEJX -= eCoef*(qiUinpJ.x*qiUindI.z + qiUindJ.x*qiUinpI.z);
+    iEIY += eCoef*(qiUinpI.x*qiUindJ.y + qiUindI.x*qiUinpJ.y);
+    iEJY += eCoef*(qiUinpJ.x*qiUindI.y + qiUindJ.x*qiUinpI.y);
+    fIZ += dCoef*(qiUinpI.y*qiUindJ.y + qiUindI.y*qiUinpJ.y + qiUinpI.z*qiUindJ.z + qiUindI.z*qiUinpJ.z);
+    fIZ += dCoef*(qiUinpJ.y*qiUindI.y + qiUindJ.y*qiUinpI.y + qiUinpJ.z*qiUindI.z + qiUindJ.z*qiUinpI.z);
+#endif
+    // The quasi-internal frame forces and torques.  Note that the induced torque intermediates are
+    // used in the force expression, but not in the torques; the induced dipoles are isotropic.
+    real qiForce[3] = {rInv*(EIY+EJY+iEIY+iEJY), -rInv*(EIX+EJX+iEIX+iEJX), -(fJZ+fIZ)};
+    real qiTorqueI[3] = {-EIX, -EIY, -EIZ};
+    real qiTorqueJ[3] = {-EJX, -EJY, -EJZ};
+    // Map the force and torques back through qiRotationMatrix.  The matrix is indexed here with
+    // rows and columns swapped relative to the forward rotation of the induced dipoles above.
+    real3 force = make_real3(qiRotationMatrix[1][1]*qiForce[0] + qiRotationMatrix[2][1]*qiForce[1] + qiRotationMatrix[0][1]*qiForce[2],
+                             qiRotationMatrix[1][2]*qiForce[0] + qiRotationMatrix[2][2]*qiForce[1] + qiRotationMatrix[0][2]*qiForce[2],
+                             qiRotationMatrix[1][0]*qiForce[0] + qiRotationMatrix[2][0]*qiForce[1] + qiRotationMatrix[0][0]*qiForce[2]);
+    atom1.force += force;
+    atom1.torque += make_real3(qiRotationMatrix[1][1]*qiTorqueI[0] + qiRotationMatrix[2][1]*qiTorqueI[1] + qiRotationMatrix[0][1]*qiTorqueI[2],
+                               qiRotationMatrix[1][2]*qiTorqueI[0] + qiRotationMatrix[2][2]*qiTorqueI[1] + qiRotationMatrix[0][2]*qiTorqueI[2],
+                               qiRotationMatrix[1][0]*qiTorqueI[0] + qiRotationMatrix[2][0]*qiTorqueI[1] + qiRotationMatrix[0][0]*qiTorqueI[2]);
+    // atom2 receives the opposite force and its own torque only when forceFactor is exactly 1.
+    if (forceFactor == 1) {
+        atom2.force -= force;
+        atom2.torque += make_real3(qiRotationMatrix[1][1]*qiTorqueJ[0] + qiRotationMatrix[2][1]*qiTorqueJ[1] + qiRotationMatrix[0][1]*qiTorqueJ[2],
+                                   qiRotationMatrix[1][2]*qiTorqueJ[0] + qiRotationMatrix[2][2]*qiTorqueJ[1] + qiRotationMatrix[0][2]*qiTorqueJ[2],
+                                   qiRotationMatrix[1][0]*qiTorqueJ[0] + qiRotationMatrix[2][0]*qiTorqueJ[1] + qiRotationMatrix[0][0]*qiTorqueJ[2]);
+    }
+}
 /**
 * Compute electrostatic interactions.
 */
@@ -69,7 +382,7 @@ extern "C" __global__ void computeElectrostatics(
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
        const unsigned int* __restrict__ interactingAtoms,
 #endif
-        const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
+        const real* __restrict__ sphericalDipole, const real* __restrict__ sphericalQuadrupole, const real* __restrict__ inducedDipole,
        const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
@@ -77,7 +390,6 @@ extern "C" __global__ void computeElectrostatics(
    const unsigned int tbx = threadIdx.x - tgx;
    real energy = 0;
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
    // First loop: process tiles that contain exclusions.
@@ -89,21 +401,23 @@ extern "C" __global__ void computeElectrostatics(
        const unsigned int y = tileIndices.y;
        AtomData data;
        unsigned int atom1 = x*TILE_SIZE + tgx;
-        loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+        loadAtomData(data, atom1, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
        data.force = make_real3(0);
+        data.torque = make_real3(0);
        uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx];
        unsigned int polarizationGroup = polarizationGroupFlags[pos*TILE_SIZE+tgx];
        if (x == y) {
            // This tile is on the diagonal.
-            localData[threadIdx.x].posq = data.posq;
+            localData[threadIdx.x].pos = data.pos;
-            localData[threadIdx.x].dipole = data.dipole;
+            localData[threadIdx.x].q = data.q;
+            localData[threadIdx.x].sphericalDipole = data.sphericalDipole;
 #ifdef INCLUDE_QUADRUPOLES
-            localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
+            localData[threadIdx.x].sphericalQuadrupole[0] = data.sphericalQuadrupole[0];
-            localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
+            localData[threadIdx.x].sphericalQuadrupole[1] = data.sphericalQuadrupole[1];
-            localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
+            localData[threadIdx.x].sphericalQuadrupole[2] = data.sphericalQuadrupole[2];
-            localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
+            localData[threadIdx.x].sphericalQuadrupole[3] = data.sphericalQuadrupole[3];
-            localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
+            localData[threadIdx.x].sphericalQuadrupole[4] = data.sphericalQuadrupole[4];
 #endif
            localData[threadIdx.x].inducedDipole = data.inducedDipole;
            localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
@@ -115,101 +429,57 @@ extern "C" __global__ void computeElectrostatics(
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = y*TILE_SIZE+j;
                if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                    real3 tempForce;
-                    real tempEnergy;
                    float d = computeDScaleFactor(polarizationGroup, j);
                    float p = computePScaleFactor(covalent, polarizationGroup, j);
                    float m = computeMScaleFactor(covalent, j);
-                    computeOneInteractionF1(data, localData[tbx+j], d, p, m, tempEnergy, tempForce);
+                    computeOneInteraction(data, localData[tbx+j], true, d, p, m, 0.5f, energy);
-                    data.force += tempForce;
-                    energy += 0.5f*tempEnergy;
                }
            }
-            data.force *= ENERGY_SCALE_FACTOR;
+            data.force *= -ENERGY_SCALE_FACTOR;
+            data.torque *= ENERGY_SCALE_FACTOR;
            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
-            // Compute torques.
+            atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
-            data.force = make_real3(0);
-            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                int atom2 = y*TILE_SIZE+j;
-                if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                    real3 tempForce;
-                    float d = computeDScaleFactor(polarizationGroup, j);
-                    float p = computePScaleFactor(covalent, polarizationGroup, j);
-                    float m = computeMScaleFactor(covalent, j);
-                    computeOneInteractionT1(data, localData[tbx+j], d, p, m, tempForce);
-                    data.force += tempForce;
-                }
-            }
-            data.force *= ENERGY_SCALE_FACTOR;
-            atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-            atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-            atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
        }
        else {
            // This is an off-diagonal tile.
            unsigned int j = y*TILE_SIZE + tgx;
-            loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            loadAtomData(localData[threadIdx.x], j, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
            localData[threadIdx.x].force = make_real3(0);
+            localData[threadIdx.x].torque = make_real3(0);
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = y*TILE_SIZE+tj;
                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                    real3 tempForce;
-                    real tempEnergy;
                    float d = computeDScaleFactor(polarizationGroup, tj);
                    float p = computePScaleFactor(covalent, polarizationGroup, tj);
                    float m = computeMScaleFactor(covalent, tj);
-                    computeOneInteractionF1(data, localData[tbx+tj], d, p, m, tempEnergy, tempForce);
+                    computeOneInteraction(data, localData[tbx+tj], true, d, p, m, 1, energy);
-                    data.force += tempForce;
-                    localData[tbx+tj].force -= tempForce;
-                    energy += tempEnergy;
                }
                tj = (tj + 1) & (TILE_SIZE - 1);
            }
-            data.force *= ENERGY_SCALE_FACTOR;
+            data.force *= -ENERGY_SCALE_FACTOR;
-            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
+            data.torque *= ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].torque *= ENERGY_SCALE_FACTOR;
            unsigned int offset = x*TILE_SIZE + tgx;
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
            offset = y*TILE_SIZE + tgx;
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
-            // Compute torques.
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
-            data.force = make_real3(0);
-            localData[threadIdx.x].force = make_real3(0);
-            for (j = 0; j < TILE_SIZE; j++) {
-                int atom2 = y*TILE_SIZE+tj;
-                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                    real3 tempForce;
-                    float d = computeDScaleFactor(polarizationGroup, tj);
-                    float p = computePScaleFactor(covalent, polarizationGroup, tj);
-                    float m = computeMScaleFactor(covalent, tj);
-                    computeOneInteractionT1(data, localData[tbx+tj], d, p, m, tempForce);
-                    data.force += tempForce;
-                    computeOneInteractionT3(data, localData[tbx+tj], d, p, m, tempForce);
-                    localData[tbx+tj].force += tempForce;
-                }
-                tj = (tj + 1) & (TILE_SIZE - 1);
-            }
-            data.force *= ENERGY_SCALE_FACTOR;
-            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
-            offset = x*TILE_SIZE + tgx;
-            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-            offset = y*TILE_SIZE + tgx;
-            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
        }
    }
@@ -272,16 +542,18 @@ extern "C" __global__ void computeElectrostatics(
            // Load atom data for this tile.
            AtomData data;
-            loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            loadAtomData(data, atom1, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
            data.force = make_real3(0);
+            data.torque = make_real3(0);
 #ifdef USE_CUTOFF
            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif
            atomIndices[threadIdx.x] = j;
-            loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            loadAtomData(localData[threadIdx.x], j, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
            localData[threadIdx.x].force = make_real3(0);
+            localData[threadIdx.x].torque = make_real3(0);
            // Compute forces.
@@ -289,21 +561,24 @@ extern "C" __global__ void computeElectrostatics(
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = atomIndices[tbx+tj];
                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                    real3 tempForce;
+                    computeOneInteraction(data, localData[tbx+tj], false, 1, 1, 1, 1, energy);
-                    real tempEnergy;
-                    computeOneInteractionF1(data, localData[tbx+tj], 1, 1, 1, tempEnergy, tempForce);
-                    data.force += tempForce;
-                    localData[tbx+tj].force -= tempForce;
-                    energy += tempEnergy;
                }
                tj = (tj + 1) & (TILE_SIZE - 1);
            }
-            data.force *= ENERGY_SCALE_FACTOR;
+            data.force *= -ENERGY_SCALE_FACTOR;
-            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
+            data.torque *= ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
+            localData[threadIdx.x].torque *= ENERGY_SCALE_FACTOR;
+            // Write results.
            unsigned int offset = x*TILE_SIZE + tgx;
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
 #ifdef USE_CUTOFF
            offset = atomIndices[threadIdx.x];
 #else
@@ -312,36 +587,9 @@ extern "C" __global__ void computeElectrostatics(
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
-            // Compute torques.
+            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
+            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
-            data.force = make_real3(0);
-            localData[threadIdx.x].force = make_real3(0);
-            for (j = 0; j < TILE_SIZE; j++) {
-                int atom2 = y*TILE_SIZE+tj;
-                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                    real3 tempForce;
-                    computeOneInteractionT1(data, localData[tbx+tj], 1, 1, 1, tempForce);
-                    data.force += tempForce;
-                    computeOneInteractionT3(data, localData[tbx+tj], 1, 1, 1, tempForce);
-                    localData[tbx+tj].force += tempForce;
-                }
-                tj = (tj + 1) & (TILE_SIZE - 1);
-            }
-            data.force *= ENERGY_SCALE_FACTOR;
-            localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
-            offset = x*TILE_SIZE + tgx;
-            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
-#ifdef USE_CUTOFF
-            offset = atomIndices[threadIdx.x];
-#else
-            offset = y*TILE_SIZE + tgx;
-#endif
-            atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-            atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
        }
        pos++;
    }

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
@@ -196,14 +196,14 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real*
    // Transform the potential.
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
-        cphi[10*i] = fphi[20*i];
+        cphi[10*i] = fphi[i];
-        cphi[10*i+1] = a[0][0]*fphi[20*i+1] + a[0][1]*fphi[20*i+2] + a[0][2]*fphi[20*i+3];
+        cphi[10*i+1] = a[0][0]*fphi[i+NUM_ATOMS*1] + a[0][1]*fphi[i+NUM_ATOMS*2] + a[0][2]*fphi[i+NUM_ATOMS*3];
-        cphi[10*i+2] = a[1][0]*fphi[20*i+1] + a[1][1]*fphi[20*i+2] + a[1][2]*fphi[20*i+3];
+        cphi[10*i+2] = a[1][0]*fphi[i+NUM_ATOMS*1] + a[1][1]*fphi[i+NUM_ATOMS*2] + a[1][2]*fphi[i+NUM_ATOMS*3];
-        cphi[10*i+3] = a[2][0]*fphi[20*i+1] + a[2][1]*fphi[20*i+2] + a[2][2]*fphi[20*i+3];
+        cphi[10*i+3] = a[2][0]*fphi[i+NUM_ATOMS*1] + a[2][1]*fphi[i+NUM_ATOMS*2] + a[2][2]*fphi[i+NUM_ATOMS*3];
        for (int j = 0; j < 6; j++) {
            cphi[10*i+4+j] = 0;
            for (int k = 0; k < 6; k++)
-                cphi[10*i+4+j] += b[j][k]*fphi[20*i+4+k];
+                cphi[10*i+4+j] += b[j][k]*fphi[i+NUM_ATOMS*(4+k)];
        }
    }
 }
@@ -211,20 +211,32 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real*
 extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ fracDipole,
        const real* __restrict__ fracQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
+#if __CUDA_ARCH__ < 500
    real array[PME_ORDER*PME_ORDER];
+#else
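+    // We have shared memory to spare, and putting the workspace array there reduces the load on L2 cache.  NOTE(review): the array holds 64 per-thread slices indexed by threadIdx.x, so this presumably requires blockDim.x <= 64 -- confirm against the launch configuration.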
+    __shared__ real sharedArray[PME_ORDER*PME_ORDER*64];
+    real* array = &sharedArray[PME_ORDER*PME_ORDER*threadIdx.x];
+#endif
    real4 theta1[PME_ORDER];
    real4 theta2[PME_ORDER];
    real4 theta3[PME_ORDER];
-    // Process the atoms in spatially sorted order.  This improves cache performance when loading
+    for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
-    // the grid values.
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
-        int m = pmeAtomGridIndex[i].x;
        real4 pos = posq[m];
        pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
        pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
        pos -= periodicBoxVecX*floor(pos.x*recipBoxVecX.z+0.5f);
+        real atomCharge = pos.w;
+        real atomDipoleX = fracDipole[m*3];
+        real atomDipoleY = fracDipole[m*3+1];
+        real atomDipoleZ = fracDipole[m*3+2];
+        real atomQuadrupoleXX = fracQuadrupole[m*6];
+        real atomQuadrupoleXY = fracQuadrupole[m*6+1];
+        real atomQuadrupoleXZ = fracQuadrupole[m*6+2];
+        real atomQuadrupoleYY = fracQuadrupole[m*6+3];
+        real atomQuadrupoleYZ = fracQuadrupole[m*6+4];
+        real atomQuadrupoleZZ = fracQuadrupole[m*6+5];
        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.
@@ -271,16 +283,6 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
                    int index = ybase + zindex;
                    real4 v = theta3[iz];
-                    real atomCharge = pos.w;
-                    real atomDipoleX = fracDipole[m*3];
-                    real atomDipoleY = fracDipole[m*3+1];
-                    real atomDipoleZ = fracDipole[m*3+2];
-                    real atomQuadrupoleXX = fracQuadrupole[m*6];
-                    real atomQuadrupoleXY = fracQuadrupole[m*6+1];
-                    real atomQuadrupoleXZ = fracQuadrupole[m*6+2];
-                    real atomQuadrupoleYY = fracQuadrupole[m*6+3];
-                    real atomQuadrupoleYZ = fracQuadrupole[m*6+4];
-                    real atomQuadrupoleZZ = fracQuadrupole[m*6+5];
                    real term0 = atomCharge*u.x*v.x + atomDipoleY*u.y*v.x + atomDipoleZ*u.x*v.y + atomQuadrupoleYY*u.z*v.x + atomQuadrupoleZZ*u.x*v.z + atomQuadrupoleYZ*u.y*v.y;
                    real term1 = atomDipoleX*u.x*v.x + atomQuadrupoleXY*u.y*v.x + atomQuadrupoleXZ*u.x*v.y;
                    real term2 = atomQuadrupoleXX * u.x * v.x;
@@ -300,7 +302,13 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
 extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole,
        const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
+#if __CUDA_ARCH__ < 500
    real array[PME_ORDER*PME_ORDER];
+#else
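+    // We have shared memory to spare, and putting the workspace array there reduces the load on L2 cache.  NOTE(review): the array holds 64 per-thread slices indexed by threadIdx.x, so this presumably requires blockDim.x <= 64 -- confirm against the launch configuration.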
+    __shared__ real sharedArray[PME_ORDER*PME_ORDER*64];
+    real* array = &sharedArray[PME_ORDER*PME_ORDER*threadIdx.x];
+#endif
    real4 theta1[PME_ORDER];
    real4 theta2[PME_ORDER];
    real4 theta3[PME_ORDER];
@@ -318,15 +326,19 @@ extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ po
    }
    __syncthreads();
-    // Process the atoms in spatially sorted order.  This improves cache performance when loading
+    for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
-    // the grid values.
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
-        int m = pmeAtomGridIndex[i].x;
        real4 pos = posq[m];
        pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
        pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
        pos -= periodicBoxVecX*floor(pos.x*recipBoxVecX.z+0.5f);
+        real3 cinducedDipole = ((const real3*) inducedDipole)[m];
+        real3 cinducedDipolePolar = ((const real3*) inducedDipolePolar)[m];
+        real3 finducedDipole = make_real3(cinducedDipole.x*cartToFrac[0][0] + cinducedDipole.y*cartToFrac[0][1] + cinducedDipole.z*cartToFrac[0][2],
+                                          cinducedDipole.x*cartToFrac[1][0] + cinducedDipole.y*cartToFrac[1][1] + cinducedDipole.z*cartToFrac[1][2],
+                                          cinducedDipole.x*cartToFrac[2][0] + cinducedDipole.y*cartToFrac[2][1] + cinducedDipole.z*cartToFrac[2][2]);
+        real3 finducedDipolePolar = make_real3(cinducedDipolePolar.x*cartToFrac[0][0] + cinducedDipolePolar.y*cartToFrac[0][1] + cinducedDipolePolar.z*cartToFrac[0][2],
+                                               cinducedDipolePolar.x*cartToFrac[1][0] + cinducedDipolePolar.y*cartToFrac[1][1] + cinducedDipolePolar.z*cartToFrac[1][2],
+                                               cinducedDipolePolar.x*cartToFrac[2][0] + cinducedDipolePolar.y*cartToFrac[2][1] + cinducedDipolePolar.z*cartToFrac[2][2]);
        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.
@@ -373,14 +385,6 @@ extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ po
                    int index = ybase + zindex;
                    real4 v = theta3[iz];
-                    real3 cinducedDipole = make_real3(inducedDipole[m*3], inducedDipole[m*3+1], inducedDipole[m*3+2]);
-                    real3 cinducedDipolePolar = make_real3(inducedDipolePolar[m*3], inducedDipolePolar[m*3+1], inducedDipolePolar[m*3+2]);
-                    real3 finducedDipole = make_real3(cinducedDipole.x*cartToFrac[0][0] + cinducedDipole.y*cartToFrac[0][1] + cinducedDipole.z*cartToFrac[0][2],
-                                                      cinducedDipole.x*cartToFrac[1][0] + cinducedDipole.y*cartToFrac[1][1] + cinducedDipole.z*cartToFrac[1][2],
-                                                      cinducedDipole.x*cartToFrac[2][0] + cinducedDipole.y*cartToFrac[2][1] + cinducedDipole.z*cartToFrac[2][2]);
-                    real3 finducedDipolePolar = make_real3(cinducedDipolePolar.x*cartToFrac[0][0] + cinducedDipolePolar.y*cartToFrac[0][1] + cinducedDipolePolar.z*cartToFrac[0][2],
-                                                           cinducedDipolePolar.x*cartToFrac[1][0] + cinducedDipolePolar.y*cartToFrac[1][1] + cinducedDipolePolar.z*cartToFrac[1][2],
-                                                           cinducedDipolePolar.x*cartToFrac[2][0] + cinducedDipolePolar.y*cartToFrac[2][1] + cinducedDipolePolar.z*cartToFrac[2][2]);
                    real term01 = finducedDipole.y*u.y*v.x + finducedDipole.z*u.x*v.y;
                    real term11 = finducedDipole.x*u.x*v.x;
                    real term02 = finducedDipolePolar.y*u.y*v.x + finducedDipolePolar.z*u.x*v.y;
@@ -448,7 +452,13 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
        long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers,  const real4* __restrict__ posq,
        const real* __restrict__ labFrameDipole, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, int2* __restrict__ pmeAtomGridIndex) {
+#if __CUDA_ARCH__ < 500
    real array[PME_ORDER*PME_ORDER];
+#else
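+    // We have shared memory to spare, and putting the workspace array there reduces the load on L2 cache.  NOTE(review): the array holds 64 per-thread slices indexed by threadIdx.x, so this presumably requires blockDim.x <= 64 -- confirm against the launch configuration.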
+    __shared__ real sharedArray[PME_ORDER*PME_ORDER*64];
+    real* array = &sharedArray[PME_ORDER*PME_ORDER*threadIdx.x];
+#endif
    real4 theta1[PME_ORDER];
    real4 theta2[PME_ORDER];
    real4 theta3[PME_ORDER];
@@ -582,26 +592,26 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
            tuv012 += tu01*v.z;
            tuv111 += tu11*v.y;
        }
-        phi[20*m] = tuv000;
+        phi[m] = tuv000;
-        phi[20*m+1] = tuv100;
+        phi[m+NUM_ATOMS] = tuv100;
-        phi[20*m+2] = tuv010;
+        phi[m+NUM_ATOMS*2] = tuv010;
-        phi[20*m+3] = tuv001;
+        phi[m+NUM_ATOMS*3] = tuv001;
-        phi[20*m+4] = tuv200;
+        phi[m+NUM_ATOMS*4] = tuv200;
-        phi[20*m+5] = tuv020;
+        phi[m+NUM_ATOMS*5] = tuv020;
-        phi[20*m+6] = tuv002;
+        phi[m+NUM_ATOMS*6] = tuv002;
-        phi[20*m+7] = tuv110;
+        phi[m+NUM_ATOMS*7] = tuv110;
-        phi[20*m+8] = tuv101;
+        phi[m+NUM_ATOMS*8] = tuv101;
-        phi[20*m+9] = tuv011;
+        phi[m+NUM_ATOMS*9] = tuv011;
-        phi[20*m+10] = tuv300;
+        phi[m+NUM_ATOMS*10] = tuv300;
-        phi[20*m+11] = tuv030;
+        phi[m+NUM_ATOMS*11] = tuv030;
-        phi[20*m+12] = tuv003;
+        phi[m+NUM_ATOMS*12] = tuv003;
-        phi[20*m+13] = tuv210;
+        phi[m+NUM_ATOMS*13] = tuv210;
-        phi[20*m+14] = tuv201;
+        phi[m+NUM_ATOMS*14] = tuv201;
-        phi[20*m+15] = tuv120;
+        phi[m+NUM_ATOMS*15] = tuv120;
-        phi[20*m+16] = tuv021;
+        phi[m+NUM_ATOMS*16] = tuv021;
-        phi[20*m+17] = tuv102;
+        phi[m+NUM_ATOMS*17] = tuv102;
-        phi[20*m+18] = tuv012;
+        phi[m+NUM_ATOMS*18] = tuv012;
-        phi[20*m+19] = tuv111;
+        phi[m+NUM_ATOMS*19] = tuv111;
        real dipoleScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI;
        long long fieldx = (long long) ((dipoleScale*labFrameDipole[m*3]-tuv100*fracToCart[0][0]-tuv010*fracToCart[0][1]-tuv001*fracToCart[0][2])*0x100000000);
        fieldBuffers[m] = fieldx;
@@ -619,7 +629,13 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
        real* __restrict__ phip, real* __restrict__ phidp, const real4* __restrict__ posq,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX,
        real3 recipBoxVecY, real3 recipBoxVecZ, int2* __restrict__ pmeAtomGridIndex) {
+#if __CUDA_ARCH__ < 500
    real array[PME_ORDER*PME_ORDER];
+#else
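+    // We have shared memory to spare, and putting the workspace array there reduces the load on L2 cache.  NOTE(review): the array holds 64 per-thread slices indexed by threadIdx.x, so this presumably requires blockDim.x <= 64 -- confirm against the launch configuration.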
+    __shared__ real sharedArray[PME_ORDER*PME_ORDER*64];
+    real* array = &sharedArray[PME_ORDER*PME_ORDER*threadIdx.x];
+#endif
    real4 theta1[PME_ORDER];
    real4 theta2[PME_ORDER];
    real4 theta3[PME_ORDER];
@@ -812,55 +828,55 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
            tuv012 += tu01*v.z;
            tuv111 += tu11*v.y;
        }
-        phid[10*m]   = 0;
+        phid[m]   = 0;
-        phid[10*m+1] = tuv100_1;
+        phid[m+NUM_ATOMS] = tuv100_1;
-        phid[10*m+2] = tuv010_1;
+        phid[m+NUM_ATOMS*2] = tuv010_1;
-        phid[10*m+3] = tuv001_1;
+        phid[m+NUM_ATOMS*3] = tuv001_1;
-        phid[10*m+4] = tuv200_1;
+        phid[m+NUM_ATOMS*4] = tuv200_1;
-        phid[10*m+5] = tuv020_1;
+        phid[m+NUM_ATOMS*5] = tuv020_1;
-        phid[10*m+6] = tuv002_1;
+        phid[m+NUM_ATOMS*6] = tuv002_1;
-        phid[10*m+7] = tuv110_1;
+        phid[m+NUM_ATOMS*7] = tuv110_1;
-        phid[10*m+8] = tuv101_1;
+        phid[m+NUM_ATOMS*8] = tuv101_1;
-        phid[10*m+9] = tuv011_1;
+        phid[m+NUM_ATOMS*9] = tuv011_1;
-        phip[10*m]   = 0;
+        phip[m]   = 0;
-        phip[10*m+1] = tuv100_2;
+        phip[m+NUM_ATOMS] = tuv100_2;
-        phip[10*m+2] = tuv010_2;
+        phip[m+NUM_ATOMS*2] = tuv010_2;
-        phip[10*m+3] = tuv001_2;
+        phip[m+NUM_ATOMS*3] = tuv001_2;
-        phip[10*m+4] = tuv200_2;
+        phip[m+NUM_ATOMS*4] = tuv200_2;
-        phip[10*m+5] = tuv020_2;
+        phip[m+NUM_ATOMS*5] = tuv020_2;
-        phip[10*m+6] = tuv002_2;
+        phip[m+NUM_ATOMS*6] = tuv002_2;
-        phip[10*m+7] = tuv110_2;
+        phip[m+NUM_ATOMS*7] = tuv110_2;
-        phip[10*m+8] = tuv101_2;
+        phip[m+NUM_ATOMS*8] = tuv101_2;
-        phip[10*m+9] = tuv011_2;
+        phip[m+NUM_ATOMS*9] = tuv011_2;
-        phidp[20*m] = tuv000;
+        phidp[m] = tuv000;
-        phidp[20*m+1] = tuv100;
+        phidp[m+NUM_ATOMS*1] = tuv100;
-        phidp[20*m+2] = tuv010;
+        phidp[m+NUM_ATOMS*2] = tuv010;
-        phidp[20*m+3] = tuv001;
+        phidp[m+NUM_ATOMS*3] = tuv001;
-        phidp[20*m+4] = tuv200;
+        phidp[m+NUM_ATOMS*4] = tuv200;
-        phidp[20*m+5] = tuv020;
+        phidp[m+NUM_ATOMS*5] = tuv020;
-        phidp[20*m+6] = tuv002;
+        phidp[m+NUM_ATOMS*6] = tuv002;
-        phidp[20*m+7] = tuv110;
+        phidp[m+NUM_ATOMS*7] = tuv110;
-        phidp[20*m+8] = tuv101;
+        phidp[m+NUM_ATOMS*8] = tuv101;
-        phidp[20*m+9] = tuv011;
+        phidp[m+NUM_ATOMS*9] = tuv011;
-        phidp[20*m+10] = tuv300;
+        phidp[m+NUM_ATOMS*10] = tuv300;
-        phidp[20*m+11] = tuv030;
+        phidp[m+NUM_ATOMS*11] = tuv030;
-        phidp[20*m+12] = tuv003;
+        phidp[m+NUM_ATOMS*12] = tuv003;
-        phidp[20*m+13] = tuv210;
+        phidp[m+NUM_ATOMS*13] = tuv210;
-        phidp[20*m+14] = tuv201;
+        phidp[m+NUM_ATOMS*14] = tuv201;
-        phidp[20*m+15] = tuv120;
+        phidp[m+NUM_ATOMS*15] = tuv120;
-        phidp[20*m+16] = tuv021;
+        phidp[m+NUM_ATOMS*16] = tuv021;
-        phidp[20*m+17] = tuv102;
+        phidp[m+NUM_ATOMS*17] = tuv102;
-        phidp[20*m+18] = tuv012;
+        phidp[m+NUM_ATOMS*18] = tuv012;
-        phidp[20*m+19] = tuv111;
+        phidp[m+NUM_ATOMS*19] = tuv111;
    }
 }
 extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers,
        long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, const real* __restrict__ labFrameDipole,
        const real* __restrict__ labFrameQuadrupole, const real* __restrict__ fracDipole, const real* __restrict__ fracQuadrupole,
-        const real* __restrict__ phi_global, const real* __restrict__ cphi_global, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
+        const real* __restrict__ phi, const real* __restrict__ cphi_global, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    real multipole[10];
    const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19};
    const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16};
@@ -922,13 +938,12 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
        multipole[8] = fracQuadrupole[i*6+2];
        multipole[9] = fracQuadrupole[i*6+4];
-        const real* phi = &phi_global[20*i];
        real4 f = make_real4(0, 0, 0, 0);
        for (int k = 0; k < 10; k++) {
-            energy += multipole[k]*phi[k];
+            energy += multipole[k]*phi[i+NUM_ATOMS*k];
-            f.x += multipole[k]*phi[deriv1[k]];
+            f.x += multipole[k]*phi[i+NUM_ATOMS*deriv1[k]];
-            f.y += multipole[k]*phi[deriv2[k]];
+            f.y += multipole[k]*phi[i+NUM_ATOMS*deriv2[k]];
-            f.z += multipole[k]*phi[deriv3[k]];
+            f.z += multipole[k]*phi[i+NUM_ATOMS*deriv3[k]];
        }
        f = make_real4(EPSILON_FACTOR*(f.x*fracToCart[0][0] + f.y*fracToCart[0][1] + f.z*fracToCart[0][2]),
                       EPSILON_FACTOR*(f.x*fracToCart[1][0] + f.y*fracToCart[1][1] + f.z*fracToCart[1][2]),
@@ -944,8 +959,8 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
        long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, const real* __restrict__ labFrameDipole,
        const real* __restrict__ labFrameQuadrupole, const real* __restrict__ fracDipole, const real* __restrict__ fracQuadrupole,
        const real* __restrict__ inducedDipole_global, const real* __restrict__ inducedDipolePolar_global,
-        const real* __restrict__ phi_global, const real* __restrict__ phid_global, const real* __restrict__ phip_global,
+        const real* __restrict__ phi, const real* __restrict__ phid, const real* __restrict__ phip,
-        const real* __restrict__ phidp_global, const real* __restrict__ cphi_global, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
+        const real* __restrict__ phidp, const real* __restrict__ cphi_global, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    real multipole[10];
    real cinducedDipole[3], inducedDipole[3];
    real cinducedDipolePolar[3], inducedDipolePolar[3];
@@ -1023,34 +1038,30 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
        inducedDipolePolar[0] = cinducedDipolePolar[0]*fracToCart[0][0] + cinducedDipolePolar[1]*fracToCart[1][0] + cinducedDipolePolar[2]*fracToCart[2][0];
        inducedDipolePolar[1] = cinducedDipolePolar[0]*fracToCart[0][1] + cinducedDipolePolar[1]*fracToCart[1][1] + cinducedDipolePolar[2]*fracToCart[2][1];
        inducedDipolePolar[2] = cinducedDipolePolar[0]*fracToCart[0][2] + cinducedDipolePolar[1]*fracToCart[1][2] + cinducedDipolePolar[2]*fracToCart[2][2];
-        const real* phi = &phi_global[20*i];
-        const real* phip = &phip_global[10*i];
-        const real* phid = &phid_global[10*i];
        real4 f = make_real4(0, 0, 0, 0);
-        energy += inducedDipole[0]*phi[1];
+        energy += inducedDipole[0]*phi[i+NUM_ATOMS];
-        energy += inducedDipole[1]*phi[2];
+        energy += inducedDipole[1]*phi[i+NUM_ATOMS*2];
-        energy += inducedDipole[2]*phi[3];
+        energy += inducedDipole[2]*phi[i+NUM_ATOMS*3];
        for (int k = 0; k < 3; k++) {
            int j1 = deriv1[k+1];
            int j2 = deriv2[k+1];
            int j3 = deriv3[k+1];
-            f.x += (inducedDipole[k]+inducedDipolePolar[k])*phi[j1];
+            f.x += (inducedDipole[k]+inducedDipolePolar[k])*phi[i+NUM_ATOMS*j1];
-            f.y += (inducedDipole[k]+inducedDipolePolar[k])*phi[j2];
+            f.y += (inducedDipole[k]+inducedDipolePolar[k])*phi[i+NUM_ATOMS*j2];
-            f.z += (inducedDipole[k]+inducedDipolePolar[k])*phi[j3];
+            f.z += (inducedDipole[k]+inducedDipolePolar[k])*phi[i+NUM_ATOMS*j3];
 #ifndef DIRECT_POLARIZATION
-            f.x += (inducedDipole[k]*phip[j1] + inducedDipolePolar[k]*phid[j1]);
+            f.x += (inducedDipole[k]*phip[i+NUM_ATOMS*j1] + inducedDipolePolar[k]*phid[i+NUM_ATOMS*j1]);
-            f.y += (inducedDipole[k]*phip[j2] + inducedDipolePolar[k]*phid[j2]);
+            f.y += (inducedDipole[k]*phip[i+NUM_ATOMS*j2] + inducedDipolePolar[k]*phid[i+NUM_ATOMS*j2]);
-            f.z += (inducedDipole[k]*phip[j3] + inducedDipolePolar[k]*phid[j3]);
+            f.z += (inducedDipole[k]*phip[i+NUM_ATOMS*j3] + inducedDipolePolar[k]*phid[i+NUM_ATOMS*j3]);
 #endif
        }
-        const real* phidp = &phidp_global[20*i];
        for (int k = 0; k < 10; k++) {
-            f.x += multipole[k]*phidp[deriv1[k]];
+            f.x += multipole[k]*phidp[i+NUM_ATOMS*deriv1[k]];
-            f.y += multipole[k]*phidp[deriv2[k]];
+            f.y += multipole[k]*phidp[i+NUM_ATOMS*deriv2[k]];
-            f.z += multipole[k]*phidp[deriv3[k]];
+            f.z += multipole[k]*phidp[i+NUM_ATOMS*deriv3[k]];
        }
        f = make_real4(0.5f*EPSILON_FACTOR*(f.x*fracToCart[0][0] + f.y*fracToCart[0][1] + f.z*fracToCart[0][2]),
                       0.5f*EPSILON_FACTOR*(f.x*fracToCart[1][0] + f.y*fracToCart[1][1] + f.z*fracToCart[1][2]),
@@ -1078,11 +1089,11 @@ extern "C" __global__ void recordInducedFieldDipoles(const real* __restrict__ ph
    }
    __syncthreads();
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
-        inducedField[i] -= (long long) (0x100000000*(phid[10*i+1]*fracToCart[0][0] + phid[10*i+2]*fracToCart[0][1] + phid[10*i+3]*fracToCart[0][2]));
+        inducedField[i] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[0][0] + phid[i+NUM_ATOMS*2]*fracToCart[0][1] + phid[i+NUM_ATOMS*3]*fracToCart[0][2]));
-        inducedField[i+PADDED_NUM_ATOMS] -= (long long) (0x100000000*(phid[10*i+1]*fracToCart[1][0] + phid[10*i+2]*fracToCart[1][1] + phid[10*i+3]*fracToCart[1][2]));
+        inducedField[i+PADDED_NUM_ATOMS] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[1][0] + phid[i+NUM_ATOMS*2]*fracToCart[1][1] + phid[i+NUM_ATOMS*3]*fracToCart[1][2]));
-        inducedField[i+PADDED_NUM_ATOMS*2] -= (long long) (0x100000000*(phid[10*i+1]*fracToCart[2][0] + phid[10*i+2]*fracToCart[2][1] + phid[10*i+3]*fracToCart[2][2]));
+        inducedField[i+PADDED_NUM_ATOMS*2] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[2][0] + phid[i+NUM_ATOMS*2]*fracToCart[2][1] + phid[i+NUM_ATOMS*3]*fracToCart[2][2]));
-        inducedFieldPolar[i] -= (long long) (0x100000000*(phip[10*i+1]*fracToCart[0][0] + phip[10*i+2]*fracToCart[0][1] + phip[10*i+3]*fracToCart[0][2]));
+        inducedFieldPolar[i] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[0][0] + phip[i+NUM_ATOMS*2]*fracToCart[0][1] + phip[i+NUM_ATOMS*3]*fracToCart[0][2]));
-        inducedFieldPolar[i+PADDED_NUM_ATOMS] -= (long long) (0x100000000*(phip[10*i+1]*fracToCart[1][0] + phip[10*i+2]*fracToCart[1][1] + phip[10*i+3]*fracToCart[1][2]));
+        inducedFieldPolar[i+PADDED_NUM_ATOMS] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[1][0] + phip[i+NUM_ATOMS*2]*fracToCart[1][1] + phip[i+NUM_ATOMS*3]*fracToCart[1][2]));
-        inducedFieldPolar[i+PADDED_NUM_ATOMS*2] -= (long long) (0x100000000*(phip[10*i+1]*fracToCart[2][0] + phip[10*i+2]*fracToCart[2][1] + phip[10*i+3]*fracToCart[2][2]));
+        inducedFieldPolar[i+PADDED_NUM_ATOMS*2] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[2][0] + phip[i+NUM_ATOMS*2]*fracToCart[2][1] + phip[i+NUM_ATOMS*3]*fracToCart[2][2]));
    }
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoles.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoles.cu
 extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4* __restrict__ multipoleParticles, float* __restrict__ molecularDipoles,
-        float* __restrict__ molecularQuadrupoles, real* __restrict__ labFrameDipoles, real* __restrict__ labFrameQuadrupoles) {
+        float* __restrict__ molecularQuadrupoles, real* __restrict__ labFrameDipoles, real* __restrict__ labFrameQuadrupoles,
-    // get coordinates of this atom and the z & x axis atoms
+        real* __restrict__ sphericalDipoles, real* __restrict__ sphericalQuadrupoles) {
-    // compute the vector between the atoms and 1/sqrt(d2), d2 is distance between
-    // this atom and the axis atom
-    // this atom is referred to as the k-atom in notes below
-    // code common to ZThenX and Bisector
    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += gridDim.x*blockDim.x) {
+        // Load the spherical multipoles.
+        int offset = 3*atom;
+        sphericalDipoles[offset+0] = molecularDipoles[offset+2]; // z -> Q_10
+        sphericalDipoles[offset+1] = molecularDipoles[offset+0]; // x -> Q_11c
+        sphericalDipoles[offset+2] = molecularDipoles[offset+1]; // y -> Q_11s
+        offset = 5*atom;
+        sphericalQuadrupoles[offset+0] = -3.0f*(molecularQuadrupoles[offset+0]+molecularQuadrupoles[offset+3]); // zz -> Q_20
+        sphericalQuadrupoles[offset+1] = (2*SQRT((real) 3))*molecularQuadrupoles[offset+2]; // xz -> Q_21c
+        sphericalQuadrupoles[offset+2] = (2*SQRT((real) 3))*molecularQuadrupoles[offset+4]; // yz -> Q_21s
+        sphericalQuadrupoles[offset+3] = SQRT((real) 3)*(molecularQuadrupoles[offset+0]-molecularQuadrupoles[offset+3]); // xx-yy -> Q_22c
+        sphericalQuadrupoles[offset+4] = (2*SQRT((real) 3))*molecularQuadrupoles[offset+1]; // xy -> Q_22s
+        // get coordinates of this atom and the z & x axis atoms
+        // compute the vector between the atoms and 1/sqrt(d2), d2 is distance between
+        // this atom and the axis atom
+        // this atom is referred to as the k-atom in notes below
+        // code common to ZThenX and Bisector
        int4 particles = multipoleParticles[atom];
        if (particles.x >= 0 && particles.z >= 0) {
            real4 thisParticlePos = posq[atom];
@@ -149,7 +163,7 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
            // Transform the dipole
-            unsigned int offset = 3*atom;
+            offset = 3*atom;
            real molDipole[3];
            molDipole[0] = molecularDipoles[offset];
            molDipole[1] = molecularDipoles[offset+1];
@@ -192,6 +206,67 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
            labFrameQuadrupoles[offset+4] = vectorX.y*(vectorX.z*mPoleXX + vectorY.z*mPoleXY + vectorZ.z*mPoleXZ)
                                        + vectorY.y*(vectorX.z*mPoleXY + vectorY.z*mPoleYY + vectorZ.z*mPoleYZ)
                                        + vectorZ.y*(vectorX.z*mPoleXZ + vectorY.z*mPoleYZ + vectorZ.z*mPoleZZ);
+            // ---------------------------------------------------------------------------------------
+            // Now transform the spherical multipoles.  First do the dipoles.
+            offset = 3*atom;
+            real sphericalDipole[3];
+            sphericalDipole[0] = sphericalDipoles[offset];
+            sphericalDipole[1] = sphericalDipoles[offset+1];
+            sphericalDipole[2] = sphericalDipoles[offset+2];
+            if (reverse)
+                sphericalDipole[2] *= -1;
+            sphericalDipoles[offset] = sphericalDipole[0]*vectorZ.z + sphericalDipole[1]*vectorX.z + sphericalDipole[2]*vectorY.z;
+            sphericalDipoles[offset+1] = sphericalDipole[0]*vectorZ.x + sphericalDipole[1]*vectorX.x + sphericalDipole[2]*vectorY.x;
+            sphericalDipoles[offset+2] = sphericalDipole[0]*vectorZ.y + sphericalDipole[1]*vectorX.y + sphericalDipole[2]*vectorY.y;
+            // Now the quadrupoles.
+            offset = 5*atom;
+            real sphericalQuadrupole[5];
+            sphericalQuadrupole[0] = sphericalQuadrupoles[offset];
+            sphericalQuadrupole[1] = sphericalQuadrupoles[offset+1];
+            sphericalQuadrupole[2] = sphericalQuadrupoles[offset+2];
+            sphericalQuadrupole[3] = sphericalQuadrupoles[offset+3];
+            sphericalQuadrupole[4] = sphericalQuadrupoles[offset+4];
+            if (reverse) {
+                sphericalQuadrupole[2] *= -1;
+                sphericalQuadrupole[4] *= -1;
+            }
+            real rotatedQuadrupole[5] = {0, 0, 0, 0, 0};
+            real sqrtThree = SQRT((real) 3);
+            rotatedQuadrupole[0] += sphericalQuadrupole[0]*0.5f*(3.0f*vectorZ.z*vectorZ.z - 1.0f) +
+                                    sphericalQuadrupole[1]*sqrtThree*vectorZ.z*vectorX.z +
+                                    sphericalQuadrupole[2]*sqrtThree*vectorZ.z*vectorY.z +
+                                    sphericalQuadrupole[3]*0.5f*sqrtThree*(vectorX.z*vectorX.z - vectorY.z*vectorY.z) +
+                                    sphericalQuadrupole[4]*sqrtThree*vectorX.z*vectorY.z;
+            rotatedQuadrupole[1] += sphericalQuadrupole[0]*sqrtThree*vectorZ.z*vectorZ.x +
+                                    sphericalQuadrupole[1]*(vectorZ.x*vectorX.z + vectorZ.z*vectorX.x) +
+                                    sphericalQuadrupole[2]*(vectorZ.x*vectorY.z + vectorZ.z*vectorY.x) +
+                                    sphericalQuadrupole[3]*(vectorX.z*vectorX.x - vectorY.z*vectorY.x) +
+                                    sphericalQuadrupole[4]*(vectorX.x*vectorY.z + vectorX.z*vectorY.x);
+            rotatedQuadrupole[2] += sphericalQuadrupole[0]*sqrtThree*vectorZ.z*vectorZ.y +
+                                    sphericalQuadrupole[1]*(vectorZ.y*vectorX.z + vectorZ.z*vectorX.y) +
+                                    sphericalQuadrupole[2]*(vectorZ.y*vectorY.z + vectorZ.z*vectorY.y) +
+                                    sphericalQuadrupole[3]*(vectorX.z*vectorX.y - vectorY.z*vectorY.y) +
+                                    sphericalQuadrupole[4]*(vectorX.y*vectorY.z + vectorX.z*vectorY.y);
+            rotatedQuadrupole[3] += sphericalQuadrupole[0]*0.5f*sqrtThree*(vectorZ.x*vectorZ.x - vectorZ.y*vectorZ.y) +
+                                    sphericalQuadrupole[1]*(vectorZ.x*vectorX.x - vectorZ.y*vectorX.y) +
+                                    sphericalQuadrupole[2]*(vectorZ.x*vectorY.x - vectorZ.y*vectorY.y) +
+                                    sphericalQuadrupole[3]*0.5f*(vectorX.x*vectorX.x - vectorX.y*vectorX.y - vectorY.x*vectorY.x + vectorY.y*vectorY.y) +
+                                    sphericalQuadrupole[4]*(vectorX.x*vectorY.x - vectorX.y*vectorY.y);
+            rotatedQuadrupole[4] += sphericalQuadrupole[0]*sqrtThree*vectorZ.x*vectorZ.y +
+                                    sphericalQuadrupole[1]*(vectorZ.y*vectorX.x + vectorZ.x*vectorX.y) +
+                                    sphericalQuadrupole[2]*(vectorZ.y*vectorY.x + vectorZ.x*vectorY.y) +
+                                    sphericalQuadrupole[3]*(vectorX.x*vectorX.y - vectorY.x*vectorY.y) +
+                                    sphericalQuadrupole[4]*(vectorX.y*vectorY.x + vectorX.x*vectorY.y);
+            sphericalQuadrupoles[offset] = rotatedQuadrupole[0];
+            sphericalQuadrupoles[offset+1] = rotatedQuadrupole[1];
+            sphericalQuadrupoles[offset+2] = rotatedQuadrupole[2];
+            sphericalQuadrupoles[offset+3] = rotatedQuadrupole[3];
+            sphericalQuadrupoles[offset+4] = rotatedQuadrupole[4];
        }
        else {
            labFrameDipoles[3*atom] = molecularDipoles[3*atom];

--- a/plugins/amoeba/platforms/cuda/src/kernels/pmeElectrostaticPairForce.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/pmeElectrostaticPairForce.cu
-__device__ void
-#ifdef APPLY_SCALE
-computeOneInteractionF1(
-#else
-computeOneInteractionF1NoScale(
-#endif
-        AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, real bn5, float forceFactor,
-#ifdef APPLY_SCALE
-        float dScale, float pScale, float mScale,
-#endif
-        real3& force, real& energy) {
-    real xr = delta.x;
-    real yr = delta.y;
-    real zr = delta.z;
-#ifdef APPLY_SCALE
-    real rr1 = delta.w;
-#endif
-    // set the permanent multipole and induced dipole values;
-    real ci = atom1.q;
-    real di1 = atom1.dipole.x;
-    real di2 = atom1.dipole.y;
-    real di3 = atom1.dipole.z;
-    real qi1 = atom1.quadrupoleXX;
-    real qi2 = atom1.quadrupoleXY;
-    real qi3 = atom1.quadrupoleXZ;
-    real qi5 = atom1.quadrupoleYY;
-    real qi6 = atom1.quadrupoleYZ;
-    real qi9 = -(atom1.quadrupoleXX + atom1.quadrupoleYY);
-    real ck = atom2.q;
-    real dk1 = atom2.dipole.x;
-    real dk2 = atom2.dipole.y;
-    real dk3 = atom2.dipole.z;
-    real qk1 = atom2.quadrupoleXX;
-    real qk2 = atom2.quadrupoleXY;
-    real qk3 = atom2.quadrupoleXZ;
-    real qk5 = atom2.quadrupoleYY;
-    real qk6 = atom2.quadrupoleYZ;
-    real qk9 = -(atom2.quadrupoleXX + atom2.quadrupoleYY);
-    real bn1 = bn.x;
-    real bn2 = bn.y;
-    real bn3 = bn.z;
-    real bn4 = bn.w;
-#ifdef APPLY_SCALE
-    real offset = 1-mScale;
-    real rr3 = rr1*rr1*rr1;
-    real gf4 = 2*(bn2 - 3*offset*rr3*rr1*rr1);
-#else
-    real gf4 = 2*bn2;
-#endif
-    real qidk1 = qi1*dk1 + qi2*dk2 + qi3*dk3;
-    real qkdi1 = qk1*di1 + qk2*di2 + qk3*di3;
-    real ftm21 = gf4*(qkdi1-qidk1);
-    real qidk2 = qi2*dk1 + qi5*dk2 + qi6*dk3;
-    real qkdi2 = qk2*di1 + qk5*di2 + qk6*di3;
-    real ftm22 = gf4*(qkdi2-qidk2);
-    real qidk3 = qi3*dk1 + qi6*dk2 + qi9*dk3;
-    real qkdi3 = qk3*di1 + qk6*di2 + qk9*di3;
-    real ftm23 = gf4*(qkdi3-qidk3);
-    real qir1 = qi1*xr + qi2*yr + qi3*zr;
-    real qir2 = qi2*xr + qi5*yr + qi6*zr;
-    real qir3 = qi3*xr + qi6*yr + qi9*zr;
-    real qkr1 = qk1*xr + qk2*yr + qk3*zr;
-    real qkr2 = qk2*xr + qk5*yr + qk6*zr;
-    real qkr3 = qk3*xr + qk6*yr + qk9*zr;
-#ifdef APPLY_SCALE
-    real gf7 = 4*(bn3 - 15*offset*rr3*rr3*rr1);
-#else
-    real gf7 = 4*bn3;
-#endif
-    real qiqkr1 = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
-    real qkqir1 = qk1*qir1 + qk2*qir2 + qk3*qir3;
-    ftm21 += gf7*(qiqkr1+qkqir1);
-    real qiqkr2 = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
-    real qkqir2 = qk2*qir1 + qk5*qir2 + qk6*qir3;
-    ftm22 += gf7*(qiqkr2+qkqir2);
-    real qiqkr3 = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;
-    real qkqir3 = qk3*qir1 + qk6*qir2 + qk9*qir3;
-    ftm23 += gf7*(qiqkr3+qkqir3);
-    // calculate the scalar products for permanent components
-    real gl6 = di1*dk1 + di2*dk2 + di3*dk3;
-    real gl7 =  2*(qir1*dk1 + qir2*dk2 + qir3*dk3 - (qkr1*di1 + qkr2*di2 + qkr3*di3));
-    real gl5 = -4*(qir1*qkr1 + qir2*qkr2 + qir3*qkr3);
-    real gl8 =  2*(qi1*qk1 + qi2*qk2 + qi3*qk3 + qi2*qk2 + qi5*qk5 + qi6*qk6 + qi3*qk3 + qi6*qk6 + qi9*qk9);
-    real sc3 = di1*xr + di2*yr + di3*zr;
-    real sc5 = qir1*xr + qir2*yr + qir3*zr;
-    real sc4 = dk1*xr + dk2*yr + dk3*zr;
-    real sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
-    real gl0 = ci*ck;
-    real gl1 = ck*sc3 - ci*sc4;
-    real gl2 = ci*sc6 + ck*sc5 - sc3*sc4;
-    real gl3 = sc3*sc6 - sc4*sc5;
-    real gl4 = sc5*sc6;
-#ifdef APPLY_SCALE
-    energy += forceFactor*(-offset*rr1*gl0 + (bn1-offset*rr3)*(gl1+gl6) + (bn2-offset*(3*rr3*rr1*rr1))*(gl2+gl7+gl8) + (bn3-offset*(15*rr3*rr3*rr1))*(gl3+gl5) + (bn4-offset*(105*rr3*rr3*rr3))*gl4);
-#else
-    energy += forceFactor*(bn1*(gl1+gl6) + bn2*(gl2+gl7+gl8) + bn3*(gl3+gl5) + bn4*gl4);
-#endif
-    real gf1 = bn1*gl0 + bn2*(gl1+gl6) + bn3*(gl2+gl7+gl8) + bn4*(gl3+gl5) + bn5*gl4;
-#ifdef APPLY_SCALE
-    gf1 -= offset*(rr3*gl0 + (3*rr3*rr1*rr1)*(gl1+gl6) + (15*rr3*rr3*rr1)*(gl2+gl7+gl8) + (105*rr3*rr3*rr3)*(gl3+gl5) + (945*rr3*rr3*rr3*rr1*rr1)*gl4);
-#endif
-    ftm21 += gf1*xr;
-    ftm22 += gf1*yr;
-    ftm23 += gf1*zr;
-#ifdef APPLY_SCALE
-    real gf2 = -ck*bn1 + sc4*bn2 - sc6*bn3 - offset*(-ck*rr3 + sc4*(3*rr3*rr1*rr1) - sc6*(15*rr3*rr3*rr1));
-#else
-    real gf2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
-#endif
-    ftm21 += gf2*di1;
-    ftm22 += gf2*di2;
-    ftm23 += gf2*di3;
-#ifdef APPLY_SCALE
-    real gf5 = 2*(-ck*bn2+sc4*bn3-sc6*bn4 - offset*(-ck*(3*rr3*rr1*rr1)+sc4*(15*rr3*rr3*rr1)-sc6*(105*rr3*rr3*rr3)));
-#else
-    real gf5 = 2*(-ck*bn2+sc4*bn3-sc6*bn4);
-#endif
-    ftm21 += gf5*qir1;
-    ftm22 += gf5*qir2;
-    ftm23 += gf5*qir3;
-#ifdef APPLY_SCALE
-    real gf3 = ci*bn1 + sc3*bn2 + sc5*bn3 - offset*(ci*rr3 + sc3*(3*rr3*rr1*rr1) + sc5*(15*rr3*rr3*rr1));
-#else
-    real gf3 = ci*bn1 + sc3*bn2 + sc5*bn3;
-#endif
-    ftm21 += gf3*dk1;
-    ftm22 += gf3*dk2;
-    ftm23 += gf3*dk3;
-#ifdef APPLY_SCALE
-    real gf6 = 2*(-ci*bn2-sc3*bn3-sc5*bn4 - offset*(-ci*(3*rr3*rr1*rr1)-sc3*(15*rr3*rr3*rr1)-sc5*(105*rr3*rr3*rr3)));
-#else
-    real gf6 = 2*(-ci*bn2-sc3*bn3-sc5*bn4);
-#endif
-    ftm21 += gf6*qkr1;
-    ftm22 += gf6*qkr2;
-    ftm23 += gf6*qkr3;
-    force.x = ftm21;
-    force.y = ftm22;
-    force.z = ftm23;
-}
-__device__ void
-#ifdef APPLY_SCALE
-computeOneInteractionF2(
-#else
-computeOneInteractionF2NoScale(
-#endif
-        AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, float forceFactor,
-#ifdef APPLY_SCALE
-        float dScale, float pScale, float mScale,
-#endif
-        real3& force, real& energy) {
-    const float uScale = 1;
-    real xr = delta.x;
-    real yr = delta.y;
-    real zr = delta.z;
-    real rr1 = delta.w;
-    // set the permanent multipole and induced dipole values;
-    real ci = atom1.q;
-    real di1 = atom1.dipole.x;
-    real di2 = atom1.dipole.y;
-    real di3 = atom1.dipole.z;
-    real qi1 = atom1.quadrupoleXX;
-    real qi2 = atom1.quadrupoleXY;
-    real qi3 = atom1.quadrupoleXZ;
-    real qi5 = atom1.quadrupoleYY;
-    real qi6 = atom1.quadrupoleYZ;
-    real qi9 = -(atom1.quadrupoleXX + atom1.quadrupoleYY);
-    real bn1 = bn.x;
-    real bn2 = bn.y;
-    real bn3 = bn.z;
-    real bn4 = bn.w;
-    real damp = atom1.damp*atom2.damp;
-    if (damp != 0) {
-        real pgamma = atom1.thole < atom2.thole ? atom1.thole : atom2.thole;
-        real ratio = RECIP(rr1*damp);
-        damp = -pgamma*ratio*ratio*ratio;
-    }
-    real scale5 = (damp == 0) ? 1 : (1 - (1-damp)*EXP(damp));
-    real rr5 = rr1*rr1;
-          rr5 = 3*rr1*rr5*rr5;
-#ifdef APPLY_SCALE
-    real psc5 = rr5*(1 - scale5*pScale);
-    real dsc5 = rr5*(1 - scale5*dScale);
-    real usc5 = rr5*(1 - scale5*uScale);
-#else
-    real psc5 = rr5*(1 - scale5);
-#endif
-    real qiuk1 = qi1*atom2.inducedDipole.x + qi2*atom2.inducedDipole.y + qi3*atom2.inducedDipole.z;
-    real qiukp1 = qi1*atom2.inducedDipolePolar.x + qi2*atom2.inducedDipolePolar.y + qi3*atom2.inducedDipolePolar.z;
-    real ftm21 = -bn2*(qiuk1+qiukp1);
-#ifdef APPLY_SCALE
-          ftm21 += qiuk1*psc5 + qiukp1*dsc5;
-#else
-          ftm21 += (qiuk1 + qiukp1)*psc5;
-#endif
-    real qiuk2 = qi2*atom2.inducedDipole.x + qi5*atom2.inducedDipole.y + qi6*atom2.inducedDipole.z;
-    real qiukp2 = qi2*atom2.inducedDipolePolar.x + qi5*atom2.inducedDipolePolar.y + qi6*atom2.inducedDipolePolar.z;
-    real ftm22 = -bn2*(qiuk2+qiukp2);
-#ifdef APPLY_SCALE
-          ftm22 += ((qiuk2)*psc5 + (qiukp2)*dsc5);
-#else
-          ftm22 += (qiuk2 + qiukp2)*psc5;
-#endif
-    real qiuk3 = qi3*atom2.inducedDipole.x + qi6*atom2.inducedDipole.y + qi9*atom2.inducedDipole.z;
-    real qiukp3 = qi3*atom2.inducedDipolePolar.x + qi6*atom2.inducedDipolePolar.y + qi9*atom2.inducedDipolePolar.z;
-    real ftm23 = -bn2*(qiuk3+qiukp3);
-#ifdef APPLY_SCALE
-          ftm23 += ((qiuk3)*psc5 + (qiukp3)*dsc5);
-#else
-          ftm23 += (qiuk3 + qiukp3)*psc5;
-#endif
-    real expdamp = EXP(damp);
-    real scale3 = (damp == 0) ? 1 : (1 - expdamp);
-    real rr3 = rr1*rr1*rr1;
-#ifdef APPLY_SCALE
-    real psc3 = rr3*(1 - scale3*pScale);
-    real dsc3 = rr3*(1 - scale3*dScale);
-    real usc3 = rr3*(1 - scale3*uScale);
-#else
-    real psc3 = rr3*(1 - scale3);
-#endif
-    real scale7 = (damp == 0) ? 1 : (1 - (1-damp+0.6f*damp*damp)*expdamp);
-#ifdef APPLY_SCALE
-    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7*pScale);
-    real dsc7 = (15*rr3*rr3*rr1)*(1 - scale7*dScale);
-#else
-    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7);
-#endif
-    real qir1 = qi1*xr + qi2*yr + qi3*zr;
-    real qir2 = qi2*xr + qi5*yr + qi6*zr;
-    real qir3 = qi3*xr + qi6*yr + qi9*zr;
-    real sc3 = di1*xr + di2*yr + di3*zr;
-    real sc5 = qir1*xr + qir2*yr + qir3*zr;
-    real gfi3 = ci*bn1 + sc3*bn2 + sc5*bn3;
-    real prefactor1;
-    prefactor1 = 0.5f*(ci*psc3 + sc3*psc5 + sc5*psc7 - gfi3);
-    ftm21 -= prefactor1*atom2.inducedDipole.x;
-    ftm22 -= prefactor1*atom2.inducedDipole.y;
-    ftm23 -= prefactor1*atom2.inducedDipole.z;
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(ci*dsc3 + sc3*dsc5 + sc5*dsc7 - gfi3);
-#endif
-    ftm21 -= prefactor1*atom2.inducedDipolePolar.x;
-    ftm22 -= prefactor1*atom2.inducedDipolePolar.y;
-    ftm23 -= prefactor1*atom2.inducedDipolePolar.z;
-    real sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
-    energy += forceFactor*0.5f*sci4*((psc3-bn1)*ci + (psc5-bn2)*sc3 + (psc7-bn3)*sc5);
-    real scip4 = atom2.inducedDipolePolar.x*xr + atom2.inducedDipolePolar.y*yr + atom2.inducedDipolePolar.z*zr;
-#ifndef DIRECT_POLARIZATION
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(bn2 - usc5);
-#else
-    prefactor1 = 0.5f*(bn2 - psc5);
-#endif
-    ftm21 += prefactor1*((sci4*atom1.inducedDipolePolar.x + scip4*atom1.inducedDipole.x));
-    ftm22 += prefactor1*((sci4*atom1.inducedDipolePolar.y + scip4*atom1.inducedDipole.y));
-    ftm23 += prefactor1*((sci4*atom1.inducedDipolePolar.z + scip4*atom1.inducedDipole.z));
-#endif
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(bn2*(sci4+scip4) - (sci4*psc5+scip4*dsc5)); 
-#else
-    sci4 += scip4;
-    prefactor1 = 0.5f*sci4*(bn2 - psc5); 
-#endif
-    ftm21 += prefactor1*di1;
-    ftm22 += prefactor1*di2;
-    ftm23 += prefactor1*di3;
-#ifdef APPLY_SCALE
-    real gfi5 = bn3*(sci4+scip4) - (sci4*psc7+scip4*dsc7);
-#else
-    real gfi5 = sci4*(bn3 - psc7);
-#endif
-    ftm21 += gfi5*qir1;
-    ftm22 += gfi5*qir2;
-    ftm23 += gfi5*qir3;
-    real sci7 = qir1*atom2.inducedDipole.x + qir2*atom2.inducedDipole.y + qir3*atom2.inducedDipole.z;
-    energy += forceFactor*(bn2-psc5)*sci7;
-    real scip7 = qir1*atom2.inducedDipolePolar.x + qir2*atom2.inducedDipolePolar.y + qir3*atom2.inducedDipolePolar.z;
-#ifdef APPLY_SCALE
-    real gli1 = -ci*sci4;
-    real gli2 = -sc3*sci4 + 2*sci7;
-    real gli3 = -sci4*sc5;
-    real glip1 = -ci*scip4;
-    real glip2 = -sc3*scip4 + 2*scip7;
-    real glip3 = -scip4*sc5;
-#else
-    real gli1 = -ci*sci4;
-    real gli2 = -sc3*sci4 + 2*(sci7 + scip7);
-    real gli3 = -sci4*sc5;
-#endif
-#ifdef APPLY_SCALE
-    real gfi1 = (bn2*(gli1+glip1) + bn3*(gli2+glip2) + bn4*(gli3+glip3));
-    gfi1 -= (rr1*rr1)*(3*(gli1*psc3 + glip1*dsc3) + 5*(gli2*psc5 + glip2*dsc5) + 7*(gli3*psc7+glip3*dsc7));
-#else
-    real gfi1 = bn2*gli1 + bn3*gli2 + bn4*gli3;
-    gfi1 -= (rr1*rr1)*(3*gli1*psc3 + 5*gli2*psc5 + 7*gli3*psc7);
-#endif
-    gfi1 *= 0.5f;
-    ftm21 += gfi1*xr;
-    ftm22 += gfi1*yr;
-    ftm23 += gfi1*zr;
-    {
-        real expdamp = EXP(damp);
-        real temp3 = -1.5f*damp*expdamp*rr1*rr1;
-        real temp5 = -damp;
-        real temp7 = -0.2f - 0.6f*damp;
-        real ddsc31 = temp3*xr;
-        real ddsc32 = temp3*yr;
-        real ddsc33 = temp3*zr;
-        real ddsc51 = temp5*ddsc31;
-        real ddsc52 = temp5*ddsc32;
-        real ddsc53 = temp5*ddsc33;
-        real ddsc71 = temp7*ddsc51;
-        real ddsc72 = temp7*ddsc52;
-        real ddsc73 = temp7*ddsc53;
-        real rr3 = rr1*rr1*rr1;
-#ifdef APPLY_SCALE
-        temp3 = (gli1*pScale + glip1*dScale);
-        temp5 = (3*rr1*rr1)*(gli2*pScale + glip2*dScale);
-        temp7 = (15*rr3*rr1)*(gli3*pScale + glip3*dScale);
-#else
-        temp3 = gli1;
-        temp5 = (3*rr1*rr1)*gli2;
-        temp7 = (15*rr3*rr1)*gli3;
-#endif
-        ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51 + temp7*ddsc71);
-        ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52 + temp7*ddsc72);
-        ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53 + temp7*ddsc73);
-    }
-//K
-    real qk1 = atom2.quadrupoleXX;
-    real qk2 = atom2.quadrupoleXY;
-    real qk3 = atom2.quadrupoleXZ;
-    real qk5 = atom2.quadrupoleYY;
-    real qk6 = atom2.quadrupoleYZ;
-    real qk9 = -(qk1 + qk5);
-    real qkui1 = qk1*atom1.inducedDipole.x + qk2*atom1.inducedDipole.y + qk3*atom1.inducedDipole.z;
-    real qkuip1 = qk1*atom1.inducedDipolePolar.x + qk2*atom1.inducedDipolePolar.y + qk3*atom1.inducedDipolePolar.z;
-          ftm21 += bn2*(qkui1+qkuip1);
-#ifdef APPLY_SCALE
-          ftm21 -= (qkui1*psc5 + qkuip1*dsc5);
-#else
-          ftm21 -= (qkui1 + qkuip1)*psc5;
-#endif
-    real qkui2 = qk2*atom1.inducedDipole.x + qk5*atom1.inducedDipole.y + qk6*atom1.inducedDipole.z;
-    real qkuip2 = qk2*atom1.inducedDipolePolar.x + qk5*atom1.inducedDipolePolar.y + qk6*atom1.inducedDipolePolar.z;
-          ftm22 += bn2*(qkui2+qkuip2);
-#ifdef APPLY_SCALE
-          ftm22 -= ((qkui2)*psc5 + (qkuip2)*dsc5);
-#else
-          ftm22 -= (qkui2 + qkuip2)*psc5;
-#endif
-    real qkui3 = qk3*atom1.inducedDipole.x + qk6*atom1.inducedDipole.y + qk9*atom1.inducedDipole.z;
-    real qkuip3 = qk3*atom1.inducedDipolePolar.x + qk6*atom1.inducedDipolePolar.y + qk9*atom1.inducedDipolePolar.z;
-          ftm23 += bn2*(qkui3+qkuip3);
-#ifdef APPLY_SCALE
-          ftm23 -= ((qkui3)*psc5 + (qkuip3)*dsc5);
-#else
-          ftm23 -= (qkui3 + qkuip3)*psc5;
-#endif
-    real qkr1 = qk1*xr + qk2*yr + qk3*zr;
-    real qkr2 = qk2*xr + qk5*yr + qk6*zr;
-    real qkr3 = qk3*xr + qk6*yr + qk9*zr;
-    real dk1 = atom2.dipole.x;
-    real dk2 = atom2.dipole.y;
-    real dk3 = atom2.dipole.z;
-    real sc4 =  dk1*xr +  dk2*yr +  dk3*zr;
-    real sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
-    real ck = atom2.q;
-    real gfi2 = (-ck*bn1 + sc4*bn2 - sc6*bn3);
-    prefactor1 = 0.5f*(ck*psc3 - sc4*psc5 + sc6*psc7 + gfi2);
-    ftm21 += prefactor1*atom1.inducedDipole.x;
-    ftm22 += prefactor1*atom1.inducedDipole.y;
-    ftm23 += prefactor1*atom1.inducedDipole.z;
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(ck*dsc3 - sc4*dsc5 + sc6*dsc7 + gfi2);
-#endif
-    ftm21 += prefactor1*atom1.inducedDipolePolar.x;
-    ftm22 += prefactor1*atom1.inducedDipolePolar.y;
-    ftm23 += prefactor1*atom1.inducedDipolePolar.z;
-    real sci3 = atom1.inducedDipole.x*xr + atom1.inducedDipole.y*yr + atom1.inducedDipole.z*zr;
-    energy += forceFactor*0.5f*sci3*(ck*(bn1-psc3) - sc4*(bn2-psc5) + sc6*(bn3-psc7));
-    real scip3 = atom1.inducedDipolePolar.x*xr + atom1.inducedDipolePolar.y*yr + atom1.inducedDipolePolar.z*zr;
-#ifndef DIRECT_POLARIZATION
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(bn2 - usc5);
-#else
-    prefactor1 = 0.5f*(bn2 - psc5);
-#endif
-    ftm21 += prefactor1*(sci3*atom2.inducedDipolePolar.x + scip3*atom2.inducedDipole.x);
-    ftm22 += prefactor1*(sci3*atom2.inducedDipolePolar.y + scip3*atom2.inducedDipole.y);
-    ftm23 += prefactor1*(sci3*atom2.inducedDipolePolar.z + scip3*atom2.inducedDipole.z);
-    real sci34;
-    sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
-    scip4 = atom2.inducedDipolePolar.x*xr + atom2.inducedDipolePolar.y*yr + atom2.inducedDipolePolar.z*zr;
-    sci34 = (sci3*scip4+scip3*sci4);
-#ifdef APPLY_SCALE
-    gfi1 = sci34*(usc5*(5*rr1*rr1) -bn3);
-#else
-    gfi1 = sci34*(psc5*(5*rr1*rr1) -bn3);
-#endif
-#else
-    gfi1 = 0;
-#endif
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(bn2*(sci3+scip3) - (sci3*psc5+scip3*dsc5));
-#else
-    sci3 += scip3;
-    prefactor1 = 0.5f*sci3*(bn2 - psc5);
-#endif
-    ftm21 += prefactor1*dk1;
-    ftm22 += prefactor1*dk2;
-    ftm23 += prefactor1*dk3;
-#ifdef APPLY_SCALE
-    real gfi6 = -bn3*(sci3+scip3) + (sci3*psc7+scip3*dsc7);
-#else
-    real gfi6 = sci3*(psc7 - bn3);
-#endif
-    ftm21 += gfi6*qkr1;
-    ftm22 += gfi6*qkr2;
-    ftm23 += gfi6*qkr3;
-    real sci1 = atom1.inducedDipole.x*dk1 + atom1.inducedDipole.y*dk2 + atom1.inducedDipole.z*dk3 + di1*atom2.inducedDipole.x + di2*atom2.inducedDipole.y + di3*atom2.inducedDipole.z;
-    energy += forceFactor*0.5f*(sci1*(bn1-psc3));
-    real sci8 = qkr1*atom1.inducedDipole.x + qkr2*atom1.inducedDipole.y + qkr3*atom1.inducedDipole.z;
-    energy -= forceFactor*sci8*(bn2-psc5);
-    real scip1 = atom1.inducedDipolePolar.x*dk1 + atom1.inducedDipolePolar.y*dk2 + atom1.inducedDipolePolar.z*dk3 + di1*atom2.inducedDipolePolar.x + di2*atom2.inducedDipolePolar.y + di3*atom2.inducedDipolePolar.z;
-#ifndef APPLY_SCALE
-        sci1 += scip1;
-#endif
-    real scip2 = atom1.inducedDipole.x*atom2.inducedDipolePolar.x +
-                                  atom1.inducedDipole.y*atom2.inducedDipolePolar.y +
-                                  atom1.inducedDipole.z*atom2.inducedDipolePolar.z +
-                                  atom2.inducedDipole.x*atom1.inducedDipolePolar.x +
-                                  atom2.inducedDipole.y*atom1.inducedDipolePolar.y +
-                                  atom2.inducedDipole.z*atom1.inducedDipolePolar.z;
-    real scip8 = qkr1*atom1.inducedDipolePolar.x + qkr2*atom1.inducedDipolePolar.y + qkr3*atom1.inducedDipolePolar.z;
-#ifndef APPLY_SCALE
-          sci8 += scip8;
-#endif
-           gli1 = ck*sci3 + sci1;
-           gli2 = -(sci3*sc4 + 2*sci8);
-           gli3 = sci3*sc6;
-#ifdef APPLY_SCALE
-          glip1 = ck*scip3 + scip1;
-          glip2 = -(scip3*sc4 + 2*scip8);
-          glip3 = scip3*sc6;
-#endif
-#ifdef APPLY_SCALE
-    gfi1 += (bn2*(gli1+glip1) + bn3*(gli2+glip2) + bn4*(gli3+glip3));
-    gfi1 -= (rr1*rr1)*(3*(gli1*psc3 + glip1*dsc3) + 5*(gli2*psc5 + glip2*dsc5) + 7*(gli3*psc7+glip3*dsc7));
-#else
-    gfi1 += (bn2*gli1 + bn3*gli2 + bn4*gli3);
-    gfi1 -= (rr1*rr1)*(3*gli1*psc3 + 5*gli2*psc5 + 7*gli3*psc7);
-#endif
-#ifndef DIRECT_POLARIZATION
-#ifdef APPLY_SCALE
-    gfi1 += scip2*(bn2 - (3*rr1*rr1)*usc3);
-#else
-    gfi1 += scip2*(bn2 - (3*rr1*rr1)*psc3);
-#endif
-#endif
-    gfi1 *= 0.5f;
-    ftm21 += gfi1*xr;
-    ftm22 += gfi1*yr;
-    ftm23 += gfi1*zr;
-    {
-        real expdamp = EXP(damp);
-        real temp3 = -1.5f*damp*expdamp*rr1*rr1;
-        real temp5 = -damp;
-        real temp7 = -0.2f - 0.6f*damp;
-        real ddsc31 = temp3*xr;
-        real ddsc32 = temp3*yr;
-        real ddsc33 = temp3*zr;
-        real ddsc51 = temp5*ddsc31;
-        real ddsc52 = temp5*ddsc32;
-        real ddsc53 = temp5*ddsc33;
-        real ddsc71 = temp7*ddsc51;
-        real ddsc72 = temp7*ddsc52;
-        real ddsc73 = temp7*ddsc53;
-        real rr3 = rr1*rr1*rr1;
-#ifdef APPLY_SCALE
-        temp3 = gli1*pScale + glip1*dScale;
-        temp5 = (3*rr1*rr1)*(gli2*pScale + glip2*dScale);
-        temp7 = (15*rr3*rr1)*(gli3*pScale + glip3*dScale);
-#else
-        temp3 = gli1;
-        temp5 = (3*rr1*rr1)*gli2;
-        temp7 = (15*rr3*rr1)*(gli3);
-#endif
-        ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51 + temp7*ddsc71);
-        ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52 + temp7*ddsc72);
-        ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53 + temp7*ddsc73);
-#ifndef DIRECT_POLARIZATION
-#ifdef APPLY_SCALE
-        temp3 =  uScale*scip2;
-        temp5 = -(3*rr1*rr1)*uScale*sci34;
-#else
-        temp3 =  scip2;
-        temp5 = -(3*rr1*rr1)*sci34;
-#endif
-        ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51);
-        ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52);
-        ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53);
-#endif
-    }
-    force.x += ftm21;
-    force.y += ftm22;
-    force.z += ftm23;
-}
-__device__ void
-#ifdef APPLY_SCALE
-computeOneInteractionT1(
-#else
-computeOneInteractionT1NoScale(
-#endif
-        AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn
-#ifdef APPLY_SCALE
-        , float dScale, float pScale, float mScale
-#endif
-        ) {
-    real xr = delta.x;
-    real yr = delta.y;
-    real zr = delta.z;
-#ifdef APPLY_SCALE
-    real rr1 = delta.w;
-#endif
-    // set the permanent multipole and induced dipole values;
-    real di1 = atom1.dipole.x;
-    real di2 = atom1.dipole.y;
-    real di3 = atom1.dipole.z;
-    real qi1 = atom1.quadrupoleXX;
-    real qi2 = atom1.quadrupoleXY;
-    real qi3 = atom1.quadrupoleXZ;
-    real qi5 = atom1.quadrupoleYY;
-    real qi6 = atom1.quadrupoleYZ;
-    //real qi9 = atom1.labFrameQuadrupole[5];
-    real qi9 = -(atom1.quadrupoleXX + atom1.quadrupoleYY);
-    real ck = atom2.q;
-    real dk1 = atom2.dipole.x;
-    real dk2 = atom2.dipole.y;
-    real dk3 = atom2.dipole.z;
-    real qk1 = atom2.quadrupoleXX;
-    real qk2 = atom2.quadrupoleXY;
-    real qk3 = atom2.quadrupoleXZ;
-    real qk5 = atom2.quadrupoleYY;
-    real qk6 = atom2.quadrupoleYZ;
-    //real qk9 = atom2.labFrameQuadrupole[5];
-    real qk9 = -(atom2.quadrupoleXX + atom2.quadrupoleYY);
-    real bn1 = bn.x;
-    real bn2 = bn.y;
-    real bn3 = bn.z;
-    real bn4 = bn.w;
-    // apply Thole polarization damping to scale factors
-#ifdef APPLY_SCALE
-    real rr2 = rr1*rr1;
-    real rr3 = rr1*rr2;
-    real rr5 = 3*rr3*rr2;
-    real rr7 = 5*rr5*rr2;
-    real rr9 = 7*rr7*rr2;
-    real scale = 1-mScale;
-    real prefactor = scale*rr3 - bn1;
-#else
-    real prefactor = -bn1;
-#endif
-    real dixdk1 = di2*dk3 - di3*dk2;
-    real ttm21 = prefactor*dixdk1;
-    real dixdk2 = di3*dk1 - di1*dk3;
-    real ttm22 = prefactor*dixdk2;
-    real dixdk3 = di1*dk2 - di2*dk1;
-    real ttm23 = prefactor*dixdk3;
-    real qir1 = qi1*xr + qi2*yr + qi3*zr;
-    real qir2 = qi2*xr + qi5*yr + qi6*zr;
-    real qir3 = qi3*xr + qi6*yr + qi9*zr;
-    real qkr1 = qk1*xr + qk2*yr + qk3*zr;
-    real qkr2 = qk2*xr + qk5*yr + qk6*zr;
-    real qkr3 = qk3*xr + qk6*yr + qk9*zr;
-    real qiqkr1 = qi1*qkr1 + qi2*qkr2 + qi3*qkr3;
-    real qiqkr2 = qi2*qkr1 + qi5*qkr2 + qi6*qkr3;
-    real qiqkr3 = qi3*qkr1 + qi6*qkr2 + qi9*qkr3;
-    real rxqikr1 = yr*qiqkr3 - zr*qiqkr2;
-    real qkrxqir1 = qkr2*qir3 - qkr3*qir2;
-#ifdef APPLY_SCALE
-    prefactor = 4*(bn3 - scale*rr7);
-#else
-    prefactor = 4*bn3;
-#endif
-    ttm21 -= prefactor*(rxqikr1+qkrxqir1);
-    real rxqikr2 = zr*qiqkr1 - xr*qiqkr3;
-    real qkrxqir2 = qkr3*qir1 - qkr1*qir3;
-    ttm22 -= prefactor*(rxqikr2+qkrxqir2);
-    real rxqikr3 = xr*qiqkr2 - yr*qiqkr1;
-    real qkrxqir3 = qkr1*qir2 - qkr2*qir1;
-    ttm23 -= prefactor*(rxqikr3+qkrxqir3);
-    real qidk1 = qi1*dk1 + qi2*dk2 + qi3*dk3;
-    real qidk2 = qi2*dk1 + qi5*dk2 + qi6*dk3;
-    real qidk3 = qi3*dk1 + qi6*dk2 + qi9*dk3;
-    real dixqkr1 = di2*qkr3 - di3*qkr2;
-    real dkxqir1 = dk2*qir3 - dk3*qir2;
-    real rxqidk1 = yr*qidk3 - zr*qidk2;
-    real qixqk1 = qi2*qk3 + qi5*qk6 + qi6*qk9 - qi3*qk2 - qi6*qk5 - qi9*qk6;
-#ifdef APPLY_SCALE
-    prefactor = 2*(bn2 - scale*rr5);
-#else
-    prefactor = 2*bn2;
-#endif
-    ttm21 += prefactor*(dixqkr1+dkxqir1+rxqidk1-2*qixqk1);
-    real dixqkr2 = di3*qkr1 - di1*qkr3;
-    real dkxqir2 = dk3*qir1 - dk1*qir3;
-    real rxqidk2 = zr*qidk1 - xr*qidk3;
-    real qixqk2 = qi3*qk1 + qi6*qk2 + qi9*qk3 - qi1*qk3 - qi2*qk6 - qi3*qk9;
-    ttm22 += prefactor*(dixqkr2+dkxqir2+rxqidk2-2*qixqk2);
-    real dixqkr3 = di1*qkr2 - di2*qkr1;
-    real dkxqir3 = dk1*qir2 - dk2*qir1;
-    real rxqidk3 = xr*qidk2 - yr*qidk1;
-    real qixqk3 = qi1*qk2 + qi2*qk5 + qi3*qk6 - qi2*qk1 - qi5*qk2 - qi6*qk3;
-    ttm23 += prefactor*(dixqkr3+dkxqir3+rxqidk3-2*qixqk3);
-    real sc4 = dk1*xr + dk2*yr + dk3*zr;
-    real sc6 = qkr1*xr + qkr2*yr + qkr3*zr;
-    real gf2 = -ck*bn1 + sc4*bn2 - sc6*bn3;
-#ifdef APPLY_SCALE
-    real gfr2 = -ck*rr3 + sc4*rr5 - sc6*rr7;
-    prefactor = (gf2 - scale*gfr2);
-#else
-    prefactor = gf2;
-#endif
-    ttm21 += prefactor*(di2*zr - di3*yr);
-    ttm22 += prefactor*(di3*xr - di1*zr);
-    ttm23 += prefactor*(di1*yr - di2*xr);
-    real gf5 = (-ck*bn2+sc4*bn3-sc6*bn4);
-#ifdef APPLY_SCALE
-    real gfr5 = (-ck*rr5+sc4*rr7-sc6*rr9); 
-    prefactor = 2*(gf5 - scale*gfr5);
-#else
-    prefactor = 2*gf5;
-#endif
-    real rxqir1 = yr*qir3 - zr*qir2;
-    real rxqir2 = zr*qir1 - xr*qir3;
-    real rxqir3 = xr*qir2 - yr*qir1;
-    ttm21 -= prefactor*rxqir1; 
-    ttm22 -= prefactor*rxqir2;
-    ttm23 -= prefactor*rxqir3;
-    atom1.torque.x += ttm21;
-    atom1.torque.y += ttm22;
-    atom1.torque.z += ttm23;
-}
-__device__ void
-#ifdef APPLY_SCALE
-computeOneInteractionT2(
-#else
-computeOneInteractionT2NoScale(
-#endif
-        AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn
-#ifdef APPLY_SCALE
-        , float dScale, float pScale, float mScale
-#endif
-        ) {
-    real xr = delta.x;
-    real yr = delta.y;
-    real zr = delta.z;
-    real rr1 = delta.w;
-    // set the permanent multipole and induced dipole values;
-    real di1 = atom1.dipole.x;
-    real di2 = atom1.dipole.y;
-    real di3 = atom1.dipole.z;
-    real qi1 = atom1.quadrupoleXX;
-    real qi2 = atom1.quadrupoleXY;
-    real qi3 = atom1.quadrupoleXZ;
-    real qi5 = atom1.quadrupoleYY;
-    real qi6 = atom1.quadrupoleYZ;
-    real qi9 = -(atom1.quadrupoleXX + atom1.quadrupoleYY);
-    real bn1 = bn.x;
-    real bn2 = bn.y;
-    real bn3 = bn.z;
-    // apply Thole polarization damping to scale factors
-    real scale3 = 1;
-    real scale5 = 1;
-    real scale7 = 1;
-    real damp = atom1.damp*atom2.damp;
-    if (damp != 0) {
-        real pgamma = atom1.thole < atom2.thole ? atom1.thole : atom2.thole;
-        real ratio = RECIP(rr1*damp);
-        damp = -pgamma*ratio*ratio*ratio;
-        real expdamp = EXP(damp);
-        scale3 = 1 - expdamp;
-        scale5 = 1 - (1-damp)*expdamp;
-        scale7 = 1 - (1-damp+0.6f*damp*damp)*expdamp;
-    }
-    real rr3 = rr1*rr1*rr1;
-#ifdef APPLY_SCALE
-    real dsc3 = rr3*(1 - scale3*dScale);
-    real dsc5 = (3*rr3*rr1*rr1)* (1 - scale5*dScale);
-    real dsc7 = (15*rr3*rr3*rr1)*(1 - scale7*dScale);
-    real psc3 = rr3*(1 - scale3*pScale);
-    real psc5 = (3*rr3*rr1*rr1)*(1 - scale5*pScale);
-    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7*pScale);
-#else
-    real psc3 = rr3*(1 - scale3);
-    real psc5 = (3*rr3*rr1*rr1)*(1 - scale5);
-    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7);
-#endif
-    real prefactor1 = 0.5f*(psc3 - bn1);
-#ifdef APPLY_SCALE
-    real prefactor2 = 0.5f*(dsc3 - bn1);
-#endif
-    real dixuk1 = di2*atom2.inducedDipole.z - di3*atom2.inducedDipole.y;
-    real dixukp1 = di2*atom2.inducedDipolePolar.z - di3*atom2.inducedDipolePolar.y;
-#ifdef APPLY_SCALE
-    real ttm2i1 = prefactor1*dixuk1 + prefactor2*dixukp1;
-#else
-    real ttm2i1 = prefactor1*(dixuk1 + dixukp1);
-#endif
-    real dixuk2 = di3*atom2.inducedDipole.x - di1*atom2.inducedDipole.z;
-    real dixukp2 = di3*atom2.inducedDipolePolar.x - di1*atom2.inducedDipolePolar.z;
-#ifdef APPLY_SCALE
-    real ttm2i2 = prefactor1*dixuk2 + prefactor2*dixukp2;
-#else
-    real ttm2i2 = prefactor1*(dixuk2 + dixukp2);
-#endif
-    real dixuk3 = di1*atom2.inducedDipole.y - di2*atom2.inducedDipole.x;
-    real dixukp3 = di1*atom2.inducedDipolePolar.y - di2*atom2.inducedDipolePolar.x;
-#ifdef APPLY_SCALE
-    real ttm2i3 = prefactor1*dixuk3 + prefactor2*dixukp3;
-#else
-    real ttm2i3 = prefactor1*(dixuk3 + dixukp3);
-#endif
-    real sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
-    real scip4 = atom2.inducedDipolePolar.x*xr + atom2.inducedDipolePolar.y*yr + atom2.inducedDipolePolar.z*zr;
-    real gti2 = bn2*(sci4+scip4);
-#ifdef APPLY_SCALE
-    real gtri2 = (sci4*psc5+scip4*dsc5);
-#else
-    real gtri2 = psc5*(sci4+scip4);
-#endif
-    prefactor1 = 0.5f*(gti2 - gtri2);
-    ttm2i1 += prefactor1*(di2*zr - di3*yr);
-    ttm2i2 += prefactor1*(di3*xr - di1*zr);
-    ttm2i3 += prefactor1*(di1*yr - di2*xr);
-    real qir1 = qi1*xr + qi2*yr + qi3*zr;
-    real qir2 = qi2*xr + qi5*yr + qi6*zr;
-    real qir3 = qi3*xr + qi6*yr + qi9*zr;
-#ifdef APPLY_SCALE
-    prefactor1 = sci4*psc7 + scip4*dsc7 - bn3*(sci4+scip4);
-#else
-    prefactor1 = psc7*(sci4+scip4) - bn3*(sci4+scip4);
-#endif
-    ttm2i1 += prefactor1*(yr*qir3 - zr*qir2);
-    ttm2i2 += prefactor1*(zr*qir1 - xr*qir3);
-    ttm2i3 += prefactor1*(xr*qir2 - yr*qir1);
-    real qiuk1 = qi1*atom2.inducedDipole.x + qi2*atom2.inducedDipole.y + qi3*atom2.inducedDipole.z;
-    real qiuk2 = qi2*atom2.inducedDipole.x + qi5*atom2.inducedDipole.y + qi6*atom2.inducedDipole.z;
-    real qiuk3 = qi3*atom2.inducedDipole.x + qi6*atom2.inducedDipole.y + qi9*atom2.inducedDipole.z;
-    real qiukp1 = qi1*atom2.inducedDipolePolar.x + qi2*atom2.inducedDipolePolar.y + qi3*atom2.inducedDipolePolar.z;
-    real qiukp2 = qi2*atom2.inducedDipolePolar.x + qi5*atom2.inducedDipolePolar.y + qi6*atom2.inducedDipolePolar.z;
-    real qiukp3 = qi3*atom2.inducedDipolePolar.x + qi6*atom2.inducedDipolePolar.y + qi9*atom2.inducedDipolePolar.z;
-    prefactor1 = (bn2 - psc5);
-#ifdef APPLY_SCALE
-    prefactor2 = (bn2 - dsc5);
-#endif
-    real ukxqir1 = atom2.inducedDipole.y*qir3 - atom2.inducedDipole.z*qir2;
-    real ukxqirp1 = atom2.inducedDipolePolar.y*qir3 - atom2.inducedDipolePolar.z*qir2;
-    real rxqiuk1 = yr*qiuk3 - zr*qiuk2;
-    real rxqiukp1 = yr*qiukp3 - zr*qiukp2;
-#ifdef APPLY_SCALE
-    ttm2i1 += prefactor1*(ukxqir1 + rxqiuk1) + prefactor2*(ukxqirp1 + rxqiukp1);
-#else
-    ttm2i1 += prefactor1*(ukxqir1 + rxqiuk1 + ukxqirp1 + rxqiukp1);
-#endif
-    real ukxqir2 = atom2.inducedDipole.z*qir1 - atom2.inducedDipole.x*qir3;
-    real ukxqirp2 = atom2.inducedDipolePolar.z*qir1 - atom2.inducedDipolePolar.x*qir3;
-    real rxqiuk2 = zr*qiuk1 - xr*qiuk3;
-    real rxqiukp2 = zr*qiukp1 - xr*qiukp3;
-#ifdef APPLY_SCALE
-    ttm2i2 += prefactor1*(ukxqir2 + rxqiuk2) + prefactor2*(ukxqirp2 + rxqiukp2);
-#else
-    ttm2i2 += prefactor1*(ukxqir2 + rxqiuk2 + ukxqirp2 + rxqiukp2);
-#endif
-    real ukxqir3 = atom2.inducedDipole.x*qir2 - atom2.inducedDipole.y*qir1;
-    real ukxqirp3 = atom2.inducedDipolePolar.x*qir2 - atom2.inducedDipolePolar.y*qir1;
-    real rxqiuk3 = xr*qiuk2 - yr*qiuk1;
-    real rxqiukp3 = xr*qiukp2 - yr*qiukp1;
-#ifdef APPLY_SCALE
-    ttm2i3 += prefactor1*(ukxqir3 + rxqiuk3) + prefactor2*(ukxqirp3 + rxqiukp3);
-#else
-    ttm2i3 += prefactor1*(ukxqir3 + rxqiuk3 + ukxqirp3 + rxqiukp3);
-#endif
-    atom1.torque.x += ttm2i1;
-    atom1.torque.y += ttm2i2;
-    atom1.torque.z += ttm2i3;
-}
--- a/plugins/amoeba/platforms/cuda/src/kernels/pmeElectrostaticPairForceNoQuadrupoles.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/pmeElectrostaticPairForceNoQuadrupoles.cu
-__device__ void
-#ifdef APPLY_SCALE
-computeOneInteractionF1(
-#else
-computeOneInteractionF1NoScale(
-#endif
-        AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, real bn5, float forceFactor,
-#ifdef APPLY_SCALE
-        float dScale, float pScale, float mScale,
-#endif
-        real3& force, real& energy) {
-    real xr = delta.x;
-    real yr = delta.y;
-    real zr = delta.z;
-#ifdef APPLY_SCALE
-    real rr1 = delta.w;
-#endif
-    // set the permanent multipole and induced dipole values;
-    real ci = atom1.q;
-    real di1 = atom1.dipole.x;
-    real di2 = atom1.dipole.y;
-    real di3 = atom1.dipole.z;
-    real ck = atom2.q;
-    real dk1 = atom2.dipole.x;
-    real dk2 = atom2.dipole.y;
-    real dk3 = atom2.dipole.z;
-    real bn1 = bn.x;
-    real bn2 = bn.y;
-    real bn3 = bn.z;
-    real bn4 = bn.w;
-#ifdef APPLY_SCALE
-    real offset = 1-mScale;
-    real rr3 = rr1*rr1*rr1;
-    real gf4 = 2*(bn2 - 3*offset*rr3*rr1*rr1);
-#else
-    real gf4 = 2*bn2;
-#endif
-    real ftm21 = 0;
-    real ftm22 = 0;
-    real ftm23 = 0;
-    // calculate the scalar products for permanent components
-    real gl6 = di1*dk1 + di2*dk2 + di3*dk3;
-    real sc3 = di1*xr + di2*yr + di3*zr;
-    real sc4 = dk1*xr + dk2*yr + dk3*zr;
-    real gl0 = ci*ck;
-    real gl1 = ck*sc3 - ci*sc4;
-    real gl2 = -sc3*sc4;
-#ifdef APPLY_SCALE
-    energy += forceFactor*(-offset*rr1*gl0 + (bn1-offset*rr3)*(gl1+gl6) + (bn2-offset*(3*rr3*rr1*rr1))*gl2);
-#else
-    energy += forceFactor*(bn1*(gl1+gl6) + bn2*gl2);
-#endif
-    real gf1 = bn1*gl0 + bn2*(gl1+gl6) + bn3*gl2;
-#ifdef APPLY_SCALE
-    gf1 -= offset*(rr3*gl0 + (3*rr3*rr1*rr1)*(gl1+gl6) + (15*rr3*rr3*rr1)*gl2);
-#endif
-    ftm21 += gf1*xr;
-    ftm22 += gf1*yr;
-    ftm23 += gf1*zr;
-#ifdef APPLY_SCALE
-    real gf2 = -ck*bn1 + sc4*bn2 - offset*(-ck*rr3 + sc4*(3*rr3*rr1*rr1));
-#else
-    real gf2 = -ck*bn1 + sc4*bn2;
-#endif
-    ftm21 += gf2*di1;
-    ftm22 += gf2*di2;
-    ftm23 += gf2*di3;
-#ifdef APPLY_SCALE
-    real gf3 = ci*bn1 + sc3*bn2 - offset*(ci*rr3 + sc3*(3*rr3*rr1*rr1));
-#else
-    real gf3 = ci*bn1 + sc3*bn2;
-#endif
-    ftm21 += gf3*dk1;
-    ftm22 += gf3*dk2;
-    ftm23 += gf3*dk3;
-    force.x = ftm21;
-    force.y = ftm22;
-    force.z = ftm23;
-}
-__device__ void
-#ifdef APPLY_SCALE
-computeOneInteractionF2(
-#else
-computeOneInteractionF2NoScale(
-#endif
-        AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, float forceFactor,
-#ifdef APPLY_SCALE
-        float dScale, float pScale, float mScale,
-#endif
-        real3& force, real& energy) {
-    const float uScale = 1;
-    real xr = delta.x;
-    real yr = delta.y;
-    real zr = delta.z;
-    real rr1 = delta.w;
-    // set the permanent multipole and induced dipole values;
-    real ci = atom1.q;
-    real di1 = atom1.dipole.x;
-    real di2 = atom1.dipole.y;
-    real di3 = atom1.dipole.z;
-    real bn1 = bn.x;
-    real bn2 = bn.y;
-    real bn3 = bn.z;
-    real bn4 = bn.w;
-    real damp = atom1.damp*atom2.damp;
-    if (damp != 0) {
-        real pgamma = atom1.thole < atom2.thole ? atom1.thole : atom2.thole;
-        real ratio = RECIP(rr1*damp);
-        damp = -pgamma*ratio*ratio*ratio;
-    }
-    real scale5 = (damp == 0) ? 1 : (1 - (1-damp)*EXP(damp));
-    real rr5 = rr1*rr1;
-          rr5 = 3*rr1*rr5*rr5;
-#ifdef APPLY_SCALE
-    real psc5 = rr5*(1 - scale5*pScale);
-    real dsc5 = rr5*(1 - scale5*dScale);
-    real usc5 = rr5*(1 - scale5*uScale);
-#else
-    real psc5 = rr5*(1 - scale5);
-#endif
-    real ftm21 = 0;
-    real ftm22 = 0;
-    real ftm23 = 0;
-    real expdamp = EXP(damp);
-    real scale3 = (damp == 0) ? 1 : (1 - expdamp);
-    real rr3 = rr1*rr1*rr1;
-#ifdef APPLY_SCALE
-    real psc3 = rr3*(1 - scale3*pScale);
-    real dsc3 = rr3*(1 - scale3*dScale);
-    real usc3 = rr3*(1 - scale3*uScale);
-#else
-    real psc3 = rr3*(1 - scale3);
-#endif
-    real scale7 = (damp == 0) ? 1 : (1 - (1-damp+0.6f*damp*damp)*expdamp);
-#ifdef APPLY_SCALE
-    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7*pScale);
-    real dsc7 = (15*rr3*rr3*rr1)*(1 - scale7*dScale);
-#else
-    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7);
-#endif
-    real sc3 = di1*xr + di2*yr + di3*zr;
-    real gfi3 = ci*bn1 + sc3*bn2;
-    real prefactor1;
-    prefactor1 = 0.5f*(ci*psc3 + sc3*psc5 - gfi3);
-    ftm21 -= prefactor1*atom2.inducedDipole.x;
-    ftm22 -= prefactor1*atom2.inducedDipole.y;
-    ftm23 -= prefactor1*atom2.inducedDipole.z;
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(ci*dsc3 + sc3*dsc5 - gfi3);
-#endif
-    ftm21 -= prefactor1*atom2.inducedDipolePolar.x;
-    ftm22 -= prefactor1*atom2.inducedDipolePolar.y;
-    ftm23 -= prefactor1*atom2.inducedDipolePolar.z;
-    real sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
-    energy += forceFactor*0.5f*sci4*((psc3-bn1)*ci + (psc5-bn2)*sc3);
-    real scip4 = atom2.inducedDipolePolar.x*xr + atom2.inducedDipolePolar.y*yr + atom2.inducedDipolePolar.z*zr;
-#ifndef DIRECT_POLARIZATION
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(bn2 - usc5);
-#else
-    prefactor1 = 0.5f*(bn2 - psc5);
-#endif
-    ftm21 += prefactor1*((sci4*atom1.inducedDipolePolar.x + scip4*atom1.inducedDipole.x));
-    ftm22 += prefactor1*((sci4*atom1.inducedDipolePolar.y + scip4*atom1.inducedDipole.y));
-    ftm23 += prefactor1*((sci4*atom1.inducedDipolePolar.z + scip4*atom1.inducedDipole.z));
-#endif
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(bn2*(sci4+scip4) - (sci4*psc5+scip4*dsc5)); 
-#else
-    sci4 += scip4;
-    prefactor1 = 0.5f*sci4*(bn2 - psc5); 
-#endif
-    ftm21 += prefactor1*di1;
-    ftm22 += prefactor1*di2;
-    ftm23 += prefactor1*di3;
-#ifdef APPLY_SCALE
-    real gli1 = -ci*sci4;
-    real gli2 = -sc3*sci4;
-    real glip1 = -ci*scip4;
-    real glip2 = -sc3*scip4;
-#else
-    real gli1 = -ci*sci4;
-    real gli2 = -sc3*sci4;
-#endif
-#ifdef APPLY_SCALE
-    real gfi1 = (bn2*(gli1+glip1) + bn3*(gli2+glip2));
-    gfi1 -= (rr1*rr1)*(3*(gli1*psc3 + glip1*dsc3) + 5*(gli2*psc5 + glip2*dsc5));
-#else
-    real gfi1 = bn2*gli1 + bn3*gli2;
-    gfi1 -= (rr1*rr1)*(3*gli1*psc3 + 5*gli2*psc5);
-#endif
-    gfi1 *= 0.5f;
-    ftm21 += gfi1*xr;
-    ftm22 += gfi1*yr;
-    ftm23 += gfi1*zr;
-    {
-        real expdamp = EXP(damp);
-        real temp3 = -1.5f*damp*expdamp*rr1*rr1;
-        real temp5 = -damp;
-        real temp7 = -0.2f - 0.6f*damp;
-        real ddsc31 = temp3*xr;
-        real ddsc32 = temp3*yr;
-        real ddsc33 = temp3*zr;
-        real ddsc51 = temp5*ddsc31;
-        real ddsc52 = temp5*ddsc32;
-        real ddsc53 = temp5*ddsc33;
-        real ddsc71 = temp7*ddsc51;
-        real ddsc72 = temp7*ddsc52;
-        real ddsc73 = temp7*ddsc53;
-        real rr3 = rr1*rr1*rr1;
-#ifdef APPLY_SCALE
-        temp3 = (gli1*pScale + glip1*dScale);
-        temp5 = (3*rr1*rr1)*(gli2*pScale + glip2*dScale);
-#else
-        temp3 = gli1;
-        temp5 = (3*rr1*rr1)*gli2;
-#endif
-        ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51);
-        ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52);
-        ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53);
-    }
-//K
-    real dk1 = atom2.dipole.x;
-    real dk2 = atom2.dipole.y;
-    real dk3 = atom2.dipole.z;
-    real sc4 =  dk1*xr +  dk2*yr +  dk3*zr;
-    real ck = atom2.q;
-    real gfi2 = (-ck*bn1 + sc4*bn2);
-    prefactor1 = 0.5f*(ck*psc3 - sc4*psc5 + gfi2);
-    ftm21 += prefactor1*atom1.inducedDipole.x;
-    ftm22 += prefactor1*atom1.inducedDipole.y;
-    ftm23 += prefactor1*atom1.inducedDipole.z;
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(ck*dsc3 - sc4*dsc5 + gfi2);
-#endif
-    ftm21 += prefactor1*atom1.inducedDipolePolar.x;
-    ftm22 += prefactor1*atom1.inducedDipolePolar.y;
-    ftm23 += prefactor1*atom1.inducedDipolePolar.z;
-    real sci3 = atom1.inducedDipole.x*xr + atom1.inducedDipole.y*yr + atom1.inducedDipole.z*zr;
-    energy += forceFactor*0.5f*sci3*(ck*(bn1-psc3) - sc4*(bn2-psc5));
-    real scip3 = atom1.inducedDipolePolar.x*xr + atom1.inducedDipolePolar.y*yr + atom1.inducedDipolePolar.z*zr;
-#ifndef DIRECT_POLARIZATION
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(bn2 - usc5);
-#else
-    prefactor1 = 0.5f*(bn2 - psc5);
-#endif
-    ftm21 += prefactor1*(sci3*atom2.inducedDipolePolar.x + scip3*atom2.inducedDipole.x);
-    ftm22 += prefactor1*(sci3*atom2.inducedDipolePolar.y + scip3*atom2.inducedDipole.y);
-    ftm23 += prefactor1*(sci3*atom2.inducedDipolePolar.z + scip3*atom2.inducedDipole.z);
-    real sci34;
-    sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
-    scip4 = atom2.inducedDipolePolar.x*xr + atom2.inducedDipolePolar.y*yr + atom2.inducedDipolePolar.z*zr;
-    sci34 = (sci3*scip4+scip3*sci4);
-#ifdef APPLY_SCALE
-    gfi1 = sci34*(usc5*(5*rr1*rr1) -bn3);
-#else
-    gfi1 = sci34*(psc5*(5*rr1*rr1) -bn3);
-#endif
-#else
-    gfi1 = 0;
-#endif
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(bn2*(sci3+scip3) - (sci3*psc5+scip3*dsc5));
-#else
-    sci3 += scip3;
-    prefactor1 = 0.5f*sci3*(bn2 - psc5);
-#endif
-    ftm21 += prefactor1*dk1;
-    ftm22 += prefactor1*dk2;
-    ftm23 += prefactor1*dk3;
-#ifdef APPLY_SCALE
-    real gfi6 = -bn3*(sci3+scip3) + (sci3*psc7+scip3*dsc7);
-#else
-    real gfi6 = sci3*(psc7 - bn3);
-#endif
-    real sci1 = atom1.inducedDipole.x*dk1 + atom1.inducedDipole.y*dk2 + atom1.inducedDipole.z*dk3 + di1*atom2.inducedDipole.x + di2*atom2.inducedDipole.y + di3*atom2.inducedDipole.z;
-    energy += forceFactor*0.5f*(sci1*(bn1-psc3));
-    real scip1 = atom1.inducedDipolePolar.x*dk1 + atom1.inducedDipolePolar.y*dk2 + atom1.inducedDipolePolar.z*dk3 + di1*atom2.inducedDipolePolar.x + di2*atom2.inducedDipolePolar.y + di3*atom2.inducedDipolePolar.z;
-#ifndef APPLY_SCALE
-        sci1 += scip1;
-#endif
-    real scip2 = atom1.inducedDipole.x*atom2.inducedDipolePolar.x +
-                                  atom1.inducedDipole.y*atom2.inducedDipolePolar.y +
-                                  atom1.inducedDipole.z*atom2.inducedDipolePolar.z +
-                                  atom2.inducedDipole.x*atom1.inducedDipolePolar.x +
-                                  atom2.inducedDipole.y*atom1.inducedDipolePolar.y +
-                                  atom2.inducedDipole.z*atom1.inducedDipolePolar.z;
-           gli1 = ck*sci3 + sci1;
-           gli2 = -sci3*sc4;
-#ifdef APPLY_SCALE
-          glip1 = ck*scip3 + scip1;
-          glip2 = -scip3*sc4;
-#endif
-#ifdef APPLY_SCALE
-    gfi1 += (bn2*(gli1+glip1) + bn3*(gli2+glip2));
-    gfi1 -= (rr1*rr1)*(3*(gli1*psc3 + glip1*dsc3) + 5*(gli2*psc5 + glip2*dsc5));
-#else
-    gfi1 += (bn2*gli1 + bn3*gli2);
-    gfi1 -= (rr1*rr1)*(3*gli1*psc3 + 5*gli2*psc5);
-#endif
-#ifndef DIRECT_POLARIZATION
-#ifdef APPLY_SCALE
-    gfi1 += scip2*(bn2 - (3*rr1*rr1)*usc3);
-#else
-    gfi1 += scip2*(bn2 - (3*rr1*rr1)*psc3);
-#endif
-#endif
-    gfi1 *= 0.5f;
-    ftm21 += gfi1*xr;
-    ftm22 += gfi1*yr;
-    ftm23 += gfi1*zr;
-    {
-        real expdamp = EXP(damp);
-        real temp3 = -1.5f*damp*expdamp*rr1*rr1;
-        real temp5 = -damp;
-        real temp7 = -0.2f - 0.6f*damp;
-        real ddsc31 = temp3*xr;
-        real ddsc32 = temp3*yr;
-        real ddsc33 = temp3*zr;
-        real ddsc51 = temp5*ddsc31;
-        real ddsc52 = temp5*ddsc32;
-        real ddsc53 = temp5*ddsc33;
-        real ddsc71 = temp7*ddsc51;
-        real ddsc72 = temp7*ddsc52;
-        real ddsc73 = temp7*ddsc53;
-        real rr3 = rr1*rr1*rr1;
-#ifdef APPLY_SCALE
-        temp3 = gli1*pScale + glip1*dScale;
-        temp5 = (3*rr1*rr1)*(gli2*pScale + glip2*dScale);
-#else
-        temp3 = gli1;
-        temp5 = (3*rr1*rr1)*gli2;
-#endif
-        ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51);
-        ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52);
-        ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53);
-#ifndef DIRECT_POLARIZATION
-#ifdef APPLY_SCALE
-        temp3 =  uScale*scip2;
-        temp5 = -(3*rr1*rr1)*uScale*sci34;
-#else
-        temp3 =  scip2;
-        temp5 = -(3*rr1*rr1)*sci34;
-#endif
-        ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51);
-        ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52);
-        ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53);
-#endif
-    }
-    force.x += ftm21;
-    force.y += ftm22;
-    force.z += ftm23;
-}
-__device__ void
-#ifdef APPLY_SCALE
-computeOneInteractionT1(
-#else
-computeOneInteractionT1NoScale(
-#endif
-        AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn
-#ifdef APPLY_SCALE
-        , float dScale, float pScale, float mScale
-#endif
-        ) {
-    real xr = delta.x;
-    real yr = delta.y;
-    real zr = delta.z;
-#ifdef APPLY_SCALE
-    real rr1 = delta.w;
-#endif
-    // set the permanent multipole and induced dipole values;
-    real di1 = atom1.dipole.x;
-    real di2 = atom1.dipole.y;
-    real di3 = atom1.dipole.z;
-    real ck = atom2.q;
-    real dk1 = atom2.dipole.x;
-    real dk2 = atom2.dipole.y;
-    real dk3 = atom2.dipole.z;
-    real bn1 = bn.x;
-    real bn2 = bn.y;
-    real bn3 = bn.z;
-    real bn4 = bn.w;
-    // apply Thole polarization damping to scale factors
-#ifdef APPLY_SCALE
-    real rr2 = rr1*rr1;
-    real rr3 = rr1*rr2;
-    real rr5 = 3*rr3*rr2;
-    real rr7 = 5*rr5*rr2;
-    real rr9 = 7*rr7*rr2;
-    real scale = 1-mScale;
-    real prefactor = scale*rr3 - bn1;
-#else
-    real prefactor = -bn1;
-#endif
-    real dixdk1 = di2*dk3 - di3*dk2;
-    real ttm21 = prefactor*dixdk1;
-    real dixdk2 = di3*dk1 - di1*dk3;
-    real ttm22 = prefactor*dixdk2;
-    real dixdk3 = di1*dk2 - di2*dk1;
-    real ttm23 = prefactor*dixdk3;
-    real sc4 = dk1*xr + dk2*yr + dk3*zr;
-    real sc6 = 0;
-    real gf2 = -ck*bn1 + sc4*bn2;
-#ifdef APPLY_SCALE
-    real gfr2 = -ck*rr3 + sc4*rr5;
-    prefactor = (gf2 - scale*gfr2);
-#else
-    prefactor = gf2;
-#endif
-    ttm21 += prefactor*(di2*zr - di3*yr);
-    ttm22 += prefactor*(di3*xr - di1*zr);
-    ttm23 += prefactor*(di1*yr - di2*xr);
-    atom1.torque.x += ttm21;
-    atom1.torque.y += ttm22;
-    atom1.torque.z += ttm23;
-}
-__device__ void
-#ifdef APPLY_SCALE
-computeOneInteractionT2(
-#else
-computeOneInteractionT2NoScale(
-#endif
-        AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn
-#ifdef APPLY_SCALE
-        , float dScale, float pScale, float mScale
-#endif
-        ) {
-    real xr = delta.x;
-    real yr = delta.y;
-    real zr = delta.z;
-    real rr1 = delta.w;
-    // set the permanent multipole and induced dipole values;
-    real di1 = atom1.dipole.x;
-    real di2 = atom1.dipole.y;
-    real di3 = atom1.dipole.z;
-    real bn1 = bn.x;
-    real bn2 = bn.y;
-    real bn3 = bn.z;
-    // apply Thole polarization damping to scale factors
-    real scale3 = 1;
-    real scale5 = 1;
-    real scale7 = 1;
-    real damp = atom1.damp*atom2.damp;
-    if (damp != 0) {
-        real pgamma = atom1.thole < atom2.thole ? atom1.thole : atom2.thole;
-        real ratio = RECIP(rr1*damp);
-        damp = -pgamma*ratio*ratio*ratio;
-        real expdamp = EXP(damp);
-        scale3 = 1 - expdamp;
-        scale5 = 1 - (1-damp)*expdamp;
-        scale7 = 1 - (1-damp+0.6f*damp*damp)*expdamp;
-    }
-    real rr3 = rr1*rr1*rr1;
-#ifdef APPLY_SCALE
-    real dsc3 = rr3*(1 - scale3*dScale);
-    real dsc5 = (3*rr3*rr1*rr1)* (1 - scale5*dScale);
-    real dsc7 = (15*rr3*rr3*rr1)*(1 - scale7*dScale);
-    real psc3 = rr3*(1 - scale3*pScale);
-    real psc5 = (3*rr3*rr1*rr1)*(1 - scale5*pScale);
-    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7*pScale);
-#else
-    real psc3 = rr3*(1 - scale3);
-    real psc5 = (3*rr3*rr1*rr1)*(1 - scale5);
-    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7);
-#endif
-    real prefactor1 = 0.5f*(psc3 - bn1);
-#ifdef APPLY_SCALE
-    real prefactor2 = 0.5f*(dsc3 - bn1);
-#endif
-    real dixuk1 = di2*atom2.inducedDipole.z - di3*atom2.inducedDipole.y;
-    real dixukp1 = di2*atom2.inducedDipolePolar.z - di3*atom2.inducedDipolePolar.y;
-#ifdef APPLY_SCALE
-    real ttm2i1 = prefactor1*dixuk1 + prefactor2*dixukp1;
-#else
-    real ttm2i1 = prefactor1*(dixuk1 + dixukp1);
-#endif
-    real dixuk2 = di3*atom2.inducedDipole.x - di1*atom2.inducedDipole.z;
-    real dixukp2 = di3*atom2.inducedDipolePolar.x - di1*atom2.inducedDipolePolar.z;
-#ifdef APPLY_SCALE
-    real ttm2i2 = prefactor1*dixuk2 + prefactor2*dixukp2;
-#else
-    real ttm2i2 = prefactor1*(dixuk2 + dixukp2);
-#endif
-    real dixuk3 = di1*atom2.inducedDipole.y - di2*atom2.inducedDipole.x;
-    real dixukp3 = di1*atom2.inducedDipolePolar.y - di2*atom2.inducedDipolePolar.x;
-#ifdef APPLY_SCALE
-    real ttm2i3 = prefactor1*dixuk3 + prefactor2*dixukp3;
-#else
-    real ttm2i3 = prefactor1*(dixuk3 + dixukp3);
-#endif
-    real sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
-    real scip4 = atom2.inducedDipolePolar.x*xr + atom2.inducedDipolePolar.y*yr + atom2.inducedDipolePolar.z*zr;
-    real gti2 = bn2*(sci4+scip4);
-#ifdef APPLY_SCALE
-    real gtri2 = (sci4*psc5+scip4*dsc5);
-#else
-    real gtri2 = psc5*(sci4+scip4);
-#endif
-    prefactor1 = 0.5f*(gti2 - gtri2);
-    ttm2i1 += prefactor1*(di2*zr - di3*yr);
-    ttm2i2 += prefactor1*(di3*xr - di1*zr);
-    ttm2i3 += prefactor1*(di1*yr - di2*xr);
-    atom1.torque.x += ttm2i1;
-    atom1.torque.y += ttm2i2;
-    atom1.torque.z += ttm2i3;
-}
--- a/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
 #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
 typedef struct {
-    real3 pos, force, torque, dipole, inducedDipole, inducedDipolePolar;
+    real3 pos, force, torque, inducedDipole, inducedDipolePolar, sphericalDipole;
    real q;
    float thole, damp;
 #ifdef INCLUDE_QUADRUPOLES
-    real quadrupoleXX, quadrupoleXY, quadrupoleXZ, quadrupoleYY, quadrupoleYZ;
+    real sphericalQuadrupole[5];
-    float padding;
 #endif
 } AtomData;
-__device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, real bn5, float forceFactor, float dScale, float pScale, float mScale, real3& force, real& energy);
+inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ sphericalDipole,
-__device__ void computeOneInteractionF2(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, float forceFactor, float dScale, float pScale, float mScale, real3& force, real& energy);
+            const real* __restrict__ sphericalQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar,
-__device__ void computeOneInteractionT1(AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn, float dScale, float pScale, float mScale);
+            const float2* __restrict__ dampingAndThole) {
-__device__ void computeOneInteractionT2(AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn, float dScale, float pScale, float mScale);
-__device__ void computeOneInteractionF1NoScale(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, real bn5, float forceFactor, real3& force, real& energy);
-__device__ void computeOneInteractionF2NoScale(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, float forceFactor, real3& force, real& energy);
-__device__ void computeOneInteractionT1NoScale(AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn);
-__device__ void computeOneInteractionT2NoScale(AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn);
-inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ labFrameDipole,
-        const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
    real4 atomPosq = posq[atom];
    data.pos = make_real3(atomPosq.x, atomPosq.y, atomPosq.z);
    data.q = atomPosq.w;
-    data.dipole.x = labFrameDipole[atom*3];
+    data.sphericalDipole.x = sphericalDipole[atom*3];
-    data.dipole.y = labFrameDipole[atom*3+1];
+    data.sphericalDipole.y = sphericalDipole[atom*3+1];
-    data.dipole.z = labFrameDipole[atom*3+2];
+    data.sphericalDipole.z = sphericalDipole[atom*3+2];
 #ifdef INCLUDE_QUADRUPOLES
-    data.quadrupoleXX = labFrameQuadrupole[atom*5];
+    data.sphericalQuadrupole[0] = sphericalQuadrupole[atom*5];
-    data.quadrupoleXY = labFrameQuadrupole[atom*5+1];
+    data.sphericalQuadrupole[1] = sphericalQuadrupole[atom*5+1];
-    data.quadrupoleXZ = labFrameQuadrupole[atom*5+2];
+    data.sphericalQuadrupole[2] = sphericalQuadrupole[atom*5+2];
-    data.quadrupoleYY = labFrameQuadrupole[atom*5+3];
+    data.sphericalQuadrupole[3] = sphericalQuadrupole[atom*5+3];
-    data.quadrupoleYZ = labFrameQuadrupole[atom*5+4];
+    data.sphericalQuadrupole[4] = sphericalQuadrupole[atom*5+4];
 #endif
    data.inducedDipole.x = inducedDipole[atom*3];
    data.inducedDipole.y = inducedDipole[atom*3+1];
@@ -66,99 +57,354 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
 __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool hasExclusions, float dScale, float pScale, float mScale, float forceFactor,
                                      real& energy, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
-    real4 delta;
+    // Compute the displacement.
+    real3 delta;
    delta.x = atom2.pos.x - atom1.pos.x;
    delta.y = atom2.pos.y - atom1.pos.y;
    delta.z = atom2.pos.z - atom1.pos.z;
-    // periodic box
    APPLY_PERIODIC_TO_DELTA(delta)
-    delta.w = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-    if (delta.w > CUTOFF_SQUARED)
+    if (r2 > CUTOFF_SQUARED)
        return;
-    real r = SQRT(delta.w);
+    real rInv = RSQRT(r2);
-    real ralpha = EWALD_ALPHA*r;
+    real r = r2*rInv;
-    real alsq2 = 2*EWALD_ALPHA*EWALD_ALPHA;
+    // Rotate the various dipoles and quadrupoles.
-    real alsq2n = 0;
-    if (EWALD_ALPHA > 0)
+    real qiRotationMatrix[3][3];
-        alsq2n = RECIP(SQRT_PI*EWALD_ALPHA);
+    buildQIRotationMatrix(delta, rInv, qiRotationMatrix);
-    real exp2a = EXP(-(ralpha*ralpha));
-    real rr1 = RECIP(r);
-    delta.w = rr1;
+    real3 qiUindI = 0.5f*make_real3(qiRotationMatrix[0][1]*atom1.inducedDipole.x + qiRotationMatrix[0][2]*atom1.inducedDipole.y + qiRotationMatrix[0][0]*atom1.inducedDipole.z,
+                                    qiRotationMatrix[1][1]*atom1.inducedDipole.x + qiRotationMatrix[1][2]*atom1.inducedDipole.y + qiRotationMatrix[1][0]*atom1.inducedDipole.z,
+                                    qiRotationMatrix[2][1]*atom1.inducedDipole.x + qiRotationMatrix[2][2]*atom1.inducedDipole.y + qiRotationMatrix[2][0]*atom1.inducedDipole.z);
+    real3 qiUindJ = 0.5f*make_real3(qiRotationMatrix[0][1]*atom2.inducedDipole.x + qiRotationMatrix[0][2]*atom2.inducedDipole.y + qiRotationMatrix[0][0]*atom2.inducedDipole.z,
+                                    qiRotationMatrix[1][1]*atom2.inducedDipole.x + qiRotationMatrix[1][2]*atom2.inducedDipole.y + qiRotationMatrix[1][0]*atom2.inducedDipole.z,
+                                    qiRotationMatrix[2][1]*atom2.inducedDipole.x + qiRotationMatrix[2][2]*atom2.inducedDipole.y + qiRotationMatrix[2][0]*atom2.inducedDipole.z);
+    real3 qiUinpI = 0.5f*make_real3(qiRotationMatrix[0][1]*atom1.inducedDipolePolar.x + qiRotationMatrix[0][2]*atom1.inducedDipolePolar.y + qiRotationMatrix[0][0]*atom1.inducedDipolePolar.z,
+                                    qiRotationMatrix[1][1]*atom1.inducedDipolePolar.x + qiRotationMatrix[1][2]*atom1.inducedDipolePolar.y + qiRotationMatrix[1][0]*atom1.inducedDipolePolar.z,
+                                    qiRotationMatrix[2][1]*atom1.inducedDipolePolar.x + qiRotationMatrix[2][2]*atom1.inducedDipolePolar.y + qiRotationMatrix[2][0]*atom1.inducedDipolePolar.z);
+    real3 qiUinpJ = 0.5f*make_real3(qiRotationMatrix[0][1]*atom2.inducedDipolePolar.x + qiRotationMatrix[0][2]*atom2.inducedDipolePolar.y + qiRotationMatrix[0][0]*atom2.inducedDipolePolar.z,
+                                    qiRotationMatrix[1][1]*atom2.inducedDipolePolar.x + qiRotationMatrix[1][2]*atom2.inducedDipolePolar.y + qiRotationMatrix[1][0]*atom2.inducedDipolePolar.z,
+                                    qiRotationMatrix[2][1]*atom2.inducedDipolePolar.x + qiRotationMatrix[2][2]*atom2.inducedDipolePolar.y + qiRotationMatrix[2][0]*atom2.inducedDipolePolar.z);
+    real3 rotatedDipole1 = rotateDipole(atom1.sphericalDipole, qiRotationMatrix);
+    real3 rotatedDipole2 = rotateDipole(atom2.sphericalDipole, qiRotationMatrix);
+    real rotatedQuadrupole1[] = {0, 0, 0, 0, 0};
+    real rotatedQuadrupole2[] = {0, 0, 0, 0, 0};
+#ifdef INCLUDE_QUADRUPOLES
+    rotateQuadupoles(qiRotationMatrix, atom1.sphericalQuadrupole, atom2.sphericalQuadrupole, rotatedQuadrupole1, rotatedQuadrupole2);
+#endif    
+    // The field derivatives at I due to permanent and induced moments on J, and vice-versa.
+    // Also, their derivatives w.r.t. R, which are needed for force calculations
+    real Vij[9], Vji[9], VjiR[9], VijR[9];
+    // The field derivatives at I due to only permanent moments on J, and vice-versa.
+    real Vijp[3], Vijd[3], Vjip[3], Vjid[3];
+    real rInvVec[7], alphaRVec[8], bVec[5];
+    // The rInvVec array is defined such that the ith element is R^-i, with the
+    // dielectric constant folded in, to avoid conversions later.
+    rInvVec[1] = rInv;
+    for (int i = 2; i < 7; ++i)
+        rInvVec[i] = rInvVec[i-1] * rInv;
+    // The alphaRVec array is defined such that the ith element is (alpha R)^i,
+    // where kappa (alpha in OpenMM parlance) is the Ewald attenuation parameter.
+    real ralpha = EWALD_ALPHA*r;
+    real exp2a = EXP(-(ralpha*ralpha));
 #ifdef USE_DOUBLE_PRECISION
-    const real erfcAlphaR = erfc(ralpha);
+    const real erfAlphaR = erf(ralpha);
 #else
    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
    // error of 1.5e-7.
    const real t = RECIP(1.0f+0.3275911f*ralpha);
-    const real erfcAlphaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*exp2a;
+    const real erfAlphaR = 1-(0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*exp2a;
 #endif
-    real bn0 = erfcAlphaR*rr1;
+    alphaRVec[1] = ralpha;
-    energy += forceFactor*atom1.q*atom2.q*bn0;
+    for (int i = 2; i < 8; ++i)
-    real rr2 = rr1*rr1;
+        alphaRVec[i] = alphaRVec[i-1]*ralpha;
-    alsq2n *= alsq2;
+    real X = 2*exp2a/SQRT_PI;
+    int doubleFactorial = 1, facCount = 1;
-    real4 bn;
+    real tmp = alphaRVec[1];
-    bn.x = (bn0+alsq2n*exp2a)*rr2;
+    bVec[1] = -erfAlphaR;
+    for (int i = 2; i < 5; ++i) {
-    alsq2n *= alsq2;
+        bVec[i] = bVec[i-1] + tmp * X / (real)(doubleFactorial);
-    bn.y = (3*bn.x+alsq2n*exp2a)*rr2;
+        facCount = facCount + 2;
+        doubleFactorial = doubleFactorial * facCount;
-    alsq2n *= alsq2;
+        tmp *= 2*alphaRVec[2];
-    bn.z = (5*bn.y+alsq2n*exp2a)*rr2;
+    }
-    alsq2n *= alsq2;
-    bn.w = (7*bn.z+alsq2n*exp2a)*rr2;
-    alsq2n *= alsq2;
+    real dmp = atom1.damp*atom2.damp;
-    real bn5 = (9*bn.w+alsq2n*exp2a)*rr2;
+    real a = min(atom1.thole, atom2.thole);
+    real u = fabs(dmp) > 1.0e-5f ? r/dmp : 1e10f;
+    real au3 = a*u*u*u;
+    real expau3 = au3 < 50 ? EXP(-au3) : 0;
+    real a2u6 = au3*au3;
+    real a3u9 = a2u6*au3;
+    // Thole damping factors for energies
+    real thole_c  = 1 - expau3;
+    real thole_d0 = 1 - expau3*(1 + 1.5f*au3);
+    real thole_d1 = 1 - expau3;
+    real thole_q0 = 1 - expau3*(1 + au3 + a2u6);
+    real thole_q1 = 1 - expau3*(1 + au3);
+    // Thole damping factors for derivatives
+    real dthole_c  = 1 - expau3*(1 + 1.5f*au3);
+    real dthole_d0 = 1 - expau3*(1 + au3 + 1.5f*a2u6);
+    real dthole_d1 = 1 - expau3*(1 + au3);
+    real dthole_q0 = 1 - expau3*(1 + au3 + 0.25f*a2u6 + 0.75f*a3u9);
+    real dthole_q1 = 1 - expau3*(1 + au3 + 0.75f*a2u6);
+    // Now we compute the (attenuated) Coulomb operator and its derivatives, contracted with
+    // permanent moments and induced dipoles.  Note that the coefficient of the permanent force
+    // terms is half of the expected value; this is because we compute the interaction of I with
+    // the sum of induced and permanent moments on J, as well as the interaction of J with I's
+    // permanent and induced moments; doing so double counts the permanent-permanent interaction.
+    real ePermCoef, dPermCoef, eUIndCoef, dUIndCoef, eUInpCoef, dUInpCoef;
+    // C-C terms (m=0)
+    ePermCoef = rInvVec[1]*(mScale + bVec[2] - alphaRVec[1]*X);
+    dPermCoef = -0.5f*(mScale + bVec[2])*rInvVec[2];
+    Vij[0]  = ePermCoef*atom2.q;
+    Vji[0]  = ePermCoef*atom1.q;
+    VijR[0] = dPermCoef*atom2.q;
+    VjiR[0] = dPermCoef*atom1.q;
+    // C-D and C-Uind terms (m=0)
+    ePermCoef = rInvVec[2]*(mScale + bVec[2]);
+    eUIndCoef = rInvVec[2]*(pScale*thole_c + bVec[2]);
+    eUInpCoef = rInvVec[2]*(dScale*thole_c + bVec[2]);
+    dPermCoef = -rInvVec[3]*(mScale + bVec[2] + alphaRVec[3]*X);
+    dUIndCoef = -2*rInvVec[3]*(pScale*dthole_c + bVec[2] + alphaRVec[3]*X);
+    dUInpCoef = -2*rInvVec[3]*(dScale*dthole_c + bVec[2] + alphaRVec[3]*X);
+    Vij[0]  += -(ePermCoef*rotatedDipole2.x + eUIndCoef*qiUindJ.x + eUInpCoef*qiUinpJ.x);
+    Vji[1]   = -(ePermCoef*atom1.q);
+    VijR[0] += -(dPermCoef*rotatedDipole2.x + dUIndCoef*qiUindJ.x + dUInpCoef*qiUinpJ.x);
+    VjiR[1]  = -(dPermCoef*atom1.q);
+    Vjip[0]  = -(eUInpCoef*atom1.q);
+    Vjid[0]  = -(eUIndCoef*atom1.q);
+    // D-C and Uind-C terms (m=0)
+    Vij[1]   = ePermCoef*atom2.q;
+    Vji[0]  += ePermCoef*rotatedDipole1.x + eUIndCoef*qiUindI.x + eUInpCoef*qiUinpI.x;
+    VijR[1]  = dPermCoef*atom2.q;
+    VjiR[0] += dPermCoef*rotatedDipole1.x + dUIndCoef*qiUindI.x + dUInpCoef*qiUinpI.x;
+    Vijp[0]  = eUInpCoef*atom2.q;
+    Vijd[0]  = eUIndCoef*atom2.q;
+    // D-D and D-Uind terms (m=0)
+    const real twoThirds = (real) 2/3;
+    ePermCoef = -twoThirds*rInvVec[3]*(3*(mScale + bVec[3]) + alphaRVec[3]*X);
+    eUIndCoef = -twoThirds*rInvVec[3]*(3*(pScale*thole_d0 + bVec[3]) + alphaRVec[3]*X);
+    eUInpCoef = -twoThirds*rInvVec[3]*(3*(dScale*thole_d0 + bVec[3]) + alphaRVec[3]*X);
+    dPermCoef = rInvVec[4]*(3*(mScale + bVec[3]) + 2*alphaRVec[5]*X);
+    dUIndCoef = rInvVec[4]*(6*(pScale*dthole_d0 + bVec[3]) + 4*alphaRVec[5]*X);
+    dUInpCoef = rInvVec[4]*(6*(dScale*dthole_d0 + bVec[3]) + 4*alphaRVec[5]*X);
+    Vij[1]  += ePermCoef*rotatedDipole2.x + eUIndCoef*qiUindJ.x + eUInpCoef*qiUinpJ.x;
+    Vji[1]  += ePermCoef*rotatedDipole1.x + eUIndCoef*qiUindI.x + eUInpCoef*qiUinpI.x;
+    VijR[1] += dPermCoef*rotatedDipole2.x + dUIndCoef*qiUindJ.x + dUInpCoef*qiUinpJ.x;
+    VjiR[1] += dPermCoef*rotatedDipole1.x + dUIndCoef*qiUindI.x + dUInpCoef*qiUinpI.x;
+    Vijp[0] += eUInpCoef*rotatedDipole2.x;
+    Vijd[0] += eUIndCoef*rotatedDipole2.x;
+    Vjip[0] += eUInpCoef*rotatedDipole1.x;
+    Vjid[0] += eUIndCoef*rotatedDipole1.x;
+    // D-D and D-Uind terms (m=1)
+    ePermCoef = rInvVec[3]*(mScale + bVec[3] - twoThirds*alphaRVec[3]*X);
+    eUIndCoef = rInvVec[3]*(pScale*thole_d1 + bVec[3] - twoThirds*alphaRVec[3]*X);
+    eUInpCoef = rInvVec[3]*(dScale*thole_d1 + bVec[3] - twoThirds*alphaRVec[3]*X);
+    dPermCoef = -1.5f*rInvVec[4]*(mScale + bVec[3]);
+    dUIndCoef = -3*rInvVec[4]*(pScale*dthole_d1 + bVec[3]);
+    dUInpCoef = -3*rInvVec[4]*(dScale*dthole_d1 + bVec[3]);
+    Vij[2]  = ePermCoef*rotatedDipole2.y + eUIndCoef*qiUindJ.y + eUInpCoef*qiUinpJ.y;
+    Vji[2]  = ePermCoef*rotatedDipole1.y + eUIndCoef*qiUindI.y + eUInpCoef*qiUinpI.y;
+    VijR[2] = dPermCoef*rotatedDipole2.y + dUIndCoef*qiUindJ.y + dUInpCoef*qiUinpJ.y;
+    VjiR[2] = dPermCoef*rotatedDipole1.y + dUIndCoef*qiUindI.y + dUInpCoef*qiUinpI.y;
+    Vij[3]  = ePermCoef*rotatedDipole2.z + eUIndCoef*qiUindJ.z + eUInpCoef*qiUinpJ.z;
+    Vji[3]  = ePermCoef*rotatedDipole1.z + eUIndCoef*qiUindI.z + eUInpCoef*qiUinpI.z;
+    VijR[3] = dPermCoef*rotatedDipole2.z + dUIndCoef*qiUindJ.z + dUInpCoef*qiUinpJ.z;
+    VjiR[3] = dPermCoef*rotatedDipole1.z + dUIndCoef*qiUindI.z + dUInpCoef*qiUinpI.z;
+    Vijp[1] = eUInpCoef*rotatedDipole2.y;
+    Vijd[1] = eUIndCoef*rotatedDipole2.y;
+    Vjip[1] = eUInpCoef*rotatedDipole1.y;
+    Vjid[1] = eUIndCoef*rotatedDipole1.y;
+    Vijp[2] = eUInpCoef*rotatedDipole2.z;
+    Vijd[2] = eUIndCoef*rotatedDipole2.z;
+    Vjip[2] = eUInpCoef*rotatedDipole1.z;
+    Vjid[2] = eUIndCoef*rotatedDipole1.z;
+    // C-Q terms (m=0)
+    ePermCoef = (mScale + bVec[3])*rInvVec[3];
+    dPermCoef = -((real) 1/3)*rInvVec[4]*(4.5f*(mScale + bVec[3]) + 2*alphaRVec[5]*X);
+    Vij[0]  += ePermCoef*rotatedQuadrupole2[0];
+    Vji[4]   = ePermCoef*atom1.q;
+    VijR[0] += dPermCoef*rotatedQuadrupole2[0];
+    VjiR[4]  = dPermCoef*atom1.q;
+    // Q-C terms (m=0)
+    Vij[4]   = ePermCoef*atom2.q;
+    Vji[0]  += ePermCoef*rotatedQuadrupole1[0];
+    VijR[4]  = dPermCoef*atom2.q;
+    VjiR[0] += dPermCoef*rotatedQuadrupole1[0];
+    // D-Q and Uind-Q terms (m=0)
+    const real fourThirds = (real) 4/3;
+    ePermCoef = rInvVec[4]*(3*(mScale + bVec[3]) + fourThirds*alphaRVec[5]*X);
+    eUIndCoef = rInvVec[4]*(3*(pScale*thole_q0 + bVec[3]) + fourThirds*alphaRVec[5]*X);
+    eUInpCoef = rInvVec[4]*(3*(dScale*thole_q0 + bVec[3]) + fourThirds*alphaRVec[5]*X);
+    dPermCoef = -fourThirds*rInvVec[5]*(4.5f*(mScale + bVec[3]) + (1 + alphaRVec[2])*alphaRVec[5]*X);
+    dUIndCoef = -fourThirds*rInvVec[5]*(9*(pScale*dthole_q0 + bVec[3]) + 2*(1 + alphaRVec[2])*alphaRVec[5]*X);
+    dUInpCoef = -fourThirds*rInvVec[5]*(9*(dScale*dthole_q0 + bVec[3]) + 2*(1 + alphaRVec[2])*alphaRVec[5]*X);
+    Vij[1]  += ePermCoef*rotatedQuadrupole2[0];
+    Vji[4]  += ePermCoef*rotatedDipole1.x + eUIndCoef*qiUindI.x + eUInpCoef*qiUinpI.x;
+    VijR[1] += dPermCoef*rotatedQuadrupole2[0];
+    VjiR[4] += dPermCoef*rotatedDipole1.x + dUIndCoef*qiUindI.x + dUInpCoef*qiUinpI.x;
+    Vijp[0] += eUInpCoef*rotatedQuadrupole2[0];
+    Vijd[0] += eUIndCoef*rotatedQuadrupole2[0];
+    // Q-D and Q-Uind terms (m=0)
+    Vij[4]  += -(ePermCoef*rotatedDipole2.x + eUIndCoef*qiUindJ.x + eUInpCoef*qiUinpJ.x);
+    Vji[1]  += -(ePermCoef*rotatedQuadrupole1[0]);
+    VijR[4] += -(dPermCoef*rotatedDipole2.x + dUIndCoef*qiUindJ.x + dUInpCoef*qiUinpJ.x);
+    VjiR[1] += -(dPermCoef*rotatedQuadrupole1[0]);
+    Vjip[0] += -(eUInpCoef*rotatedQuadrupole1[0]);
+    Vjid[0] += -(eUIndCoef*rotatedQuadrupole1[0]);
+    // D-Q and Uind-Q terms (m=1)
+    const real sqrtThree = SQRT((real) 3);
+    ePermCoef = -sqrtThree*rInvVec[4]*(mScale + bVec[3]);
+    eUIndCoef = -sqrtThree*rInvVec[4]*(pScale*thole_q1 + bVec[3]);
+    eUInpCoef = -sqrtThree*rInvVec[4]*(dScale*thole_q1 + bVec[3]);
+    const real fourSqrtOneThird = 4/sqrt((real) 3);
+    dPermCoef = fourSqrtOneThird*rInvVec[5]*(1.5f*(mScale + bVec[3]) + 0.5f*alphaRVec[5]*X);
+    dUIndCoef = fourSqrtOneThird*rInvVec[5]*(3*(pScale*dthole_q1 + bVec[3]) + alphaRVec[5]*X);
+    dUInpCoef = fourSqrtOneThird*rInvVec[5]*(3*(dScale*dthole_q1 + bVec[3]) + alphaRVec[5]*X);
+    Vij[2]  += ePermCoef*rotatedQuadrupole2[1];
+    Vji[5]   = ePermCoef*rotatedDipole1.y + eUIndCoef*qiUindI.y + eUInpCoef*qiUinpI.y;
+    VijR[2] += dPermCoef*rotatedQuadrupole2[1];
+    VjiR[5]  = dPermCoef*rotatedDipole1.y + dUIndCoef*qiUindI.y + dUInpCoef*qiUinpI.y;
+    Vij[3]  += ePermCoef*rotatedQuadrupole2[2];
+    Vji[6]   = ePermCoef*rotatedDipole1.z + eUIndCoef*qiUindI.z + eUInpCoef*qiUinpI.z;
+    VijR[3] += dPermCoef*rotatedQuadrupole2[2];
+    VjiR[6]  = dPermCoef*rotatedDipole1.z + dUIndCoef*qiUindI.z + dUInpCoef*qiUinpI.z;
+    Vijp[1] += eUInpCoef*rotatedQuadrupole2[1];
+    Vijd[1] += eUIndCoef*rotatedQuadrupole2[1];
+    Vijp[2] += eUInpCoef*rotatedQuadrupole2[2];
+    Vijd[2] += eUIndCoef*rotatedQuadrupole2[2];
+    // Q-D and Q-Uind terms (m=1)
+    Vij[5]   = -(ePermCoef*rotatedDipole2.y + eUIndCoef*qiUindJ.y + eUInpCoef*qiUinpJ.y);
+    Vji[2]  += -(ePermCoef*rotatedQuadrupole1[1]);
+    VijR[5]  = -(dPermCoef*rotatedDipole2.y + dUIndCoef*qiUindJ.y + dUInpCoef*qiUinpJ.y);
+    VjiR[2] += -(dPermCoef*rotatedQuadrupole1[1]);
+    Vij[6]   = -(ePermCoef*rotatedDipole2.z + eUIndCoef*qiUindJ.z + eUInpCoef*qiUinpJ.z);
+    Vji[3]  += -(ePermCoef*rotatedQuadrupole1[2]);
+    VijR[6]  = -(dPermCoef*rotatedDipole2.z + dUIndCoef*qiUindJ.z + dUInpCoef*qiUinpJ.z);
+    VjiR[3] += -(dPermCoef*rotatedQuadrupole1[2]);
+    Vjip[1] += -(eUInpCoef*rotatedQuadrupole1[1]);
+    Vjid[1] += -(eUIndCoef*rotatedQuadrupole1[1]);
+    Vjip[2] += -(eUInpCoef*rotatedQuadrupole1[2]);
+    Vjid[2] += -(eUIndCoef*rotatedQuadrupole1[2]);
+    // Q-Q terms (m=0)
+    ePermCoef = rInvVec[5]*(6*(mScale + bVec[4]) + ((real) 4/45)*(-3 + 10*alphaRVec[2])*alphaRVec[5]*X);
+    dPermCoef = -rInvVec[6]*(135*(mScale + bVec[4]) + 4*(1 + 2*alphaRVec[2])*alphaRVec[7]*X)/9;
+    Vij[4]  += ePermCoef*rotatedQuadrupole2[0];
+    Vji[4]  += ePermCoef*rotatedQuadrupole1[0];
+    VijR[4] += dPermCoef*rotatedQuadrupole2[0];
+    VjiR[4] += dPermCoef*rotatedQuadrupole1[0];
+    // Q-Q terms (m=1)
+    const real fourOverFifteen = (real) 4/15;
+    ePermCoef = -fourOverFifteen*rInvVec[5]*(15*(mScale + bVec[4]) + alphaRVec[5]*X);
+    dPermCoef = rInvVec[6]*(10*(mScale + bVec[4]) + fourThirds*alphaRVec[7]*X);
+    Vij[5]  += ePermCoef*rotatedQuadrupole2[1];
+    Vji[5]  += ePermCoef*rotatedQuadrupole1[1];
+    VijR[5] += dPermCoef*rotatedQuadrupole2[1];
+    VjiR[5] += dPermCoef*rotatedQuadrupole1[1];
+    Vij[6]  += ePermCoef*rotatedQuadrupole2[2];
+    Vji[6]  += ePermCoef*rotatedQuadrupole1[2];
+    VijR[6] += dPermCoef*rotatedQuadrupole2[2];
+    VjiR[6] += dPermCoef*rotatedQuadrupole1[2];
+    // Q-Q terms (m=2)
+    ePermCoef = rInvVec[5]*(mScale + bVec[4] - fourOverFifteen*alphaRVec[5]*X);
+    dPermCoef = -2.5f*(mScale + bVec[4])*rInvVec[6];
+    Vij[7]  = ePermCoef*rotatedQuadrupole2[3];
+    Vji[7]  = ePermCoef*rotatedQuadrupole1[3];
+    VijR[7] = dPermCoef*rotatedQuadrupole2[3];
+    VjiR[7] = dPermCoef*rotatedQuadrupole1[3];
+    Vij[8]  = ePermCoef*rotatedQuadrupole2[4];
+    Vji[8]  = ePermCoef*rotatedQuadrupole1[4];
+    VijR[8] = dPermCoef*rotatedQuadrupole2[4];
+    VjiR[8] = dPermCoef*rotatedQuadrupole1[4];
+    // Evaluate the energies, forces and torques due to permanent+induced moments
+    // interacting with just the permanent moments.
+    energy += forceFactor*0.5f*(
+        atom1.q*Vij[0] + rotatedDipole1.x*Vij[1] + rotatedDipole1.y*Vij[2] + rotatedDipole1.z*Vij[3] + rotatedQuadrupole1[0]*Vij[4] + rotatedQuadrupole1[1]*Vij[5] + rotatedQuadrupole1[2]*Vij[6] + rotatedQuadrupole1[3]*Vij[7] + rotatedQuadrupole1[4]*Vij[8] +
+        atom2.q*Vji[0] + rotatedDipole2.x*Vji[1] + rotatedDipole2.y*Vji[2] + rotatedDipole2.z*Vji[3] + rotatedQuadrupole2[0]*Vji[4] + rotatedQuadrupole2[1]*Vji[5] + rotatedQuadrupole2[2]*Vji[6] + rotatedQuadrupole2[3]*Vji[7] + rotatedQuadrupole2[4]*Vji[8]);
+    real fIZ = atom1.q*VijR[0] + rotatedDipole1.x*VijR[1] + rotatedDipole1.y*VijR[2] + rotatedDipole1.z*VijR[3] + rotatedQuadrupole1[0]*VijR[4] + rotatedQuadrupole1[1]*VijR[5] + rotatedQuadrupole1[2]*VijR[6] + rotatedQuadrupole1[3]*VijR[7] + rotatedQuadrupole1[4]*VijR[8];
+    real fJZ = atom2.q*VjiR[0] + rotatedDipole2.x*VjiR[1] + rotatedDipole2.y*VjiR[2] + rotatedDipole2.z*VjiR[3] + rotatedQuadrupole2[0]*VjiR[4] + rotatedQuadrupole2[1]*VjiR[5] + rotatedQuadrupole2[2]*VjiR[6] + rotatedQuadrupole2[3]*VjiR[7] + rotatedQuadrupole2[4]*VjiR[8];
+    real EIX = rotatedDipole1.z*Vij[1] - rotatedDipole1.x*Vij[3] + sqrtThree*rotatedQuadrupole1[2]*Vij[4] + rotatedQuadrupole1[4]*Vij[5] - (sqrtThree*rotatedQuadrupole1[0]+rotatedQuadrupole1[3])*Vij[6] + rotatedQuadrupole1[2]*Vij[7] - rotatedQuadrupole1[1]*Vij[8];
+    real EIY = -rotatedDipole1.y*Vij[1] + rotatedDipole1.x*Vij[2] - sqrtThree*rotatedQuadrupole1[1]*Vij[4] + (sqrtThree*rotatedQuadrupole1[0]-rotatedQuadrupole1[3])*Vij[5] - rotatedQuadrupole1[4]*Vij[6] + rotatedQuadrupole1[1]*Vij[7] + rotatedQuadrupole1[2]*Vij[8];
+    real EIZ = -rotatedDipole1.z*Vij[2] + rotatedDipole1.y*Vij[3] - rotatedQuadrupole1[2]*Vij[5] + rotatedQuadrupole1[1]*Vij[6] - 2*rotatedQuadrupole1[4]*Vij[7] + 2*rotatedQuadrupole1[3]*Vij[8];
+    real EJX = rotatedDipole2.z*Vji[1] - rotatedDipole2.x*Vji[3] + sqrtThree*rotatedQuadrupole2[2]*Vji[4] + rotatedQuadrupole2[4]*Vji[5] - (sqrtThree*rotatedQuadrupole2[0]+rotatedQuadrupole2[3])*Vji[6] + rotatedQuadrupole2[2]*Vji[7] - rotatedQuadrupole2[1]*Vji[8];
+    real EJY = -rotatedDipole2.y*Vji[1] + rotatedDipole2.x*Vji[2] - sqrtThree*rotatedQuadrupole2[1]*Vji[4] + (sqrtThree*rotatedQuadrupole2[0]-rotatedQuadrupole2[3])*Vji[5] - rotatedQuadrupole2[4]*Vji[6] + rotatedQuadrupole2[1]*Vji[7] + rotatedQuadrupole2[2]*Vji[8];
+    real EJZ = -rotatedDipole2.z*Vji[2] + rotatedDipole2.y*Vji[3] - rotatedQuadrupole2[2]*Vji[5] + rotatedQuadrupole2[1]*Vji[6] - 2*rotatedQuadrupole2[4]*Vji[7] + 2*rotatedQuadrupole2[3]*Vji[8];
+    // Define the torque intermediates for the induced dipoles. These are simply the induced dipole torque
+    // intermediates dotted with the field due to permanent moments only, at each center. We inline the
+    // induced dipole torque intermediates here, for simplicity. N.B. There are no torques on the dipoles
+    // themselves, so we accumulate the torque intermediates into separate variables to allow them to be
+    // used only in the force calculation.
+    //
+    // The torque about the x axis (needed to obtain the y force on the induced dipoles, below)
+    //    qiUindIx[0] = qiQUindI[2];    qiUindIx[1] = 0;    qiUindIx[2] = -qiQUindI[0]
+    real iEIX = qiUinpI.z*Vijp[0] + qiUindI.z*Vijd[0] - qiUinpI.x*Vijp[2] - qiUindI.x*Vijd[2];
+    real iEJX = qiUinpJ.z*Vjip[0] + qiUindJ.z*Vjid[0] - qiUinpJ.x*Vjip[2] - qiUindJ.x*Vjid[2];
+    // The torque about the y axis (needed to obtain the x force on the induced dipoles, below)
+    //    qiUindIy[0] = -qiQUindI[1];   qiUindIy[1] = qiQUindI[0];    qiUindIy[2] = 0
+    real iEIY = qiUinpI.x*Vijp[1] + qiUindI.x*Vijd[1] - qiUinpI.y*Vijp[0] - qiUindI.y*Vijd[0];
+    real iEJY = qiUinpJ.x*Vjip[1] + qiUindJ.x*Vjid[1] - qiUinpJ.y*Vjip[0] - qiUindJ.y*Vjid[0];
+#ifdef USE_MUTUAL_POLARIZATION
+    // Uind-Uind terms (m=0)
+    real eCoef = -fourThirds*rInvVec[3]*(3*(thole_d0 + bVec[3]) + alphaRVec[3]*X);
+    real dCoef = rInvVec[4]*(6*(dthole_d0 + bVec[3]) + 4*alphaRVec[5]*X);
+    iEIX += eCoef*(qiUinpI.z*qiUindJ.x + qiUindI.z*qiUinpJ.x);
+    iEJX += eCoef*(qiUinpJ.z*qiUindI.x + qiUindJ.z*qiUinpI.x);
+    iEIY -= eCoef*(qiUinpI.y*qiUindJ.x + qiUindI.y*qiUinpJ.x);
+    iEJY -= eCoef*(qiUinpJ.y*qiUindI.x + qiUindJ.y*qiUinpI.x);
+    fIZ += dCoef*(qiUinpI.x*qiUindJ.x + qiUindI.x*qiUinpJ.x);
+    fIZ += dCoef*(qiUinpJ.x*qiUindI.x + qiUindJ.x*qiUinpI.x);
+    // Uind-Uind terms (m=1)
+    eCoef = 2*rInvVec[3]*(thole_d1 + bVec[3] - twoThirds*alphaRVec[3]*X);
+    dCoef = -3*rInvVec[4]*(dthole_d1 + bVec[3]);
+    iEIX -= eCoef*(qiUinpI.x*qiUindJ.z + qiUindI.x*qiUinpJ.z);
+    iEJX -= eCoef*(qiUinpJ.x*qiUindI.z + qiUindJ.x*qiUinpI.z);
+    iEIY += eCoef*(qiUinpI.x*qiUindJ.y + qiUindI.x*qiUinpJ.y);
+    iEJY += eCoef*(qiUinpJ.x*qiUindI.y + qiUindJ.x*qiUinpI.y);
+    fIZ += dCoef*(qiUinpI.y*qiUindJ.y + qiUindI.y*qiUinpJ.y + qiUinpI.z*qiUindJ.z + qiUindI.z*qiUinpJ.z);
+    fIZ += dCoef*(qiUinpJ.y*qiUindI.y + qiUindJ.y*qiUinpI.y + qiUinpJ.z*qiUindI.z + qiUindJ.z*qiUinpI.z);
+#endif
-    real3 force;
+    // The quasi-internal frame forces and torques.  Note that the induced torque intermediates are
+    // used in the force expression, but not in the torques; the induced dipoles are isotropic.
+    real qiForce[3] = {rInv*(EIY+EJY+iEIY+iEJY), -rInv*(EIX+EJX+iEIX+iEJX), -(fJZ+fIZ)};
+    real qiTorqueI[3] = {-EIX, -EIY, -EIZ};
+    real qiTorqueJ[3] = {-EJX, -EJY, -EJZ};
-    if (hasExclusions) {
-        computeOneInteractionF1(atom1, atom2, delta, bn, bn5, forceFactor, dScale, pScale, mScale, force, energy);
-        computeOneInteractionF2(atom1, atom2, delta, bn, forceFactor, dScale, pScale, mScale, force, energy);
-    }
-    else {
-        computeOneInteractionF1NoScale(atom1, atom2, delta, bn, bn5, forceFactor, force, energy);
-        computeOneInteractionF2NoScale(atom1, atom2, delta, bn, forceFactor, force, energy);
-    }
+    real3 force = make_real3(qiRotationMatrix[1][1]*qiForce[0] + qiRotationMatrix[2][1]*qiForce[1] + qiRotationMatrix[0][1]*qiForce[2],
+                             qiRotationMatrix[1][2]*qiForce[0] + qiRotationMatrix[2][2]*qiForce[1] + qiRotationMatrix[0][2]*qiForce[2],
+                             qiRotationMatrix[1][0]*qiForce[0] + qiRotationMatrix[2][0]*qiForce[1] + qiRotationMatrix[0][0]*qiForce[2]);
    atom1.force += force;
-    if (forceFactor == 1)
+    atom1.torque += make_real3(qiRotationMatrix[1][1]*qiTorqueI[0] + qiRotationMatrix[2][1]*qiTorqueI[1] + qiRotationMatrix[0][1]*qiTorqueI[2],
-        atom2.force -= force;
+                               qiRotationMatrix[1][2]*qiTorqueI[0] + qiRotationMatrix[2][2]*qiTorqueI[1] + qiRotationMatrix[0][2]*qiTorqueI[2],
+                               qiRotationMatrix[1][0]*qiTorqueI[0] + qiRotationMatrix[2][0]*qiTorqueI[1] + qiRotationMatrix[0][0]*qiTorqueI[2]);
-    if (hasExclusions) {
-        computeOneInteractionT1(atom1, atom2, delta, bn, dScale, pScale, mScale);
-        computeOneInteractionT2(atom1, atom2, delta, bn, dScale, pScale, mScale);
-    }
-    else {
-        computeOneInteractionT1NoScale(atom1, atom2, delta, bn);
-        computeOneInteractionT2NoScale(atom1, atom2, delta, bn);
-    }
    if (forceFactor == 1) {
-        // T3 == T1 w/ particles I and J reversed
+        atom2.force -= force;
-        // T4 == T2 w/ particles I and J reversed
+        atom2.torque += make_real3(qiRotationMatrix[1][1]*qiTorqueJ[0] + qiRotationMatrix[2][1]*qiTorqueJ[1] + qiRotationMatrix[0][1]*qiTorqueJ[2],
+                                   qiRotationMatrix[1][2]*qiTorqueJ[0] + qiRotationMatrix[2][2]*qiTorqueJ[1] + qiRotationMatrix[0][2]*qiTorqueJ[2],
-        delta.x = -delta.x;
+                                   qiRotationMatrix[1][0]*qiTorqueJ[0] + qiRotationMatrix[2][0]*qiTorqueJ[1] + qiRotationMatrix[0][0]*qiTorqueJ[2]);
-        delta.y = -delta.y;
-        delta.z = -delta.z;
-        if (hasExclusions) {
-            computeOneInteractionT1(atom2, atom1, delta, bn, dScale, pScale, mScale);
-            computeOneInteractionT2(atom2, atom1, delta, bn, dScale, pScale, mScale);
-        }
-        else {
-            computeOneInteractionT1NoScale(atom2, atom1, delta, bn);
-            computeOneInteractionT2NoScale(atom2, atom1, delta, bn);
-        }
    }
 }
@@ -166,30 +412,27 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
 * Compute the self energy and self torque.
 */
 __device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
-    real term = 2*EWALD_ALPHA*EWALD_ALPHA;
-    real fterm = -EWALD_ALPHA/SQRT_PI;
    real cii = atom1.q*atom1.q;
-    real dii = dot(atom1.dipole, atom1.dipole);
+    real3 dipole = make_real3(atom1.sphericalDipole.y, atom1.sphericalDipole.z, atom1.sphericalDipole.x);
+    real dii = dot(dipole, dipole+atom1.inducedDipole);
 #ifdef INCLUDE_QUADRUPOLES
-    real qii = 2*(atom1.quadrupoleXX*atom1.quadrupoleXX +
+    real qii = (atom1.sphericalQuadrupole[0]*atom1.sphericalQuadrupole[0] +
-                  atom1.quadrupoleYY*atom1.quadrupoleYY +
+                atom1.sphericalQuadrupole[1]*atom1.sphericalQuadrupole[1] +
-                  atom1.quadrupoleXX*atom1.quadrupoleYY +
+                atom1.sphericalQuadrupole[2]*atom1.sphericalQuadrupole[2] +
-                  atom1.quadrupoleXY*atom1.quadrupoleXY +
+                atom1.sphericalQuadrupole[3]*atom1.sphericalQuadrupole[3] +
-                  atom1.quadrupoleXZ*atom1.quadrupoleXZ +
+                atom1.sphericalQuadrupole[4]*atom1.sphericalQuadrupole[4]);
-                  atom1.quadrupoleYZ*atom1.quadrupoleYZ);
 #else
    real qii = 0;
 #endif
-    real uii = dot(atom1.dipole, atom1.inducedDipole);
+    real prefac = -EWALD_ALPHA/SQRT_PI;
-    real selfEnergy = (cii + term*(dii/3 + 2*term*qii/5));
+    real a2 = EWALD_ALPHA*EWALD_ALPHA;
-    selfEnergy += term*uii/3;
+    real a4 = a2*a2;
-    selfEnergy *= fterm;
+    energy += prefac*(cii + ((real)2/3)*a2*dii + ((real) 4/15)*a4*qii);
-    energy += selfEnergy;
    // self-torque for PME
    real3 ui = atom1.inducedDipole+atom1.inducedDipolePolar;
-    atom1.torque += ((2/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI)*cross(atom1.dipole, ui);
+    atom1.torque += ((2/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI)*cross(dipole, ui);
 }
 /**
@@ -204,7 +447,7 @@ extern "C" __global__ void computeElectrostatics(
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
        const unsigned int* __restrict__ interactingAtoms,
 #endif
-        const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
+        const real* __restrict__ sphericalDipole, const real* __restrict__ sphericalQuadrupole, const real* __restrict__ inducedDipole,
        const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
@@ -223,7 +466,7 @@ extern "C" __global__ void computeElectrostatics(
        const unsigned int y = tileIndices.y;
        AtomData data;
        unsigned int atom1 = x*TILE_SIZE + tgx;
-        loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+        loadAtomData(data, atom1, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
        data.force = make_real3(0);
        data.torque = make_real3(0);
        uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx];
@@ -233,13 +476,13 @@ extern "C" __global__ void computeElectrostatics(
            localData[threadIdx.x].pos = data.pos;
            localData[threadIdx.x].q = data.q;
-            localData[threadIdx.x].dipole = data.dipole;
+            localData[threadIdx.x].sphericalDipole = data.sphericalDipole;
 #ifdef INCLUDE_QUADRUPOLES
-            localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
+            localData[threadIdx.x].sphericalQuadrupole[0] = data.sphericalQuadrupole[0];
-            localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
+            localData[threadIdx.x].sphericalQuadrupole[1] = data.sphericalQuadrupole[1];
-            localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
+            localData[threadIdx.x].sphericalQuadrupole[2] = data.sphericalQuadrupole[2];
-            localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
+            localData[threadIdx.x].sphericalQuadrupole[3] = data.sphericalQuadrupole[3];
-            localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
+            localData[threadIdx.x].sphericalQuadrupole[4] = data.sphericalQuadrupole[4];
 #endif
            localData[threadIdx.x].inducedDipole = data.inducedDipole;
            localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
@@ -272,7 +515,7 @@ extern "C" __global__ void computeElectrostatics(
            // This is an off-diagonal tile.
            unsigned int j = y*TILE_SIZE + tgx;
-            loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            loadAtomData(localData[threadIdx.x], j, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
            localData[threadIdx.x].force = make_real3(0);
            localData[threadIdx.x].torque = make_real3(0);
            unsigned int tj = tgx;
@@ -366,7 +609,7 @@ extern "C" __global__ void computeElectrostatics(
            // Load atom data for this tile.
            AtomData data;
-            loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            loadAtomData(data, atom1, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
            data.force = make_real3(0);
            data.torque = make_real3(0);
 #ifdef USE_CUTOFF
@@ -375,7 +618,7 @@ extern "C" __global__ void computeElectrostatics(
            unsigned int j = y*TILE_SIZE + tgx;
 #endif
            atomIndices[threadIdx.x] = j;
-            loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            loadAtomData(localData[threadIdx.x], j, posq, sphericalDipole, sphericalQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
            localData[threadIdx.x].force = make_real3(0);
            localData[threadIdx.x].torque = make_real3(0);