Python 2/3 compatibility in a single code base, plus Python 3 testing on Travis.

b7088b74 · peastman · Robert McGibbon · 4c00b312 · b7088b74 · b7088b74
Commit b7088b74, authored Aug 10, 2015 by peastman; committed by Robert McGibbon on Aug 27, 2015.
20 changed files
--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
@@ -220,16 +220,18 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
                params.push_back(mm_float2(dist13, dist12));
            }
            else
-                throw OpenMMException("Two of the three distances constrained with SETTLE must be the same.");
+                continue; // We can't handle this with SETTLE
            isShakeAtom[atom1] = true;
            isShakeAtom[atom2] = true;
            isShakeAtom[atom3] = true;
        }
+        if (atoms.size() > 0) {
            settleAtoms = OpenCLArray::create<mm_int4>(context, atoms.size(), "settleAtoms");
            settleParams = OpenCLArray::create<mm_float2>(context, params.size(), "settleParams");
            settleAtoms->upload(atoms);
            settleParams->upload(params);
        }
+    }

    // Find clusters consisting of a central atom with up to three peripheral atoms.


--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -341,6 +341,8 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {

    if (lastCutoff != kernels.cutoffDistance)
        forceRebuildNeighborList = true;
+    bool rebuild = false;
+    do {
        setPeriodicBoxArgs(context, kernels.findBlockBoundsKernel, 1);
        context.executeKernel(kernels.findBlockBoundsKernel, context.getNumAtoms());
        blockSorter->sort(*sortedBlocks);
@@ -349,6 +351,9 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {
        setPeriodicBoxArgs(context, kernels.findInteractingBlocksKernel, 0);
        context.executeKernel(kernels.findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
        forceRebuildNeighborList = false;
+        if (context.getComputeForceCount() == 1)
+            rebuild = updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
+    } while (rebuild);
    lastCutoff = kernels.cutoffDistance;
 }

@@ -360,18 +365,16 @@ void OpenCLNonbondedUtilities::computeInteractions(int forceGroups) {
        if (useCutoff)
            setPeriodicBoxArgs(context, kernels.forceKernel, 9);
        context.executeKernel(kernels.forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
-        if (context.getComputeForceCount() == 1)
-            updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
    }
 }

-void OpenCLNonbondedUtilities::updateNeighborListSize() {
+bool OpenCLNonbondedUtilities::updateNeighborListSize() {
    if (!useCutoff)
-        return;
+        return false;
    unsigned int* pinnedInteractionCount = (unsigned int*) context.getPinnedBuffer();
    interactionCount->download(pinnedInteractionCount);
    if (pinnedInteractionCount[0] <= (unsigned int) interactingTiles->getSize())
-        return;
+        return false;

    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
    // this from happening in the future.
@@ -395,6 +398,7 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
        iter->second.findInteractingBlocksKernel.setArg<cl_uint>(9, maxTiles);
    }
    forceRebuildNeighborList = true;
+    return true;
 }

 void OpenCLNonbondedUtilities::setUsePadding(bool padding) {

--- a/platforms/opencl/src/kernels/nonbonded.cl
+++ b/platforms/opencl/src/kernels/nonbonded.cl
@@ -126,7 +126,7 @@ __kernel void computeNonbonded(
 #endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef PRUNE_BY_CUTOFF
-                if (r2 < CUTOFF_SQUARED) {
+                if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                    real invR = RSQRT(r2);
                    real r = r2*invR;
@@ -300,7 +300,7 @@ __kernel void computeNonbonded(
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef PRUNE_BY_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
+                    if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;
@@ -352,7 +352,7 @@ __kernel void computeNonbonded(
 #endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef PRUNE_BY_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
+                    if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;

--- a/platforms/opencl/src/kernels/nonbonded_cpu.cl
+++ b/platforms/opencl/src/kernels/nonbonded_cpu.cl
@@ -70,7 +70,7 @@ __kernel void computeNonbonded(
 #endif
                    real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
+                    if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;
@@ -138,7 +138,7 @@ __kernel void computeNonbonded(
 #endif
                    real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
+                    if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;
@@ -304,7 +304,7 @@ __kernel void computeNonbonded(
                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
                        real r2 = dot(delta.xyz, delta.xyz);
-                        if (r2 < CUTOFF_SQUARED) {
+                        if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
                            real invR = RSQRT(r2);
                            real r = r2*invR;
                            unsigned int atom2 = j;
@@ -367,7 +367,7 @@ __kernel void computeNonbonded(
 #endif
                        real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
-                        if (r2 < CUTOFF_SQUARED) {
+                        if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                            real invR = RSQRT(r2);
                            real r = r2*invR;

--- a/platforms/opencl/staticTarget/CMakeLists.txt
+++ b/platforms/opencl/staticTarget/CMakeLists.txt
@@ -15,6 +15,6 @@ ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${AP

 TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME}  ${OPENCL_LIBRARIES} ${PTHREADS_LIB_STATIC})
 #-DPTW32_STATIC_LIB only works for the windows pthreads.
-SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_OPENCL_BUILDING_STATIC_LIBRARY -DPTW32_STATIC_LIB")
+SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_OPENCL_BUILDING_STATIC_LIBRARY -DPTW32_STATIC_LIB")

 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
--- a/platforms/opencl/tests/CMakeLists.txt
+++ b/platforms/opencl/tests/CMakeLists.txt
@@ -25,7 +25,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    # Link with shared library
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET})
-    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")

    ADD_TEST(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
    IF (OPENMM_BUILD_OPENCL_DOUBLE_PRECISION_TESTS)

--- a/platforms/reference/src/SimTKReference/ReferenceConstraints.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceConstraints.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -98,14 +98,13 @@ ReferenceConstraints::ReferenceConstraints(const System& system) : ccma(NULL), s
    // Record the SETTLE clusters.

    vector<bool> isSettleAtom(numParticles, false);
-    int numSETTLE = settleClusters.size();
-    if (numSETTLE > 0) {
-        vector<int> atom1(numSETTLE);
-        vector<int> atom2(numSETTLE);
-        vector<int> atom3(numSETTLE);
-        vector<RealOpenMM> distance1(numSETTLE);
-        vector<RealOpenMM> distance2(numSETTLE);
-        for (int i = 0; i < numSETTLE; i++) {
+    if (settleClusters.size() > 0) {
+        vector<int> atom1;
+        vector<int> atom2;
+        vector<int> atom3;
+        vector<RealOpenMM> distance1;
+        vector<RealOpenMM> distance2;
+        for (int i = 0; i < settleClusters.size(); i++) {
            int p1 = settleClusters[i];
            int p2 = settleConstraints[p1].begin()->first;
            int p3 = (++settleConstraints[p1].begin())->first;
@@ -114,34 +113,35 @@ ReferenceConstraints::ReferenceConstraints(const System& system) : ccma(NULL), s
            float dist23 = settleConstraints[p2].find(p3)->second;
            if (dist12 == dist13) {
                // p1 is the central atom
-                atom1[i] = p1;
-                atom2[i] = p2;
-                atom3[i] = p3;
-                distance1[i] = dist12;
-                distance2[i] = dist23;
+                atom1.push_back(p1);
+                atom2.push_back(p2);
+                atom3.push_back(p3);
+                distance1.push_back(dist12);
+                distance2.push_back(dist23);
            }
            else if (dist12 == dist23) {
                // p2 is the central atom
-                atom1[i] = p2;
-                atom2[i] = p1;
-                atom3[i] = p3;
-                distance1[i] = dist12;
-                distance2[i] = dist13;
+                atom1.push_back(p2);
+                atom2.push_back(p1);
+                atom3.push_back(p3);
+                distance1.push_back(dist12);
+                distance2.push_back(dist13);
            }
            else if (dist13 == dist23) {
                // p3 is the central atom
-                atom1[i] = p3;
-                atom2[i] = p1;
-                atom3[i] = p2;
-                distance1[i] = dist13;
-                distance2[i] = dist12;
+                atom1.push_back(p3);
+                atom2.push_back(p1);
+                atom3.push_back(p2);
+                distance1.push_back(dist13);
+                distance2.push_back(dist12);
            }
            else
-                throw OpenMMException("Two of the three distances constrained with SETTLE must be the same.");
+                continue; // We can't handle this with SETTLE
            isSettleAtom[p1] = true;
            isSettleAtom[p2] = true;
            isSettleAtom[p3] = true;
        }
+        if (atom1.size() > 0)
            settle = new ReferenceSETTLEAlgorithm(atom1, atom2, atom3, distance1, distance2, masses);
    }


--- a/platforms/reference/tests/CMakeLists.txt
+++ b/platforms/reference/tests/CMakeLists.txt
@@ -15,7 +15,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    ELSE (OPENMM_BUILD_SHARED_LIB)
        TARGET_LINK_LIBRARIES(${TEST_ROOT} ${STATIC_TARGET})
    ENDIF (OPENMM_BUILD_SHARED_LIB)
-    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})

 ENDFOREACH(TEST_PROG ${TEST_PROGS})

--- a/plugins/amoeba/CMakeLists.txt
+++ b/plugins/amoeba/CMakeLists.txt
@@ -86,14 +86,14 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
 INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)

 ADD_LIBRARY(${SHARED_AMOEBA_TARGET} SHARED ${SOURCE_AMOEBA_FILES} ${SOURCE_AMOEBA_INCLUDE_FILES} ${API_AMOEBA_ABS_INCLUDE_FILES})
-SET_TARGET_PROPERTIES(${SHARED_AMOEBA_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_AMOEBA_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY")
+SET_TARGET_PROPERTIES(${SHARED_AMOEBA_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_AMOEBA_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY")

 FILE(GLOB serialization_files  ${CMAKE_CUURENT_SOURCE_DIR}/serialization/src/*.cpp)
 SET_SOURCE_FILES_PROPERTIES(${serialization_files} PROPERTIES COMPILE_FLAGS "-DOPENMM_AMOEBA_BUILDING_SHARED_LIBRARY -DTIXML_USE_STL")

 IF(OPENMM_BUILD_STATIC_LIB)
  ADD_LIBRARY(${STATIC_AMOEBA_TARGET} STATIC ${SOURCE_AMOEBA_FILES} ${SOURCE_AMOEBA_INCLUDE_FILES} ${API_AMOEBA_ABS_INCLUDE_FILES})
-  SET_TARGET_PROPERTIES(${STATIC_AMOEBA_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_AMOEBA_BUILDING_STATIC_LIBRARY -DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY")
+  SET_TARGET_PROPERTIES(${STATIC_AMOEBA_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_AMOEBA_BUILDING_STATIC_LIBRARY -DOPENMM_USE_STATIC_LIBRARIES -DOPENMM_BUILDING_STATIC_LIBRARY -DLEPTON_USE_STATIC_LIBRARIES -DLEPTON_BUILDING_STATIC_LIBRARY")
 ENDIF(OPENMM_BUILD_STATIC_LIB)

 IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)

--- a/plugins/amoeba/platforms/cuda/CMakeLists.txt
+++ b/plugins/amoeba/platforms/cuda/CMakeLists.txt
@@ -94,7 +94,7 @@ SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE
 IF (APPLE)
    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
 ELSE (APPLE)
-    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
 ENDIF (APPLE)

 INSTALL(TARGETS ${SHARED_TARGET} DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/plugins)

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -806,8 +806,8 @@ private:

 CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : 
        CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false),
-        multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL), fracDipoles(NULL),
-        fracQuadrupoles(NULL), field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), inducedDipole(NULL),
+        multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL), sphericalDipoles(NULL), sphericalQuadrupoles(NULL),
+        fracDipoles(NULL), fracQuadrupoles(NULL), field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), inducedDipole(NULL),
        diisCoefficients(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), prevDipoles(NULL), prevDipolesPolar(NULL), prevDipolesGk(NULL),
        prevDipolesGkPolar(NULL), prevErrors(NULL), diisMatrix(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
        pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL),
@@ -826,6 +826,10 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
        delete labFrameDipoles;
    if (labFrameQuadrupoles != NULL)
        delete labFrameQuadrupoles;
+    if (sphericalDipoles != NULL)
+        delete sphericalDipoles;
+    if (sphericalQuadrupoles != NULL)
+        delete sphericalQuadrupoles;
    if (fracDipoles != NULL)
        delete fracDipoles;
    if (fracQuadrupoles != NULL)
@@ -985,6 +989,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
    labFrameDipoles = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "labFrameDipoles");
    labFrameQuadrupoles = new CudaArray(cu, 5*paddedNumAtoms, elementSize, "labFrameQuadrupoles");
+    sphericalDipoles = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "sphericalDipoles");
+    sphericalQuadrupoles = new CudaArray(cu, 5*paddedNumAtoms, elementSize, "sphericalQuadrupoles");
    fracDipoles = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "fracDipoles");
    fracQuadrupoles = new CudaArray(cu, 6*paddedNumAtoms, elementSize, "fracQuadrupoles");
    field = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "field");
@@ -1144,6 +1150,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    if (maxInducedIterations > 0) {
        defines["THREAD_BLOCK_SIZE"] = cu.intToString(inducedFieldThreads);
        defines["MAX_PREV_DIIS_DIPOLES"] = cu.intToString(MaxPrevDIISDipoles);
+        defines["USE_MUTUAL_POLARIZATION"] = "1";
        module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleInducedField, defines);
        computeInducedFieldKernel = cu.getKernel(module, "computeInducedField");
        updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldByDIIS");
@@ -1151,33 +1158,13 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        buildMatrixKernel = cu.getKernel(module, "computeDIISMatrix");
    }
    stringstream electrostaticsSource;
-    if (usePME) {
    electrostaticsSource << CudaKernelSources::vectorOps;
+    electrostaticsSource << CudaAmoebaKernelSources::sphericalMultipoles;
+    if (usePME)
        electrostaticsSource << CudaAmoebaKernelSources::pmeMultipoleElectrostatics;
-        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::pmeElectrostaticPairForce : CudaAmoebaKernelSources::pmeElectrostaticPairForceNoQuadrupoles);
-        electrostaticsSource << "#define APPLY_SCALE\n";
-        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::pmeElectrostaticPairForce : CudaAmoebaKernelSources::pmeElectrostaticPairForceNoQuadrupoles);
-        electrostaticsThreadMemory = 24*elementSize+3*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
-        if (!useShuffle)
-            electrostaticsThreadMemory += 3*elementSize;
-    }
-    else {
-        electrostaticsSource << CudaKernelSources::vectorOps;
+    else
        electrostaticsSource << CudaAmoebaKernelSources::multipoleElectrostatics;
-        electrostaticsSource << "#define F1\n";
-        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::electrostaticPairForce : CudaAmoebaKernelSources::electrostaticPairForceNoQuadrupoles);
-        electrostaticsSource << "#undef F1\n";
-        electrostaticsSource << "#define T1\n";
-        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::electrostaticPairForce : CudaAmoebaKernelSources::electrostaticPairForceNoQuadrupoles);
-        electrostaticsSource << "#undef T1\n";
-        electrostaticsSource << "#define T3\n";
-        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::electrostaticPairForce : CudaAmoebaKernelSources::electrostaticPairForceNoQuadrupoles);
-        electrostaticsThreadMemory = 21*elementSize+2*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
-        if (!useShuffle)
-            electrostaticsThreadMemory += 3*elementSize;
-        if (gk != NULL)
-            electrostaticsThreadMemory += 4*elementSize;
-    }
+    electrostaticsThreadMemory = 24*elementSize+3*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
    electrostaticsThreads = min(maxThreads, cu.computeThreadBlockSize(electrostaticsThreadMemory));
    defines["THREAD_BLOCK_SIZE"] = cu.intToString(electrostaticsThreads);
    module = cu.createModule(electrostaticsSource.str(), defines);
@@ -1433,7 +1420,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in

    void* computeMomentsArgs[] = {&cu.getPosq().getDevicePointer(), &multipoleParticles->getDevicePointer(),
        &molecularDipoles->getDevicePointer(), &molecularQuadrupoles->getDevicePointer(),
-        &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer()};
+        &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
+        &sphericalDipoles->getDevicePointer(), &sphericalQuadrupoles->getDevicePointer()};
    cu.executeKernel(computeMomentsKernel, computeMomentsArgs, cu.getNumAtoms());
    int startTileIndex = nb.getStartTileIndex();
    int numTileIndices = nb.getNumTiles();
@@ -1497,8 +1485,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        void* electrostaticsArgs[] = {&cu.getForce().getDevicePointer(), &torque->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
            &cu.getPosq().getDevicePointer(), &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(),
            &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
-            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
-            &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
+            &sphericalDipoles->getDevicePointer(), &sphericalQuadrupoles->getDevicePointer(),
+            &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
        cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
        if (gkKernel != NULL)
            gkKernel->finishComputation(*torque, *labFrameDipoles, *labFrameQuadrupoles, *inducedDipole, *inducedDipolePolar, *dampingAndThole, *covalentFlags, *polarizationGroupFlags);
@@ -1652,8 +1640,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(),
            cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
            &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
-            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
-            &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
+            &sphericalDipoles->getDevicePointer(), &sphericalQuadrupoles->getDevicePointer(),
+            &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
        cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
        void* pmeTransformInducedPotentialArgs[] = {&pmePhidp->getDevicePointer(), &pmeCphi->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeTransformPotentialKernel, pmeTransformInducedPotentialArgs, cu.getNumAtoms());

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -392,6 +392,8 @@ private:
    CudaArray* molecularQuadrupoles;
    CudaArray* labFrameDipoles;
    CudaArray* labFrameQuadrupoles;
+    CudaArray* sphericalDipoles;
+    CudaArray* sphericalQuadrupoles;
    CudaArray* fracDipoles;
    CudaArray* fracQuadrupoles;
    CudaArray* field;

--- a/plugins/amoeba/platforms/cuda/src/kernels/electrostaticPairForce.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/electrostaticPairForce.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/electrostaticPairForceNoQuadrupoles.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/electrostaticPairForceNoQuadrupoles.cu
-/**
- * This defines three different closely related functions, depending on which constant (F1, T1, or T3) is defined.
- */
-
-#if defined F1
-__device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real& energy, real3& outputForce) {
-#elif defined T1
-__device__ void computeOneInteractionT1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce) {
-#else
-__device__ void computeOneInteractionT3(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce) {
-#endif
-    
-#ifdef F1
-    const float uScale = 1;
-    real ddsc3_0 = 0;
-    real ddsc3_1 = 0;
-    real ddsc3_2 = 0;
-
-    real ddsc5_0 = 0;
-    real ddsc5_1 = 0;
-    real ddsc5_2 = 0;
-
-    real ddsc7_0 = 0;
-    real ddsc7_1 = 0;
-    real ddsc7_2 = 0;
-#endif
-
-    real xr = atom2.posq.x - atom1.posq.x;
-    real yr = atom2.posq.y - atom1.posq.y;
-    real zr = atom2.posq.z - atom1.posq.z;
-    
-    real r2 = xr*xr + yr*yr + zr*zr;
-    real r = SQRT(r2);
-    real rr1 = RECIP(r);
-    real rr2 = rr1*rr1;
-    real rr3 = rr1*rr2;
-    real rr5 = 3*rr3*rr2;
-    real rr7 = 5*rr5*rr2;
-    real rr9 = 7*rr7*rr2;
-#ifdef F1
-    real rr11 = 9*rr9*rr2;
-#endif
-
-    real scale3 = 1;
-    real scale5 = 1;
-    real scale7 = 1;
-
-    real pdamp = atom1.damp*atom2.damp;
-    if (pdamp != 0) {
-   
-        real ratio = r/pdamp;
-        float pGamma = atom2.thole > atom1.thole ? atom1.thole : atom2.thole;
-
-        real damp = ratio*ratio*ratio*pGamma;
-        real dampExp = EXP(-damp);
-        real damp1 = damp + 1;
-        real damp2 = damp*damp;
-
-        scale3 = 1 - dampExp;
-        scale5 = 1 - damp1*dampExp;
-        scale7 = 1 - (damp1 + 0.6f*damp2)*dampExp;
-
-#ifdef F1
-        real factor = 3*damp*dampExp*rr2;
-        real factor7 = -0.2f + 0.6f*damp;
-        
-        ddsc3_0 = factor*xr;
-        ddsc5_0 = ddsc3_0*damp;
-        ddsc7_0 = ddsc5_0*factor7;
-
-        ddsc3_1 = factor*yr;
-        ddsc5_1 = ddsc3_1*damp;
-        ddsc7_1 = ddsc5_1*factor7;
-
-        ddsc3_2 = factor*zr;
-        ddsc5_2 = ddsc3_2*damp;
-        ddsc7_2 = ddsc5_2*factor7;
-#endif
-
-    }
-      
-#if defined F1
-    real scale3i = rr3*scale3*uScale;
-    real scale5i = rr5*scale5*uScale;
-#endif
-    real dsc3 = rr3*scale3*dScale;
-    real psc3 = rr3*scale3*pScale;
-
-    real dsc5 = rr5*scale5*dScale;
-    real psc5 = rr5*scale5*pScale;
-
-    real dsc7 = rr7*scale7*dScale;
-    real psc7 = rr7*scale7*pScale;
-
-#if defined F1
-    real sc2 = atom1.dipole.x*atom2.dipole.x + atom1.dipole.y*atom2.dipole.y + atom1.dipole.z*atom2.dipole.z;
-#endif
-#if defined F1 || defined T1
-    real sc4 = atom2.dipole.x*xr + atom2.dipole.y*yr + atom2.dipole.z*zr;
-#endif
-
-#if defined F1 || defined T3
-    real sc3 = atom1.dipole.x*xr + atom1.dipole.y*yr + atom1.dipole.z*zr;
-#endif
-    
-#if defined F1
-    real sci1 = atom1.inducedDipole.x*atom2.dipole.x + atom1.inducedDipole.y*atom2.dipole.y + atom1.inducedDipole.z*atom2.dipole.z +
-                atom2.inducedDipole.x*atom1.dipole.x + atom2.inducedDipole.y*atom1.dipole.y + atom2.inducedDipole.z*atom1.dipole.z;
-#endif
-        
-#if defined F1 || defined T3
-    real sci3 = atom1.inducedDipole.x*xr + atom1.inducedDipole.y*yr + atom1.inducedDipole.z*zr;
-#endif
-#if defined F1 || defined T1
-    real sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
-#endif
-    
-#if defined F1
-    real scip1 = atom1.inducedDipolePolar.x*atom2.dipole.x + atom1.inducedDipolePolar.y*atom2.dipole.y + atom1.inducedDipolePolar.z*atom2.dipole.z +
-                 atom2.inducedDipolePolar.x*atom1.dipole.x + atom2.inducedDipolePolar.y*atom1.dipole.y + atom2.inducedDipolePolar.z*atom1.dipole.z;
-
-    real scip2 = atom1.inducedDipole.x*atom2.inducedDipolePolar.x + atom1.inducedDipole.y*atom2.inducedDipolePolar.y + atom1.inducedDipole.z*atom2.inducedDipolePolar.z +
-                 atom2.inducedDipole.x*atom1.inducedDipolePolar.x + atom2.inducedDipole.y*atom1.inducedDipolePolar.y + atom2.inducedDipole.z*atom1.inducedDipolePolar.z;
-
-#endif
-    
-#if defined F1 || defined T3
-    real scip3 = ((atom1.inducedDipolePolar.x)*(xr) + (atom1.inducedDipolePolar.y)*(yr) + (atom1.inducedDipolePolar.z)*(zr));
-#endif
-#if defined F1 || defined T1
-    real scip4 = ((atom2.inducedDipolePolar.x)*(xr) + (atom2.inducedDipolePolar.y)*(yr) + (atom2.inducedDipolePolar.z)*(zr));
-#endif
-
-#ifdef F1
-    real gli1 = atom2.posq.w*sci3 - atom1.posq.w*sci4;
-    
-    real gli6 = sci1;
-    real glip1 = atom2.posq.w*scip3 - atom1.posq.w*scip4;
-    real glip6 = scip1;
-    real gli2 = -sc3*sci4 - sci3*sc4;
-    
-    real glip2 = -sc3*scip4 - scip3*sc4;
-    real factor3 = rr3*((gli1  +  gli6)*pScale + (glip1  + glip6)*dScale);
-    real factor5 = rr5*(gli2*pScale + glip2*dScale);
-    
-    real ftm2i_0 = -0.5f*(factor3*ddsc3_0 + factor5*ddsc5_0);
-    real ftm2i_1 = -0.5f*(factor3*ddsc3_1 + factor5*ddsc5_1);
-    real ftm2i_2 = -0.5f*(factor3*ddsc3_2 + factor5*ddsc5_2);
-      
-    real gl0 = atom1.posq.w*atom2.posq.w;
-    real gl1 = atom2.posq.w*sc3 - atom1.posq.w*sc4;
-    real gl2 = -sc3*sc4;
-    real gl6 = sc2;
-    
-    real gf1 = rr3*gl0 + rr5*(gl1+gl6) + rr7*gl2;
-#endif
-#if defined F1 || defined T1
-    real gf2 = -atom2.posq.w*rr3 + sc4*rr5;
-    real gf5 = 2*(-atom2.posq.w*rr5+sc4*rr7);
-#endif
-#if defined F1 || defined T3
-    real gf3 =  atom1.posq.w*rr3 + sc3*rr5;
-    real gf6 = 2*(-atom1.posq.w*rr5-sc3*rr7);
-#endif
-
-#ifdef F1
-    real em = mScale*(rr1*gl0 + rr3*(gl1+gl6) + rr5*gl2);
-    real ei = 0.5f*((gli1+gli6)*psc3 + gli2*psc5);
-    energy = em+ei;
-#endif
-    
-#ifdef F1
-    real ftm2_0 = mScale*(gf1*xr + gf2*atom1.dipole.x + gf3*atom2.dipole.x);
-    real ftm2_1 = mScale*(gf1*yr + gf2*atom1.dipole.y + gf3*atom2.dipole.y);
-    real ftm2_2 = mScale*(gf1*zr + gf2*atom1.dipole.z + gf3*atom2.dipole.z);
-
-    real gfi1 = rr2*(1.5f*((gli1+gli6)*psc3 + (glip1+glip6)*dsc3 + scip2*scale3i) + 2.5f*(gli2*psc5 + glip2*dsc5 - (sci3*scip4+scip3*sci4)*scale5i));
-    ftm2i_0 += gfi1*xr;
-    ftm2i_1 += gfi1*yr;
-    ftm2i_2 += gfi1*zr;
-#endif
-
-#if defined F1 || defined T1
-    real gfi5 = (sci4*psc7 + scip4*dsc7);
-#endif
-#if defined F1 || defined T3
-    real gfi6 = -(sci3*psc7 + scip3*dsc7);
-#endif
-
-#ifdef F1
-    ftm2i_0 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.x*psc3 + atom1.inducedDipolePolar.x*dsc3) +
-               sc4*(atom1.inducedDipole.x*psc5 + atom1.inducedDipolePolar.x*dsc5)) +
-      
-               0.5f*(atom1.posq.w*(atom2.inducedDipole.x*psc3+atom2.inducedDipolePolar.x*dsc3) +
-               sc3*(atom2.inducedDipole.x*psc5 +atom2.inducedDipolePolar.x*dsc5)) +
-
-               scale5i*(sci4*atom1.inducedDipolePolar.x+scip4*atom1.inducedDipole.x +
-                        sci3*atom2.inducedDipolePolar.x+scip3*atom2.inducedDipole.x)*0.5f +
-      
-               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.x +
-               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.x;
-      
-    ftm2i_1 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.y*psc3 + atom1.inducedDipolePolar.y*dsc3) +
-               sc4*(atom1.inducedDipole.y*psc5 + atom1.inducedDipolePolar.y*dsc5)) +
-
-               (atom1.posq.w*(atom2.inducedDipole.y*psc3+atom2.inducedDipolePolar.y*dsc3) +
-                    sc3*(atom2.inducedDipole.y*psc5+atom2.inducedDipolePolar.y*dsc5))*0.5f +
-                    scale5i*(sci4*atom1.inducedDipolePolar.y+scip4*atom1.inducedDipole.y + sci3*atom2.inducedDipolePolar.y+scip3*atom2.inducedDipole.y)*0.5f +
-
-               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.y +
-               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.y;
-      
-    ftm2i_2 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.z*psc3 + atom1.inducedDipolePolar.z*dsc3) +
-               sc4*(atom1.inducedDipole.z*psc5 + atom1.inducedDipolePolar.z*dsc5)) +
-
-               (atom1.posq.w*(atom2.inducedDipole.z*psc3+atom2.inducedDipolePolar.z*dsc3) +
-                    sc3*(atom2.inducedDipole.z*psc5+atom2.inducedDipolePolar.z*dsc5))*0.5f +
-                    scale5i*(sci4*atom1.inducedDipolePolar.z+scip4*atom1.inducedDipole.z +
-                    sci3*atom2.inducedDipolePolar.z+scip3*atom2.inducedDipole.z)*0.5f +
-
-               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.z +
-               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.z;
-
-#ifdef DIRECT_POLARIZATION
-    real gfd = 0.5*(3*rr2*scip2*scale3i - 5*rr2*(scip3*sci4+sci3*scip4)*scale5i);
-    real temp5 = 0.5*scale5i;
-    real fdir_0 = gfd*xr + temp5*(sci4*atom1.inducedDipolePolar.x + scip4*atom1.inducedDipole.x + sci3*atom2.inducedDipolePolar.x + scip3*atom2.inducedDipole.x);
-    real fdir_1 = gfd*yr + temp5*(sci4*atom1.inducedDipolePolar.y + scip4*atom1.inducedDipole.y + sci3*atom2.inducedDipolePolar.y + scip3*atom2.inducedDipole.y);
-    real fdir_2 = gfd*zr + temp5*(sci4*atom1.inducedDipolePolar.z + scip4*atom1.inducedDipole.z + sci3*atom2.inducedDipolePolar.z + scip3*atom2.inducedDipole.z);
-    ftm2i_0 -= fdir_0;
-    ftm2i_1 -= fdir_1;
-    ftm2i_2 -= fdir_2;
-#else
-    real scaleF = 0.5f*uScale;
-    real inducedFactor3 = scip2*rr3*scaleF;
-    real inducedFactor5 = (sci3*scip4+scip3*sci4)*rr5*scaleF;
-    real findmp_0 = inducedFactor3*ddsc3_0 - inducedFactor5*ddsc5_0;
-    real findmp_1 = inducedFactor3*ddsc3_1 - inducedFactor5*ddsc5_1;
-    real findmp_2 = inducedFactor3*ddsc3_2 - inducedFactor5*ddsc5_2;
-    ftm2i_0 -= findmp_0;
-    ftm2i_1 -= findmp_1;
-    ftm2i_2 -= findmp_2;
-#endif
-#endif
-
-#if defined T1
-    real gti2 = 0.5f*(sci4*psc5+scip4*dsc5);
-    real gti5 = gfi5;
-#endif
-#if defined T3
-    real gti3 = 0.5f*(sci3*psc5+scip3*dsc5);
-    real gti6 = gfi6;
-#endif
-
-#if defined T1 || defined T3
-    real dixdk_0 = atom1.dipole.y*atom2.dipole.z - atom1.dipole.z*atom2.dipole.y;
-    real dixdk_1 = atom1.dipole.z*atom2.dipole.x - atom1.dipole.x*atom2.dipole.z;
-    real dixdk_2 = atom1.dipole.x*atom2.dipole.y - atom1.dipole.y*atom2.dipole.x;
-
-#if defined T1
-    real dixuk_0 = atom1.dipole.y*atom2.inducedDipole.z - atom1.dipole.z*atom2.inducedDipole.y;
-    real dixuk_1 = atom1.dipole.z*atom2.inducedDipole.x - atom1.dipole.x*atom2.inducedDipole.z;
-    real dixuk_2 = atom1.dipole.x*atom2.inducedDipole.y - atom1.dipole.y*atom2.inducedDipole.x;
-#endif
-#endif
-
-#ifdef T1
-    real dixukp_0 = atom1.dipole.y*atom2.inducedDipolePolar.z - atom1.dipole.z*atom2.inducedDipolePolar.y;
-    real dixukp_1 = atom1.dipole.z*atom2.inducedDipolePolar.x - atom1.dipole.x*atom2.inducedDipolePolar.z;
-    real dixukp_2 = atom1.dipole.x*atom2.inducedDipolePolar.y - atom1.dipole.y*atom2.inducedDipolePolar.x;
-#endif
-
-#ifdef T1
-    real dixr_0 = atom1.dipole.y*zr - atom1.dipole.z*yr;
-    real dixr_1 = atom1.dipole.z*xr - atom1.dipole.x*zr;
-    real dixr_2 = atom1.dipole.x*yr - atom1.dipole.y*xr;
-#endif
-
-#ifdef T1
-    real ttm2_0 = -rr3*dixdk_0 + gf2*dixr_0;
-    real ttm2_1 = -rr3*dixdk_1 + gf2*dixr_1;
-    real ttm2_2 = -rr3*dixdk_2 + gf2*dixr_2;
-
-    real ttm2i_0 = -(dixuk_0*psc3+dixukp_0*dsc3)*0.5f + gti2*dixr_0;
-    real ttm2i_1 = -(dixuk_1*psc3+dixukp_1*dsc3)*0.5f + gti2*dixr_1;
-    real ttm2i_2 = -(dixuk_2*psc3+dixukp_2*dsc3)*0.5f + gti2*dixr_2;
-#endif
-
-#ifdef T3
-    real dkxr_0 = atom2.dipole.y*zr - atom2.dipole.z*yr;
-    real dkxr_1 = atom2.dipole.z*xr - atom2.dipole.x*zr;
-    real dkxr_2 = atom2.dipole.x*yr - atom2.dipole.y*xr;
-
-    real dkxui_0 = atom2.dipole.y*atom1.inducedDipole.z - atom2.dipole.z*atom1.inducedDipole.y;
-    real dkxui_1 = atom2.dipole.z*atom1.inducedDipole.x - atom2.dipole.x*atom1.inducedDipole.z; 
-    real dkxui_2 = atom2.dipole.x*atom1.inducedDipole.y - atom2.dipole.y*atom1.inducedDipole.x;
-
-    real dkxuip_0 = atom2.dipole.y*atom1.inducedDipolePolar.z - atom2.dipole.z*atom1.inducedDipolePolar.y;
-    real dkxuip_1 = atom2.dipole.z*atom1.inducedDipolePolar.x - atom2.dipole.x*atom1.inducedDipolePolar.z;
-    real dkxuip_2 = atom2.dipole.x*atom1.inducedDipolePolar.y - atom2.dipole.y*atom1.inducedDipolePolar.x;
-
-    real ttm3_0 =  rr3*dixdk_0 + gf3*dkxr_0;
-    real ttm3_1 =  rr3*dixdk_1 + gf3*dkxr_1;
-    real ttm3_2 =  rr3*dixdk_2 + gf3*dkxr_2;
-
-    real ttm3i_0 = -(dkxui_0*psc3+ dkxuip_0*dsc3)*0.5f + gti3*dkxr_0;
-    real ttm3i_1 = -(dkxui_1*psc3+ dkxuip_1*dsc3)*0.5f + gti3*dkxr_1;
-    real ttm3i_2 = -(dkxui_2*psc3+ dkxuip_2*dsc3)*0.5f + gti3*dkxr_2;
-#endif
-
-    if (mScale < 1) {
-#ifdef T1
-        ttm2_0 *= mScale;
-        ttm2_1 *= mScale;
-        ttm2_2 *= mScale;
-#endif
-        
-#ifdef T3
-        ttm3_0 *= mScale;
-        ttm3_1 *= mScale;
-        ttm3_2 *= mScale;
-#endif
-    }
-
-#ifdef F1
-    outputForce.x = -(ftm2_0+ftm2i_0);
-    outputForce.y = -(ftm2_1+ftm2i_1);
-    outputForce.z = -(ftm2_2+ftm2i_2);
-#endif
-    
-#ifdef T1
-    outputForce.x = (ttm2_0 + ttm2i_0);
-    outputForce.y = (ttm2_1 + ttm2i_1);
-    outputForce.z = (ttm2_2 + ttm2i_2);
-#endif
-
-#ifdef T3
-    outputForce.x = (ttm3_0 + ttm3i_0);
-    outputForce.y = (ttm3_1 + ttm3i_1);
-    outputForce.z = (ttm3_2 + ttm3i_2);
-#endif
-}
--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoles.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoles.cu
 extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4* __restrict__ multipoleParticles, float* __restrict__ molecularDipoles,
-        float* __restrict__ molecularQuadrupoles, real* __restrict__ labFrameDipoles, real* __restrict__ labFrameQuadrupoles) {
+        float* __restrict__ molecularQuadrupoles, real* __restrict__ labFrameDipoles, real* __restrict__ labFrameQuadrupoles,
+        real* __restrict__ sphericalDipoles, real* __restrict__ sphericalQuadrupoles) {
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += gridDim.x*blockDim.x) {
+        // Load the spherical multipoles.
+        
+        int offset = 3*atom;
+        sphericalDipoles[offset+0] = molecularDipoles[offset+2]; // z -> Q_10
+        sphericalDipoles[offset+1] = molecularDipoles[offset+0]; // x -> Q_11c
+        sphericalDipoles[offset+2] = molecularDipoles[offset+1]; // y -> Q_11s
+        offset = 5*atom;
+        sphericalQuadrupoles[offset+0] = -3.0f*(molecularQuadrupoles[offset+0]+molecularQuadrupoles[offset+3]); // zz -> Q_20
+        sphericalQuadrupoles[offset+1] = (2*SQRT((real) 3))*molecularQuadrupoles[offset+2]; // xz -> Q_21c
+        sphericalQuadrupoles[offset+2] = (2*SQRT((real) 3))*molecularQuadrupoles[offset+4]; // yz -> Q_21s
+        sphericalQuadrupoles[offset+3] = SQRT((real) 3)*(molecularQuadrupoles[offset+0]-molecularQuadrupoles[offset+3]); // xx-yy -> Q_22c
+        sphericalQuadrupoles[offset+4] = (2*SQRT((real) 3))*molecularQuadrupoles[offset+1]; // xy -> Q_22s
+        
        // get coordinates of this atom and the z & x axis atoms
        // compute the vector between the atoms and 1/sqrt(d2), d2 is distance between
        // this atom and the axis atom
@@ -8,7 +23,6 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4

        // code common to ZThenX and Bisector
        
-    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += gridDim.x*blockDim.x) {
        int4 particles = multipoleParticles[atom];
        if (particles.x >= 0 && particles.z >= 0) {
            real4 thisParticlePos = posq[atom];
@@ -149,7 +163,7 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
        
            // Transform the dipole
            
-            unsigned int offset = 3*atom;
+            offset = 3*atom;
            real molDipole[3];
            molDipole[0] = molecularDipoles[offset];
            molDipole[1] = molecularDipoles[offset+1];
@@ -192,6 +206,67 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
            labFrameQuadrupoles[offset+4] = vectorX.y*(vectorX.z*mPoleXX + vectorY.z*mPoleXY + vectorZ.z*mPoleXZ)
                                        + vectorY.y*(vectorX.z*mPoleXY + vectorY.z*mPoleYY + vectorZ.z*mPoleYZ)
                                        + vectorZ.y*(vectorX.z*mPoleXZ + vectorY.z*mPoleYZ + vectorZ.z*mPoleZZ);
+            
+            // ---------------------------------------------------------------------------------------
+            
+            // Now transform the spherical multipoles.  First do the dipoles.
+
+            offset = 3*atom;
+            real sphericalDipole[3];
+            sphericalDipole[0] = sphericalDipoles[offset];
+            sphericalDipole[1] = sphericalDipoles[offset+1];
+            sphericalDipole[2] = sphericalDipoles[offset+2];
+            if (reverse)
+                sphericalDipole[2] *= -1;
+            sphericalDipoles[offset] = sphericalDipole[0]*vectorZ.z + sphericalDipole[1]*vectorX.z + sphericalDipole[2]*vectorY.z;
+            sphericalDipoles[offset+1] = sphericalDipole[0]*vectorZ.x + sphericalDipole[1]*vectorX.x + sphericalDipole[2]*vectorY.x;
+            sphericalDipoles[offset+2] = sphericalDipole[0]*vectorZ.y + sphericalDipole[1]*vectorX.y + sphericalDipole[2]*vectorY.y;
+            
+            // Now the quadrupoles.
+
+            offset = 5*atom;
+            real sphericalQuadrupole[5];
+            sphericalQuadrupole[0] = sphericalQuadrupoles[offset];
+            sphericalQuadrupole[1] = sphericalQuadrupoles[offset+1];
+            sphericalQuadrupole[2] = sphericalQuadrupoles[offset+2];
+            sphericalQuadrupole[3] = sphericalQuadrupoles[offset+3];
+            sphericalQuadrupole[4] = sphericalQuadrupoles[offset+4];
+            if (reverse) {
+                sphericalQuadrupole[2] *= -1;
+                sphericalQuadrupole[4] *= -1;
+            }
+            real rotatedQuadrupole[5] = {0, 0, 0, 0, 0};
+            real sqrtThree = SQRT((real) 3);
+            rotatedQuadrupole[0] += sphericalQuadrupole[0]*0.5f*(3.0f*vectorZ.z*vectorZ.z - 1.0f) +
+                                    sphericalQuadrupole[1]*sqrtThree*vectorZ.z*vectorX.z +
+                                    sphericalQuadrupole[2]*sqrtThree*vectorZ.z*vectorY.z +
+                                    sphericalQuadrupole[3]*0.5f*sqrtThree*(vectorX.z*vectorX.z - vectorY.z*vectorY.z) +
+                                    sphericalQuadrupole[4]*sqrtThree*vectorX.z*vectorY.z;
+            rotatedQuadrupole[1] += sphericalQuadrupole[0]*sqrtThree*vectorZ.z*vectorZ.x +
+                                    sphericalQuadrupole[1]*(vectorZ.x*vectorX.z + vectorZ.z*vectorX.x) +
+                                    sphericalQuadrupole[2]*(vectorZ.x*vectorY.z + vectorZ.z*vectorY.x) +
+                                    sphericalQuadrupole[3]*(vectorX.z*vectorX.x - vectorY.z*vectorY.x) +
+                                    sphericalQuadrupole[4]*(vectorX.x*vectorY.z + vectorX.z*vectorY.x);
+            rotatedQuadrupole[2] += sphericalQuadrupole[0]*sqrtThree*vectorZ.z*vectorZ.y +
+                                    sphericalQuadrupole[1]*(vectorZ.y*vectorX.z + vectorZ.z*vectorX.y) +
+                                    sphericalQuadrupole[2]*(vectorZ.y*vectorY.z + vectorZ.z*vectorY.y) +
+                                    sphericalQuadrupole[3]*(vectorX.z*vectorX.y - vectorY.z*vectorY.y) +
+                                    sphericalQuadrupole[4]*(vectorX.y*vectorY.z + vectorX.z*vectorY.y);
+            rotatedQuadrupole[3] += sphericalQuadrupole[0]*0.5f*sqrtThree*(vectorZ.x*vectorZ.x - vectorZ.y*vectorZ.y) +
+                                    sphericalQuadrupole[1]*(vectorZ.x*vectorX.x - vectorZ.y*vectorX.y) +
+                                    sphericalQuadrupole[2]*(vectorZ.x*vectorY.x - vectorZ.y*vectorY.y) +
+                                    sphericalQuadrupole[3]*0.5f*(vectorX.x*vectorX.x - vectorX.y*vectorX.y - vectorY.x*vectorY.x + vectorY.y*vectorY.y) +
+                                    sphericalQuadrupole[4]*(vectorX.x*vectorY.x - vectorX.y*vectorY.y);
+            rotatedQuadrupole[4] += sphericalQuadrupole[0]*sqrtThree*vectorZ.x*vectorZ.y +
+                                    sphericalQuadrupole[1]*(vectorZ.y*vectorX.x + vectorZ.x*vectorX.y) +
+                                    sphericalQuadrupole[2]*(vectorZ.y*vectorY.x + vectorZ.x*vectorY.y) +
+                                    sphericalQuadrupole[3]*(vectorX.x*vectorX.y - vectorY.x*vectorY.y) +
+                                    sphericalQuadrupole[4]*(vectorX.y*vectorY.x + vectorX.x*vectorY.y);
+            sphericalQuadrupoles[offset] = rotatedQuadrupole[0];
+            sphericalQuadrupoles[offset+1] = rotatedQuadrupole[1];
+            sphericalQuadrupoles[offset+2] = rotatedQuadrupole[2];
+            sphericalQuadrupoles[offset+3] = rotatedQuadrupole[3];
+            sphericalQuadrupoles[offset+4] = rotatedQuadrupole[4];
        }
        else {
            labFrameDipoles[3*atom] = molecularDipoles[3*atom];

--- a/plugins/amoeba/platforms/cuda/src/kernels/pmeElectrostaticPairForce.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/pmeElectrostaticPairForce.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/pmeElectrostaticPairForceNoQuadrupoles.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/pmeElectrostaticPairForceNoQuadrupoles.cu
-__device__ void /* Permanent charge/dipole pair term between atom1 and atom2.
- *
- * Compiled as computeOneInteractionF1 when APPLY_SCALE is defined (with the extra
- * dScale, pScale, mScale parameters) and as computeOneInteractionF1NoScale otherwise.
- *
- * delta        x, y, z are read as the displacement components xr, yr, zr; w is read as rr1
- *              in the APPLY_SCALE build only (presumably 1/r -- confirm against the caller).
- * bn           four coefficients read as bn1..bn4 (presumably the PME real-space
- *              coefficients -- confirm against the caller).
- * bn5          not referenced in this body.
- * forceFactor  multiplies the energy contribution only.
- * dScale, pScale  not referenced in this body; mScale enters through offset = 1-mScale.
- * force        OVERWRITTEN (assigned, not accumulated) with the vector ftm21..ftm23.
- * energy       incremented by this pair's contribution.
- */
-#ifdef APPLY_SCALE
-computeOneInteractionF1(
-#else
-computeOneInteractionF1NoScale(
-#endif
-        AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, real bn5, float forceFactor,
-#ifdef APPLY_SCALE
-        float dScale, float pScale, float mScale,
-#endif
-        real3& force, real& energy) {
-    real xr = delta.x;
-    real yr = delta.y;
-    real zr = delta.z;
-#ifdef APPLY_SCALE
-    real rr1 = delta.w; // only the scaled build uses rr1 (for the offset corrections below)
-#endif
-
-    // Load the permanent charge and dipole of both atoms (no induced dipoles are read in this function).
-
-    real ci = atom1.q;
-
-    real di1 = atom1.dipole.x;
-    real di2 = atom1.dipole.y;
-    real di3 = atom1.dipole.z;
-
-    real ck = atom2.q;
-    real dk1 = atom2.dipole.x;
-    real dk2 = atom2.dipole.y;
-    real dk3 = atom2.dipole.z;
-
-    real bn1 = bn.x;
-    real bn2 = bn.y;
-    real bn3 = bn.z;
-    real bn4 = bn.w; // not referenced again in this function
-
-#ifdef APPLY_SCALE
-    real offset = 1-mScale; // weight of the rr1-power corrections subtracted from the bn terms below
-    real rr3 = rr1*rr1*rr1;
-    real gf4 = 2*(bn2 - 3*offset*rr3*rr1*rr1); // gf4 is not referenced again in this function
-#else
-    real gf4 = 2*bn2; // gf4 is not referenced again in this function
-#endif
-    real ftm21 = 0;
-    real ftm22 = 0;
-    real ftm23 = 0;
-
-    // Scalar products of the permanent moments with each other and with the displacement.
-
-    real gl6 = di1*dk1 + di2*dk2 + di3*dk3;
-    real sc3 = di1*xr + di2*yr + di3*zr;
-    real sc4 = dk1*xr + dk2*yr + dk3*zr;
-
-    real gl0 = ci*ck;
-    real gl1 = ck*sc3 - ci*sc4;
-    real gl2 = -sc3*sc4;
-
-    // Energy. The only gl0 (charge*charge) term is the -offset*rr1*gl0 correction in the scaled build.
-    // NOTE(review): the plain charge-charge energy is presumably accumulated elsewhere -- confirm.
-#ifdef APPLY_SCALE
-    energy += forceFactor*(-offset*rr1*gl0 + (bn1-offset*rr3)*(gl1+gl6) + (bn2-offset*(3*rr3*rr1*rr1))*gl2);
-#else
-    energy += forceFactor*(bn1*(gl1+gl6) + bn2*gl2);
-    
-#endif
-
-    // Component along the displacement vector.
-    real gf1 = bn1*gl0 + bn2*(gl1+gl6) + bn3*gl2;
-#ifdef APPLY_SCALE
-    gf1 -= offset*(rr3*gl0 + (3*rr3*rr1*rr1)*(gl1+gl6) + (15*rr3*rr3*rr1)*gl2);
-#endif
-    ftm21 += gf1*xr;
-    ftm22 += gf1*yr;
-    ftm23 += gf1*zr;
-
-    // Component along atom1's dipole.
-#ifdef APPLY_SCALE
-    real gf2 = -ck*bn1 + sc4*bn2 - offset*(-ck*rr3 + sc4*(3*rr3*rr1*rr1));
-#else
-    real gf2 = -ck*bn1 + sc4*bn2;
-#endif
-    ftm21 += gf2*di1;
-    ftm22 += gf2*di2;
-    ftm23 += gf2*di3;
-
-    // Component along atom2's dipole.
-#ifdef APPLY_SCALE
-    real gf3 = ci*bn1 + sc3*bn2 - offset*(ci*rr3 + sc3*(3*rr3*rr1*rr1));
-#else
-    real gf3 = ci*bn1 + sc3*bn2;
-#endif
-    ftm21 += gf3*dk1;
-    ftm22 += gf3*dk2;
-    ftm23 += gf3*dk3;
-
-    // Assignment, not accumulation: any previous content of force is discarded.
-    force.x = ftm21;
-    force.y = ftm22;
-    force.z = ftm23;
-}
-
-
-__device__ void /* Pair term coupling each atom's permanent charge/dipole with the other atom's
- * induced dipoles (inducedDipole and inducedDipolePolar), with Thole-style damping built from
- * the atoms' damp and thole fields.
- *
- * Compiled as computeOneInteractionF2 when APPLY_SCALE is defined (with the extra
- * dScale, pScale, mScale parameters) and as computeOneInteractionF2NoScale otherwise.
- * Terms that multiply induced dipoles of both atoms are compiled only when
- * DIRECT_POLARIZATION is NOT defined.
- *
- * delta        x, y, z are read as the displacement components xr, yr, zr; w is read as rr1
- *              (presumably 1/r -- confirm against the caller).
- * bn           four coefficients read as bn1..bn4 (presumably the PME real-space
- *              coefficients -- confirm against the caller).
- * forceFactor  multiplies the energy contributions only.
- * dScale, pScale  weight the damping factors in the dsc* and psc* quantities respectively.
- * mScale       not referenced in this body.
- * force        ACCUMULATED into (+=) at the end of the function.
- * energy       incremented by this pair's contributions.
- */
-#ifdef APPLY_SCALE
-computeOneInteractionF2(
-#else
-computeOneInteractionF2NoScale(
-#endif
-        AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, float forceFactor,
-#ifdef APPLY_SCALE
-        float dScale, float pScale, float mScale,
-#endif
-        real3& force, real& energy) {
-    const float uScale = 1; // fixed at 1 here, so every usc* factor below uses the unscaled damping
-    real xr = delta.x;
-    real yr = delta.y;
-    real zr = delta.z;
-    real rr1 = delta.w;
-
-    // Load atom1's permanent charge and dipole; atom2's are loaded later, in the "K" section.
-
-    real ci = atom1.q;
-
-    real di1 = atom1.dipole.x;
-    real di2 = atom1.dipole.y;
-    real di3 = atom1.dipole.z;
-
-    real bn1 = bn.x;
-    real bn2 = bn.y;
-    real bn3 = bn.z;
-    real bn4 = bn.w; // not referenced again in this function
-
-    // damp starts as the product of the per-atom damp values; when non-zero it is replaced by
-    // the exponent -pgamma*ratio^3, where pgamma is the smaller of the two thole values.
-    real damp = atom1.damp*atom2.damp;
-    if (damp != 0) {
-        real pgamma = atom1.thole < atom2.thole ? atom1.thole : atom2.thole;
-        real ratio = RECIP(rr1*damp);
-        damp = -pgamma*ratio*ratio*ratio;
-    }
-
-    real scale5 = (damp == 0) ? 1 : (1 - (1-damp)*EXP(damp));
-    real rr5 = rr1*rr1;
-          rr5 = 3*rr1*rr5*rr5; // rr5 now holds 3*rr1^5
-#ifdef APPLY_SCALE
-    real psc5 = rr5*(1 - scale5*pScale);
-    real dsc5 = rr5*(1 - scale5*dScale);
-    real usc5 = rr5*(1 - scale5*uScale);
-#else
-    real psc5 = rr5*(1 - scale5);
-#endif
-
-    real ftm21 = 0;
-    real ftm22 = 0;
-    real ftm23 = 0;
-
-    real expdamp = EXP(damp);
-    real scale3 = (damp == 0) ? 1 : (1 - expdamp);
-    real rr3 = rr1*rr1*rr1;
-
-#ifdef APPLY_SCALE
-    real psc3 = rr3*(1 - scale3*pScale);
-    real dsc3 = rr3*(1 - scale3*dScale);
-    real usc3 = rr3*(1 - scale3*uScale);
-#else
-    real psc3 = rr3*(1 - scale3);
-#endif
-
-    real scale7 = (damp == 0) ? 1 : (1 - (1-damp+0.6f*damp*damp)*expdamp);
-
-#ifdef APPLY_SCALE
-    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7*pScale);
-    real dsc7 = (15*rr3*rr3*rr1)*(1 - scale7*dScale);
-#else
-    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7);
-#endif
-
-    // ---- atom1 permanent moments with atom2 induced dipoles ----
-    real sc3 = di1*xr + di2*yr + di3*zr;
-    real gfi3 = ci*bn1 + sc3*bn2;
-
-    real prefactor1;
-    prefactor1 = 0.5f*(ci*psc3 + sc3*psc5 - gfi3);
-    ftm21 -= prefactor1*atom2.inducedDipole.x;
-    ftm22 -= prefactor1*atom2.inducedDipole.y;
-    ftm23 -= prefactor1*atom2.inducedDipole.z;
-
-    // Unscaled build: the same prefactor1 is reused for the inducedDipolePolar term.
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(ci*dsc3 + sc3*dsc5 - gfi3);
-#endif
-    ftm21 -= prefactor1*atom2.inducedDipolePolar.x;
-    ftm22 -= prefactor1*atom2.inducedDipolePolar.y;
-    ftm23 -= prefactor1*atom2.inducedDipolePolar.z;
-
-    real sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
-    energy += forceFactor*0.5f*sci4*((psc3-bn1)*ci + (psc5-bn2)*sc3);
-
-    real scip4 = atom2.inducedDipolePolar.x*xr + atom2.inducedDipolePolar.y*yr + atom2.inducedDipolePolar.z*zr;
-#ifndef DIRECT_POLARIZATION
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(bn2 - usc5);
-#else
-    prefactor1 = 0.5f*(bn2 - psc5);
-#endif
-    ftm21 += prefactor1*((sci4*atom1.inducedDipolePolar.x + scip4*atom1.inducedDipole.x));
-    ftm22 += prefactor1*((sci4*atom1.inducedDipolePolar.y + scip4*atom1.inducedDipole.y));
-    ftm23 += prefactor1*((sci4*atom1.inducedDipolePolar.z + scip4*atom1.inducedDipole.z));
-#endif
-
-    // Unscaled build: sci4 is overwritten with sci4+scip4 here, and gli1/gli2 below use that sum.
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(bn2*(sci4+scip4) - (sci4*psc5+scip4*dsc5)); 
-#else
-    sci4 += scip4;
-    prefactor1 = 0.5f*sci4*(bn2 - psc5); 
-#endif
-
-    ftm21 += prefactor1*di1;
-    ftm22 += prefactor1*di2;
-    ftm23 += prefactor1*di3;
-
-#ifdef APPLY_SCALE
-    real gli1 = -ci*sci4;
-    real gli2 = -sc3*sci4;
-    real glip1 = -ci*scip4;
-    real glip2 = -sc3*scip4;
-#else
-    real gli1 = -ci*sci4;
-    real gli2 = -sc3*sci4;
-#endif
-
-#ifdef APPLY_SCALE
-    real gfi1 = (bn2*(gli1+glip1) + bn3*(gli2+glip2));
-    gfi1 -= (rr1*rr1)*(3*(gli1*psc3 + glip1*dsc3) + 5*(gli2*psc5 + glip2*dsc5));
-#else
-    real gfi1 = bn2*gli1 + bn3*gli2;
-    gfi1 -= (rr1*rr1)*(3*gli1*psc3 + 5*gli2*psc5);
-#endif
-    gfi1 *= 0.5f;
-    ftm21 += gfi1*xr;
-    ftm22 += gfi1*yr;
-    ftm23 += gfi1*zr;
-
-    // Terms built from the ddsc3*/ddsc5* vectors, using the gli/glip values computed above.
-    // expdamp and rr3 declared in this scope shadow the outer variables and hold the same values.
-    {
-        real expdamp = EXP(damp);
-        real temp3 = -1.5f*damp*expdamp*rr1*rr1;
-        real temp5 = -damp;
-        real temp7 = -0.2f - 0.6f*damp;
-
-        real ddsc31 = temp3*xr;
-        real ddsc32 = temp3*yr;
-        real ddsc33 = temp3*zr;
-
-        real ddsc51 = temp5*ddsc31;
-        real ddsc52 = temp5*ddsc32;
-        real ddsc53 = temp5*ddsc33;
-
-        // ddsc71..ddsc73 are not referenced again in this scope.
-        real ddsc71 = temp7*ddsc51;
-        real ddsc72 = temp7*ddsc52;
-        real ddsc73 = temp7*ddsc53;
-
-        real rr3 = rr1*rr1*rr1;
-#ifdef APPLY_SCALE
-        temp3 = (gli1*pScale + glip1*dScale);
-        temp5 = (3*rr1*rr1)*(gli2*pScale + glip2*dScale);
-#else
-        temp3 = gli1;
-        temp5 = (3*rr1*rr1)*gli2;
-#endif
-        ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51);
-        ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52);
-        ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53);
-    }
-
-//K  ---- atom2 permanent moments with atom1 induced dipoles ----
-    real dk1 = atom2.dipole.x;
-    real dk2 = atom2.dipole.y;
-    real dk3 = atom2.dipole.z;
-
-    real sc4 =  dk1*xr +  dk2*yr +  dk3*zr;
-
-    real ck = atom2.q;
-    real gfi2 = (-ck*bn1 + sc4*bn2);
-
-    prefactor1 = 0.5f*(ck*psc3 - sc4*psc5 + gfi2);
-    ftm21 += prefactor1*atom1.inducedDipole.x;
-    ftm22 += prefactor1*atom1.inducedDipole.y;
-    ftm23 += prefactor1*atom1.inducedDipole.z;
-
-    // Unscaled build: the same prefactor1 is reused for the inducedDipolePolar term.
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(ck*dsc3 - sc4*dsc5 + gfi2);
-#endif
-    ftm21 += prefactor1*atom1.inducedDipolePolar.x;
-    ftm22 += prefactor1*atom1.inducedDipolePolar.y;
-    ftm23 += prefactor1*atom1.inducedDipolePolar.z;
-
-    real sci3 = atom1.inducedDipole.x*xr + atom1.inducedDipole.y*yr + atom1.inducedDipole.z*zr;
-    energy += forceFactor*0.5f*sci3*(ck*(bn1-psc3) - sc4*(bn2-psc5));
-    real scip3 = atom1.inducedDipolePolar.x*xr + atom1.inducedDipolePolar.y*yr + atom1.inducedDipolePolar.z*zr;
-
-#ifndef DIRECT_POLARIZATION
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(bn2 - usc5);
-#else
-    prefactor1 = 0.5f*(bn2 - psc5);
-#endif
-
-    ftm21 += prefactor1*(sci3*atom2.inducedDipolePolar.x + scip3*atom2.inducedDipole.x);
-    ftm22 += prefactor1*(sci3*atom2.inducedDipolePolar.y + scip3*atom2.inducedDipole.y);
-    ftm23 += prefactor1*(sci3*atom2.inducedDipolePolar.z + scip3*atom2.inducedDipole.z);
-    
-    // sci4 is recomputed from atom2.inducedDipole alone (the unscaled build overwrote it with a sum earlier).
-    real sci34;
-    sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
-    scip4 = atom2.inducedDipolePolar.x*xr + atom2.inducedDipolePolar.y*yr + atom2.inducedDipolePolar.z*zr;
-    sci34 = (sci3*scip4+scip3*sci4);
-
-    // gfi1 is reused as the accumulator for the second along-displacement coefficient.
-#ifdef APPLY_SCALE
-    gfi1 = sci34*(usc5*(5*rr1*rr1) -bn3);
-#else
-    gfi1 = sci34*(psc5*(5*rr1*rr1) -bn3);
-#endif
-#else
-    gfi1 = 0;
-#endif
-    
-    // Unscaled build: sci3 is overwritten with sci3+scip3 here, and gli1/gli2 below use that sum.
-#ifdef APPLY_SCALE
-    prefactor1 = 0.5f*(bn2*(sci3+scip3) - (sci3*psc5+scip3*dsc5));
-#else
-    sci3 += scip3;
-    prefactor1 = 0.5f*sci3*(bn2 - psc5);
-#endif
-    ftm21 += prefactor1*dk1;
-    ftm22 += prefactor1*dk2;
-    ftm23 += prefactor1*dk3;
-
-    // gfi6 is not referenced again in this function.
-#ifdef APPLY_SCALE
-    real gfi6 = -bn3*(sci3+scip3) + (sci3*psc7+scip3*dsc7);
-#else
-    real gfi6 = sci3*(psc7 - bn3);
-#endif
-
-    real sci1 = atom1.inducedDipole.x*dk1 + atom1.inducedDipole.y*dk2 + atom1.inducedDipole.z*dk3 + di1*atom2.inducedDipole.x + di2*atom2.inducedDipole.y + di3*atom2.inducedDipole.z;
-    energy += forceFactor*0.5f*(sci1*(bn1-psc3));
-
-    real scip1 = atom1.inducedDipolePolar.x*dk1 + atom1.inducedDipolePolar.y*dk2 + atom1.inducedDipolePolar.z*dk3 + di1*atom2.inducedDipolePolar.x + di2*atom2.inducedDipolePolar.y + di3*atom2.inducedDipolePolar.z;
-#ifndef APPLY_SCALE
-        sci1 += scip1;
-#endif
-
-    real scip2 = atom1.inducedDipole.x*atom2.inducedDipolePolar.x +
-                                  atom1.inducedDipole.y*atom2.inducedDipolePolar.y +
-                                  atom1.inducedDipole.z*atom2.inducedDipolePolar.z +
-                                  atom2.inducedDipole.x*atom1.inducedDipolePolar.x +
-                                  atom2.inducedDipole.y*atom1.inducedDipolePolar.y +
-                                  atom2.inducedDipole.z*atom1.inducedDipolePolar.z;
-
-           gli1 = ck*sci3 + sci1;
-           gli2 = -sci3*sc4;
-#ifdef APPLY_SCALE
-          glip1 = ck*scip3 + scip1;
-          glip2 = -scip3*sc4;
-#endif
-
-
-#ifdef APPLY_SCALE
-    gfi1 += (bn2*(gli1+glip1) + bn3*(gli2+glip2));
-    gfi1 -= (rr1*rr1)*(3*(gli1*psc3 + glip1*dsc3) + 5*(gli2*psc5 + glip2*dsc5));
-#else
-    gfi1 += (bn2*gli1 + bn3*gli2);
-    gfi1 -= (rr1*rr1)*(3*gli1*psc3 + 5*gli2*psc5);
-#endif
-    
-#ifndef DIRECT_POLARIZATION
-#ifdef APPLY_SCALE
-    gfi1 += scip2*(bn2 - (3*rr1*rr1)*usc3);
-#else
-    gfi1 += scip2*(bn2 - (3*rr1*rr1)*psc3);
-#endif
-#endif
-    
-    gfi1 *= 0.5f;
-
-    ftm21 += gfi1*xr;
-    ftm22 += gfi1*yr;
-    ftm23 += gfi1*zr;
-
-    // Second set of ddsc-based terms, using the gli/glip values from the "K" section.
-    // As in the first scoped block, expdamp and rr3 shadow the outer variables with the same values.
-    {
-        real expdamp = EXP(damp);
-        real temp3 = -1.5f*damp*expdamp*rr1*rr1;
-        real temp5 = -damp;
-        real temp7 = -0.2f - 0.6f*damp;
-
-        real ddsc31 = temp3*xr;
-        real ddsc32 = temp3*yr;
-        real ddsc33 = temp3*zr;
-
-        real ddsc51 = temp5*ddsc31;
-        real ddsc52 = temp5*ddsc32;
-        real ddsc53 = temp5*ddsc33;
-
-        // ddsc71..ddsc73 are not referenced again in this scope.
-        real ddsc71 = temp7*ddsc51;
-        real ddsc72 = temp7*ddsc52;
-        real ddsc73 = temp7*ddsc53;
-
-        real rr3 = rr1*rr1*rr1;
-
-#ifdef APPLY_SCALE
-        temp3 = gli1*pScale + glip1*dScale;
-        temp5 = (3*rr1*rr1)*(gli2*pScale + glip2*dScale);
-#else
-        temp3 = gli1;
-        temp5 = (3*rr1*rr1)*gli2;
-#endif
-
-        ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51);
-        ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52);
-        ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53);
-
-        // Induced-induced part (scip2, sci34): compiled only without DIRECT_POLARIZATION.
-#ifndef DIRECT_POLARIZATION
-#ifdef APPLY_SCALE
-        temp3 =  uScale*scip2;
-        temp5 = -(3*rr1*rr1)*uScale*sci34;
-#else
-        temp3 =  scip2;
-        temp5 = -(3*rr1*rr1)*sci34;
-#endif
-        ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51);
-        ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52);
-        ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53);
-#endif
-    }
-
-    // Accumulation: the result is added to whatever force already holds.
-    force.x += ftm21;
-    force.y += ftm22;
-    force.z += ftm23;
-}
-
-
-__device__ void /* Adds the permanent charge/dipole pair contribution to atom1.torque.
- *
- * Compiled as computeOneInteractionT1 when APPLY_SCALE is defined (with the extra
- * dScale, pScale, mScale parameters) and as computeOneInteractionT1NoScale otherwise.
- *
- * atom1   its dipole is read; its torque field is incremented.
- * atom2   its charge (q) and dipole are read.
- * delta   x, y, z are read as the displacement components xr, yr, zr; w is read as rr1
- *         in the APPLY_SCALE build only (presumably 1/r -- confirm against the caller).
- * bn      read as bn1..bn4; only bn1 and bn2 are used in the arithmetic.
- * dScale, pScale  not referenced in this body; mScale enters through scale = 1-mScale.
- */
-#ifdef APPLY_SCALE
-computeOneInteractionT1(
-#else
-computeOneInteractionT1NoScale(
-#endif
-        AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn
-#ifdef APPLY_SCALE
-        , float dScale, float pScale, float mScale
-#endif
-        ) {
-
-    real xr = delta.x;
-    real yr = delta.y;
-    real zr = delta.z;
-#ifdef APPLY_SCALE
-    real rr1 = delta.w;
-#endif
-
-    // Load the permanent moments (no induced dipoles are read in this function).
-
-    real di1 = atom1.dipole.x;
-    real di2 = atom1.dipole.y;
-    real di3 = atom1.dipole.z;
-
-    real ck = atom2.q;
-
-    real dk1 = atom2.dipole.x;
-    real dk2 = atom2.dipole.y;
-    real dk3 = atom2.dipole.z;
-
-    real bn1 = bn.x;
-    real bn2 = bn.y;
-    real bn3 = bn.z; // not referenced again in this function
-    real bn4 = bn.w; // not referenced again in this function
-
-    // Powers of rr1 and the mScale-derived weight (no Thole damping is applied in this function).
-
-#ifdef APPLY_SCALE
-    real rr2 = rr1*rr1;
-    real rr3 = rr1*rr2;
-    real rr5 = 3*rr3*rr2;
-    real rr7 = 5*rr5*rr2; // used only to form rr9
-    real rr9 = 7*rr7*rr2; // not referenced again in this function
-
-    real scale = 1-mScale;
-    real prefactor = scale*rr3 - bn1;
-#else
-    real prefactor = -bn1;
-#endif
-    // Cross product of the two permanent dipoles (atom1 x atom2), scaled by prefactor.
-    real dixdk1 = di2*dk3 - di3*dk2;
-    real ttm21 = prefactor*dixdk1;
-
-    real dixdk2 = di3*dk1 - di1*dk3;
-    real ttm22 = prefactor*dixdk2;
-
-    real dixdk3 = di1*dk2 - di2*dk1;
-    real ttm23 = prefactor*dixdk3;
-
-    real sc4 = dk1*xr + dk2*yr + dk3*zr;
-    real sc6 = 0; // not referenced again in this function
-
-    real gf2 = -ck*bn1 + sc4*bn2;
-#ifdef APPLY_SCALE
-    real gfr2 = -ck*rr3 + sc4*rr5;
-    prefactor = (gf2 - scale*gfr2);
-#else
-    prefactor = gf2;
-#endif
-    // Cross product of atom1's dipole with the displacement, scaled by the new prefactor.
-    ttm21 += prefactor*(di2*zr - di3*yr);
-    ttm22 += prefactor*(di3*xr - di1*zr);
-    ttm23 += prefactor*(di1*yr - di2*xr);
-
-    // Accumulate into atom1's torque.
-    atom1.torque.x += ttm21;
-    atom1.torque.y += ttm22;
-    atom1.torque.z += ttm23;
-}
-
-
-__device__ void /* Adds to atom1.torque the contribution of atom1's permanent dipole interacting
- * with atom2's induced dipoles (inducedDipole and inducedDipolePolar), with Thole damping.
- *
- * Compiled as computeOneInteractionT2 when APPLY_SCALE is defined (with the extra
- * dScale, pScale, mScale parameters) and as computeOneInteractionT2NoScale otherwise.
- *
- * atom1   its dipole, damp and thole are read; its torque field is incremented.
- * atom2   its inducedDipole, inducedDipolePolar, damp and thole are read.
- * delta   x, y, z are read as the displacement components xr, yr, zr; w is read as rr1
- *         (presumably 1/r -- confirm against the caller).
- * bn      only x, y, z are read (bn1..bn3); bn3 is not used in the arithmetic.
- * dScale, pScale  weight the damping factors in the dsc* and psc* quantities respectively.
- * mScale  not referenced in this body.
- */
-#ifdef APPLY_SCALE
-computeOneInteractionT2(
-#else
-computeOneInteractionT2NoScale(
-#endif
-        AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn
-#ifdef APPLY_SCALE
-        , float dScale, float pScale, float mScale
-#endif
-        ) {
-
-    real xr = delta.x;
-    real yr = delta.y;
-    real zr = delta.z;
-    real rr1 = delta.w;
-
-    // Load atom1's permanent dipole; atom2's induced dipoles are read in place further down.
-
-    real di1 = atom1.dipole.x;
-    real di2 = atom1.dipole.y;
-    real di3 = atom1.dipole.z;
-
-    real bn1 = bn.x;
-    real bn2 = bn.y;
-    real bn3 = bn.z; // not referenced again in this function
-
-    // Thole damping factors; they stay at 1 when the product of the two damp values is zero.
-
-    real scale3 = 1;
-    real scale5 = 1;
-    real scale7 = 1;
-
-    real damp = atom1.damp*atom2.damp;
-    if (damp != 0) {
-        real pgamma = atom1.thole < atom2.thole ? atom1.thole : atom2.thole; // smaller of the two thole values
-        real ratio = RECIP(rr1*damp);
-        damp = -pgamma*ratio*ratio*ratio; // damp is reused as the exponent
-        real expdamp = EXP(damp);
-        scale3 = 1 - expdamp;
-        scale5 = 1 - (1-damp)*expdamp;
-        scale7 = 1 - (1-damp+0.6f*damp*damp)*expdamp;
-    }
-
-    // psc7 and dsc7 are computed below but not referenced again in this function.
-    real rr3 = rr1*rr1*rr1;
-#ifdef APPLY_SCALE
-    real dsc3 = rr3*(1 - scale3*dScale);
-    real dsc5 = (3*rr3*rr1*rr1)* (1 - scale5*dScale);
-    real dsc7 = (15*rr3*rr3*rr1)*(1 - scale7*dScale);
-
-    real psc3 = rr3*(1 - scale3*pScale);
-    real psc5 = (3*rr3*rr1*rr1)*(1 - scale5*pScale);
-    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7*pScale);
-#else
-    real psc3 = rr3*(1 - scale3);
-    real psc5 = (3*rr3*rr1*rr1)*(1 - scale5);
-    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7);
-#endif
-
-    // prefactor1 weights the inducedDipole cross products; in the scaled build prefactor2
-    // weights the inducedDipolePolar ones, otherwise prefactor1 is applied to their sum.
-    real prefactor1 = 0.5f*(psc3 - bn1);
-#ifdef APPLY_SCALE
-    real prefactor2 = 0.5f*(dsc3 - bn1);
-#endif
-
-    real dixuk1 = di2*atom2.inducedDipole.z - di3*atom2.inducedDipole.y;
-    real dixukp1 = di2*atom2.inducedDipolePolar.z - di3*atom2.inducedDipolePolar.y;
-
-#ifdef APPLY_SCALE
-    real ttm2i1 = prefactor1*dixuk1 + prefactor2*dixukp1;
-#else
-    real ttm2i1 = prefactor1*(dixuk1 + dixukp1);
-#endif
-
-    real dixuk2 = di3*atom2.inducedDipole.x - di1*atom2.inducedDipole.z;
-    real dixukp2 = di3*atom2.inducedDipolePolar.x - di1*atom2.inducedDipolePolar.z;
-
-#ifdef APPLY_SCALE
-    real ttm2i2 = prefactor1*dixuk2 + prefactor2*dixukp2;
-#else
-    real ttm2i2 = prefactor1*(dixuk2 + dixukp2);
-#endif
-
-    real dixuk3 = di1*atom2.inducedDipole.y - di2*atom2.inducedDipole.x;
-    real dixukp3 = di1*atom2.inducedDipolePolar.y - di2*atom2.inducedDipolePolar.x;
-#ifdef APPLY_SCALE
-    real ttm2i3 = prefactor1*dixuk3 + prefactor2*dixukp3;
-#else
-    real ttm2i3 = prefactor1*(dixuk3 + dixukp3);
-#endif
-
-    // Projections of atom2's induced dipoles on the displacement vector.
-    real sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
-    real scip4 = atom2.inducedDipolePolar.x*xr + atom2.inducedDipolePolar.y*yr + atom2.inducedDipolePolar.z*zr;
-    real gti2 = bn2*(sci4+scip4);
-#ifdef APPLY_SCALE
-    real gtri2 = (sci4*psc5+scip4*dsc5);
-#else
-    real gtri2 = psc5*(sci4+scip4);
-#endif
-    prefactor1 = 0.5f*(gti2 - gtri2); // prefactor1 is reused for the dipole-cross-displacement term
-
-    ttm2i1 += prefactor1*(di2*zr - di3*yr);
-    ttm2i2 += prefactor1*(di3*xr - di1*zr);
-    ttm2i3 += prefactor1*(di1*yr - di2*xr);
-
-    // Accumulate into atom1's torque.
-    atom1.torque.x += ttm2i1;
-    atom1.torque.y += ttm2i2;
-    atom1.torque.z += ttm2i3;
-}
--- a/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu