Created optimized version of AmoebaMultipoleForce kernels when all quadrupoles are 0

6ddabda8 · peastman · e4beb290 · 6ddabda8 · 6ddabda8 · 6ddabda8
Commit 6ddabda8 authored Jul 09, 2013 by peastman
8 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -920,6 +920,10 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        molecularQuadrupolesVec.push_back((float) quadrupole[4]);
        molecularQuadrupolesVec.push_back((float) quadrupole[5]);
    }
+    hasQuadrupoles = false;
+    for (int i = 0; i < (int) molecularQuadrupolesVec.size(); i++)
+        if (molecularQuadrupolesVec[i] != 0.0)
+            hasQuadrupoles = true;
    int paddedNumAtoms = cu.getPaddedNumAtoms();
    for (int i = numMultipoles; i < paddedNumAtoms; i++) {
        dampingAndTholeVec.push_back(make_float2(0, 0));
@@ -1039,6 +1043,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        defines["DIRECT_POLARIZATION"] = "";
    if (useShuffle)
        defines["USE_SHUFFLE"] = "";
+    if (hasQuadrupoles)
+        defines["INCLUDE_QUADRUPOLES"] = "";
    defines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
    int numExclusionTiles = tilesWithExclusions.size();
    defines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
@@ -1103,9 +1109,9 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    if (usePME) {
        electrostaticsSource << CudaKernelSources::vectorOps;
        electrostaticsSource << CudaAmoebaKernelSources::pmeMultipoleElectrostatics;
-        electrostaticsSource << CudaAmoebaKernelSources::pmeElectrostaticPairForce;
+        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::pmeElectrostaticPairForce : CudaAmoebaKernelSources::pmeElectrostaticPairForceNoQuadrupoles);
        electrostaticsSource << "#define APPLY_SCALE\n";
-        electrostaticsSource << CudaAmoebaKernelSources::pmeElectrostaticPairForce;
+        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::pmeElectrostaticPairForce : CudaAmoebaKernelSources::pmeElectrostaticPairForceNoQuadrupoles);
        electrostaticsThreadMemory = 24*elementSize+3*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
        if (!useShuffle)
            electrostaticsThreadMemory += 3*elementSize;
@@ -1114,13 +1120,13 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        electrostaticsSource << CudaKernelSources::vectorOps;
        electrostaticsSource << CudaAmoebaKernelSources::multipoleElectrostatics;
        electrostaticsSource << "#define F1\n";
-        electrostaticsSource << CudaAmoebaKernelSources::electrostaticPairForce;
+        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::electrostaticPairForce : CudaAmoebaKernelSources::electrostaticPairForceNoQuadrupoles);
        electrostaticsSource << "#undef F1\n";
        electrostaticsSource << "#define T1\n";
-        electrostaticsSource << CudaAmoebaKernelSources::electrostaticPairForce;
+        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::electrostaticPairForce : CudaAmoebaKernelSources::electrostaticPairForceNoQuadrupoles);
        electrostaticsSource << "#undef T1\n";
        electrostaticsSource << "#define T3\n";
-        electrostaticsSource << CudaAmoebaKernelSources::electrostaticPairForce;
+        electrostaticsSource << (hasQuadrupoles ? CudaAmoebaKernelSources::electrostaticPairForce : CudaAmoebaKernelSources::electrostaticPairForceNoQuadrupoles);
        electrostaticsThreadMemory = 21*elementSize+2*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
        if (!useShuffle)
            electrostaticsThreadMemory += 3*elementSize;
@@ -1832,6 +1838,11 @@ void CudaCalcAmoebaMultipoleForceKernel::copyParametersToContext(ContextImpl& co
        molecularQuadrupolesVec.push_back((float) quadrupole[4]);
        molecularQuadrupolesVec.push_back((float) quadrupole[5]);
    }
+    if (!hasQuadrupoles) {
+        for (int i = 0; i < (int) molecularQuadrupolesVec.size(); i++)
+            if (molecularQuadrupolesVec[i] != 0.0)
+                throw OpenMMException("updateParametersInContext: Cannot set a non-zero quadrupole moment, because quadrupoles were excluded from the kernel");
+    }
    for (int i = force.getNumMultipoles(); i < cu.getPaddedNumAtoms(); i++) {
        dampingAndTholeVec.push_back(make_float2(0, 0));
        polarizabilityVec.push_back(0);

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -373,7 +373,7 @@ private:
    int numMultipoles, maxInducedIterations;
    int fixedFieldThreads, inducedFieldThreads, electrostaticsThreads;
    double inducedEpsilon;
-    bool hasInitializedScaleFactors, hasInitializedFFT, multipolesAreValid;
+    bool hasQuadrupoles, hasInitializedScaleFactors, hasInitializedFFT, multipolesAreValid;
    CudaContext& cu;
    const System& system;
    std::vector<int3> covalentFlagValues;

--- a/plugins/amoeba/platforms/cuda/src/kernels/electrostaticPairForceNoQuadrupoles.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/electrostaticPairForceNoQuadrupoles.cu
+/**
+ * This defines three different closely related functions, depending on which constant (F1, T1, or T3) is defined.
+ */
+#if defined F1
+__device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real& energy, real3& outputForce) {
+#elif defined T1
+__device__ void computeOneInteractionT1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce) {
+#else
+__device__ void computeOneInteractionT3(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce) {
+#endif
+#ifdef F1
+    const float uScale = 1;
+    real ddsc3_0 = 0;
+    real ddsc3_1 = 0;
+    real ddsc3_2 = 0;
+    real ddsc5_0 = 0;
+    real ddsc5_1 = 0;
+    real ddsc5_2 = 0;
+    real ddsc7_0 = 0;
+    real ddsc7_1 = 0;
+    real ddsc7_2 = 0;
+#endif
+    real xr = atom2.posq.x - atom1.posq.x;
+    real yr = atom2.posq.y - atom1.posq.y;
+    real zr = atom2.posq.z - atom1.posq.z;
+    real r2 = xr*xr + yr*yr + zr*zr;
+    real r = SQRT(r2);
+    real rr1 = RECIP(r);
+    real rr2 = rr1*rr1;
+    real rr3 = rr1*rr2;
+    real rr5 = 3*rr3*rr2;
+    real rr7 = 5*rr5*rr2;
+    real rr9 = 7*rr7*rr2;
+#ifdef F1
+    real rr11 = 9*rr9*rr2;
+#endif
+    real scale3 = 1;
+    real scale5 = 1;
+    real scale7 = 1;
+    real pdamp = atom1.damp*atom2.damp;
+    if (pdamp != 0) {
+        real ratio = r/pdamp;
+        float pGamma = atom2.thole > atom1.thole ? atom1.thole : atom2.thole;
+        real damp = ratio*ratio*ratio*pGamma;
+        real dampExp = EXP(-damp);
+        real damp1 = damp + 1;
+        real damp2 = damp*damp;
+        scale3 = 1 - dampExp;
+        scale5 = 1 - damp1*dampExp;
+        scale7 = 1 - (damp1 + 0.6f*damp2)*dampExp;
+#ifdef F1
+        real factor = 3*damp*dampExp*rr2;
+        real factor7 = -0.2f + 0.6f*damp;
+        ddsc3_0 = factor*xr;
+        ddsc5_0 = ddsc3_0*damp;
+        ddsc7_0 = ddsc5_0*factor7;
+        ddsc3_1 = factor*yr;
+        ddsc5_1 = ddsc3_1*damp;
+        ddsc7_1 = ddsc5_1*factor7;
+        ddsc3_2 = factor*zr;
+        ddsc5_2 = ddsc3_2*damp;
+        ddsc7_2 = ddsc5_2*factor7;
+#endif
+    }
+#if defined F1
+    real scale3i = rr3*scale3*uScale;
+    real scale5i = rr5*scale5*uScale;
+#endif
+    real dsc3 = rr3*scale3*dScale;
+    real psc3 = rr3*scale3*pScale;
+    real dsc5 = rr5*scale5*dScale;
+    real psc5 = rr5*scale5*pScale;
+    real dsc7 = rr7*scale7*dScale;
+    real psc7 = rr7*scale7*pScale;
+#if defined F1
+    real sc2 = atom1.dipole.x*atom2.dipole.x + atom1.dipole.y*atom2.dipole.y + atom1.dipole.z*atom2.dipole.z;
+#endif
+#if defined F1 || defined T1
+    real sc4 = atom2.dipole.x*xr + atom2.dipole.y*yr + atom2.dipole.z*zr;
+#endif
+#if defined F1 || defined T3
+    real sc3 = atom1.dipole.x*xr + atom1.dipole.y*yr + atom1.dipole.z*zr;
+#endif
+#if defined F1
+    real sci1 = atom1.inducedDipole.x*atom2.dipole.x + atom1.inducedDipole.y*atom2.dipole.y + atom1.inducedDipole.z*atom2.dipole.z +
+                atom2.inducedDipole.x*atom1.dipole.x + atom2.inducedDipole.y*atom1.dipole.y + atom2.inducedDipole.z*atom1.dipole.z;
+#endif
+#if defined F1 || defined T3
+    real sci3 = atom1.inducedDipole.x*xr + atom1.inducedDipole.y*yr + atom1.inducedDipole.z*zr;
+#endif
+#if defined F1 || defined T1
+    real sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
+#endif
+#if defined F1
+    real scip1 = atom1.inducedDipolePolar.x*atom2.dipole.x + atom1.inducedDipolePolar.y*atom2.dipole.y + atom1.inducedDipolePolar.z*atom2.dipole.z +
+                 atom2.inducedDipolePolar.x*atom1.dipole.x + atom2.inducedDipolePolar.y*atom1.dipole.y + atom2.inducedDipolePolar.z*atom1.dipole.z;
+    real scip2 = atom1.inducedDipole.x*atom2.inducedDipolePolar.x + atom1.inducedDipole.y*atom2.inducedDipolePolar.y + atom1.inducedDipole.z*atom2.inducedDipolePolar.z +
+                 atom2.inducedDipole.x*atom1.inducedDipolePolar.x + atom2.inducedDipole.y*atom1.inducedDipolePolar.y + atom2.inducedDipole.z*atom1.inducedDipolePolar.z;
+#endif
+#if defined F1 || defined T3
+    real scip3 = ((atom1.inducedDipolePolar.x)*(xr) + (atom1.inducedDipolePolar.y)*(yr) + (atom1.inducedDipolePolar.z)*(zr));
+#endif
+#if defined F1 || defined T1
+    real scip4 = ((atom2.inducedDipolePolar.x)*(xr) + (atom2.inducedDipolePolar.y)*(yr) + (atom2.inducedDipolePolar.z)*(zr));
+#endif
+#ifdef F1
+    real gli1 = atom2.posq.w*sci3 - atom1.posq.w*sci4;
+    real gli6 = sci1;
+    real glip1 = atom2.posq.w*scip3 - atom1.posq.w*scip4;
+    real glip6 = scip1;
+    real gli2 = -sc3*sci4 - sci3*sc4;
+    real glip2 = -sc3*scip4 - scip3*sc4;
+    real factor3 = rr3*((gli1  +  gli6)*pScale + (glip1  + glip6)*dScale);
+    real factor5 = rr5*(gli2*pScale + glip2*dScale);
+    real ftm2i_0 = -0.5f*(factor3*ddsc3_0 + factor5*ddsc5_0);
+    real ftm2i_1 = -0.5f*(factor3*ddsc3_1 + factor5*ddsc5_1);
+    real ftm2i_2 = -0.5f*(factor3*ddsc3_2 + factor5*ddsc5_2);
+    real gl0 = atom1.posq.w*atom2.posq.w;
+    real gl1 = atom2.posq.w*sc3 - atom1.posq.w*sc4;
+    real gl2 = -sc3*sc4;
+    real gl6 = sc2;
+    real gf1 = rr3*gl0 + rr5*(gl1+gl6) + rr7*gl2;
+#endif
+#if defined F1 || defined T1
+    real gf2 = -atom2.posq.w*rr3 + sc4*rr5;
+    real gf5 = 2*(-atom2.posq.w*rr5+sc4*rr7);
+#endif
+#if defined F1 || defined T3
+    real gf3 =  atom1.posq.w*rr3 + sc3*rr5;
+    real gf6 = 2*(-atom1.posq.w*rr5-sc3*rr7);
+#endif
+#ifdef F1
+    real em = mScale*(rr1*gl0 + rr3*(gl1+gl6) + rr5*gl2);
+    real ei = 0.5f*((gli1+gli6)*psc3 + gli2*psc5);
+    energy = em+ei;
+#endif
+#ifdef F1
+    real ftm2_0 = mScale*(gf1*xr + gf2*atom1.dipole.x + gf3*atom2.dipole.x);
+    real ftm2_1 = mScale*(gf1*yr + gf2*atom1.dipole.y + gf3*atom2.dipole.y);
+    real ftm2_2 = mScale*(gf1*zr + gf2*atom1.dipole.z + gf3*atom2.dipole.z);
+    real gfi1 = rr2*(1.5f*((gli1+gli6)*psc3 + (glip1+glip6)*dsc3 + scip2*scale3i) + 2.5f*(gli2*psc5 + glip2*dsc5 - (sci3*scip4+scip3*sci4)*scale5i));
+    ftm2i_0 += gfi1*xr;
+    ftm2i_1 += gfi1*yr;
+    ftm2i_2 += gfi1*zr;
+#endif
+#if defined F1 || defined T1
+    real gfi5 = (sci4*psc7 + scip4*dsc7);
+#endif
+#if defined F1 || defined T3
+    real gfi6 = -(sci3*psc7 + scip3*dsc7);
+#endif
+#ifdef F1
+    ftm2i_0 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.x*psc3 + atom1.inducedDipolePolar.x*dsc3) +
+               sc4*(atom1.inducedDipole.x*psc5 + atom1.inducedDipolePolar.x*dsc5)) +
+               0.5f*(atom1.posq.w*(atom2.inducedDipole.x*psc3+atom2.inducedDipolePolar.x*dsc3) +
+               sc3*(atom2.inducedDipole.x*psc5 +atom2.inducedDipolePolar.x*dsc5)) +
+               scale5i*(sci4*atom1.inducedDipolePolar.x+scip4*atom1.inducedDipole.x +
+                        sci3*atom2.inducedDipolePolar.x+scip3*atom2.inducedDipole.x)*0.5f +
+               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.x +
+               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.x;
+    ftm2i_1 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.y*psc3 + atom1.inducedDipolePolar.y*dsc3) +
+               sc4*(atom1.inducedDipole.y*psc5 + atom1.inducedDipolePolar.y*dsc5)) +
+               (atom1.posq.w*(atom2.inducedDipole.y*psc3+atom2.inducedDipolePolar.y*dsc3) +
+                    sc3*(atom2.inducedDipole.y*psc5+atom2.inducedDipolePolar.y*dsc5))*0.5f +
+                    scale5i*(sci4*atom1.inducedDipolePolar.y+scip4*atom1.inducedDipole.y + sci3*atom2.inducedDipolePolar.y+scip3*atom2.inducedDipole.y)*0.5f +
+               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.y +
+               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.y;
+    ftm2i_2 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.z*psc3 + atom1.inducedDipolePolar.z*dsc3) +
+               sc4*(atom1.inducedDipole.z*psc5 + atom1.inducedDipolePolar.z*dsc5)) +
+               (atom1.posq.w*(atom2.inducedDipole.z*psc3+atom2.inducedDipolePolar.z*dsc3) +
+                    sc3*(atom2.inducedDipole.z*psc5+atom2.inducedDipolePolar.z*dsc5))*0.5f +
+                    scale5i*(sci4*atom1.inducedDipolePolar.z+scip4*atom1.inducedDipole.z +
+                    sci3*atom2.inducedDipolePolar.z+scip3*atom2.inducedDipole.z)*0.5f +
+               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.z +
+               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.z;
+#ifdef DIRECT_POLARIZATION
+    real gfd = 0.5*(3*rr2*scip2*scale3i - 5*rr2*(scip3*sci4+sci3*scip4)*scale5i);
+    real temp5 = 0.5*scale5i;
+    real fdir_0 = gfd*xr + temp5*(sci4*atom1.inducedDipolePolar.x + scip4*atom1.inducedDipole.x + sci3*atom2.inducedDipolePolar.x + scip3*atom2.inducedDipole.x);
+    real fdir_1 = gfd*yr + temp5*(sci4*atom1.inducedDipolePolar.y + scip4*atom1.inducedDipole.y + sci3*atom2.inducedDipolePolar.y + scip3*atom2.inducedDipole.y);
+    real fdir_2 = gfd*zr + temp5*(sci4*atom1.inducedDipolePolar.z + scip4*atom1.inducedDipole.z + sci3*atom2.inducedDipolePolar.z + scip3*atom2.inducedDipole.z);
+    ftm2i_0 -= fdir_0;
+    ftm2i_1 -= fdir_1;
+    ftm2i_2 -= fdir_2;
+#else
+    real scaleF = 0.5f*uScale;
+    real inducedFactor3 = scip2*rr3*scaleF;
+    real inducedFactor5 = (sci3*scip4+scip3*sci4)*rr5*scaleF;
+    real findmp_0 = inducedFactor3*ddsc3_0 - inducedFactor5*ddsc5_0;
+    real findmp_1 = inducedFactor3*ddsc3_1 - inducedFactor5*ddsc5_1;
+    real findmp_2 = inducedFactor3*ddsc3_2 - inducedFactor5*ddsc5_2;
+    ftm2i_0 -= findmp_0;
+    ftm2i_1 -= findmp_1;
+    ftm2i_2 -= findmp_2;
+#endif
+#endif
+#if defined T1
+    real gti2 = 0.5f*(sci4*psc5+scip4*dsc5);
+    real gti5 = gfi5;
+#endif
+#if defined T3
+    real gti3 = 0.5f*(sci3*psc5+scip3*dsc5);
+    real gti6 = gfi6;
+#endif
+#if defined T1 || defined T3
+    real dixdk_0 = atom1.dipole.y*atom2.dipole.z - atom1.dipole.z*atom2.dipole.y;
+    real dixdk_1 = atom1.dipole.z*atom2.dipole.x - atom1.dipole.x*atom2.dipole.z;
+    real dixdk_2 = atom1.dipole.x*atom2.dipole.y - atom1.dipole.y*atom2.dipole.x;
+#if defined T1
+    real dixuk_0 = atom1.dipole.y*atom2.inducedDipole.z - atom1.dipole.z*atom2.inducedDipole.y;
+    real dixuk_1 = atom1.dipole.z*atom2.inducedDipole.x - atom1.dipole.x*atom2.inducedDipole.z;
+    real dixuk_2 = atom1.dipole.x*atom2.inducedDipole.y - atom1.dipole.y*atom2.inducedDipole.x;
+#endif
+#endif
+#ifdef T1
+    real dixukp_0 = atom1.dipole.y*atom2.inducedDipolePolar.z - atom1.dipole.z*atom2.inducedDipolePolar.y;
+    real dixukp_1 = atom1.dipole.z*atom2.inducedDipolePolar.x - atom1.dipole.x*atom2.inducedDipolePolar.z;
+    real dixukp_2 = atom1.dipole.x*atom2.inducedDipolePolar.y - atom1.dipole.y*atom2.inducedDipolePolar.x;
+#endif
+#ifdef T1
+    real dixr_0 = atom1.dipole.y*zr - atom1.dipole.z*yr;
+    real dixr_1 = atom1.dipole.z*xr - atom1.dipole.x*zr;
+    real dixr_2 = atom1.dipole.x*yr - atom1.dipole.y*xr;
+#endif
+#ifdef T1
+    real ttm2_0 = -rr3*dixdk_0 + gf2*dixr_0;
+    real ttm2_1 = -rr3*dixdk_1 + gf2*dixr_1;
+    real ttm2_2 = -rr3*dixdk_2 + gf2*dixr_2;
+    real ttm2i_0 = -(dixuk_0*psc3+dixukp_0*dsc3)*0.5f + gti2*dixr_0;
+    real ttm2i_1 = -(dixuk_1*psc3+dixukp_1*dsc3)*0.5f + gti2*dixr_1;
+    real ttm2i_2 = -(dixuk_2*psc3+dixukp_2*dsc3)*0.5f + gti2*dixr_2;
+#endif
+#ifdef T3
+    real dkxr_0 = atom2.dipole.y*zr - atom2.dipole.z*yr;
+    real dkxr_1 = atom2.dipole.z*xr - atom2.dipole.x*zr;
+    real dkxr_2 = atom2.dipole.x*yr - atom2.dipole.y*xr;
+    real dkxui_0 = atom2.dipole.y*atom1.inducedDipole.z - atom2.dipole.z*atom1.inducedDipole.y;
+    real dkxui_1 = atom2.dipole.z*atom1.inducedDipole.x - atom2.dipole.x*atom1.inducedDipole.z; 
+    real dkxui_2 = atom2.dipole.x*atom1.inducedDipole.y - atom2.dipole.y*atom1.inducedDipole.x;
+    real dkxuip_0 = atom2.dipole.y*atom1.inducedDipolePolar.z - atom2.dipole.z*atom1.inducedDipolePolar.y;
+    real dkxuip_1 = atom2.dipole.z*atom1.inducedDipolePolar.x - atom2.dipole.x*atom1.inducedDipolePolar.z;
+    real dkxuip_2 = atom2.dipole.x*atom1.inducedDipolePolar.y - atom2.dipole.y*atom1.inducedDipolePolar.x;
+    real ttm3_0 =  rr3*dixdk_0 + gf3*dkxr_0;
+    real ttm3_1 =  rr3*dixdk_1 + gf3*dkxr_1;
+    real ttm3_2 =  rr3*dixdk_2 + gf3*dkxr_2;
+    real ttm3i_0 = -(dkxui_0*psc3+ dkxuip_0*dsc3)*0.5f + gti3*dkxr_0;
+    real ttm3i_1 = -(dkxui_1*psc3+ dkxuip_1*dsc3)*0.5f + gti3*dkxr_1;
+    real ttm3i_2 = -(dkxui_2*psc3+ dkxuip_2*dsc3)*0.5f + gti3*dkxr_2;
+#endif
+    if (mScale < 1) {
+#ifdef T1
+        ttm2_0 *= mScale;
+        ttm2_1 *= mScale;
+        ttm2_2 *= mScale;
+#endif
+#ifdef T3
+        ttm3_0 *= mScale;
+        ttm3_1 *= mScale;
+        ttm3_2 *= mScale;
+#endif
+    }
+#ifdef F1
+    outputForce.x = -(ftm2_0+ftm2i_0);
+    outputForce.y = -(ftm2_1+ftm2i_1);
+    outputForce.z = -(ftm2_2+ftm2i_2);
+#endif
+#ifdef T1
+    outputForce.x = (ttm2_0 + ttm2i_0);
+    outputForce.y = (ttm2_1 + ttm2i_1);
+    outputForce.z = (ttm2_2 + ttm2i_2);
+#endif
+#ifdef T3
+    outputForce.x = (ttm3_0 + ttm3i_0);
+    outputForce.y = (ttm3_1 + ttm3i_1);
+    outputForce.z = (ttm3_2 + ttm3i_2);
+#endif
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
@@ -3,8 +3,10 @@
 typedef struct {
    real4 posq;
    real3 force, dipole, inducedDipole, inducedDipolePolar;
+#ifdef INCLUDE_QUADRUPOLES
    real quadrupoleXX, quadrupoleXY, quadrupoleXZ;
    real quadrupoleYY, quadrupoleYZ;
+#endif
    float thole, damp;
 } AtomData;
@@ -18,11 +20,13 @@ inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __res
    data.dipole.x = labFrameDipole[atom*3];
    data.dipole.y = labFrameDipole[atom*3+1];
    data.dipole.z = labFrameDipole[atom*3+2];
+#ifdef INCLUDE_QUADRUPOLES
    data.quadrupoleXX = labFrameQuadrupole[atom*5];
    data.quadrupoleXY = labFrameQuadrupole[atom*5+1];
    data.quadrupoleXZ = labFrameQuadrupole[atom*5+2];
    data.quadrupoleYY = labFrameQuadrupole[atom*5+3];
    data.quadrupoleYZ = labFrameQuadrupole[atom*5+4];
+#endif
    data.inducedDipole.x = inducedDipole[atom*3];
    data.inducedDipole.y = inducedDipole[atom*3+1];
    data.inducedDipole.z = inducedDipole[atom*3+2];
@@ -92,11 +96,13 @@ extern "C" __global__ void computeElectrostatics(
            localData[threadIdx.x].posq = data.posq;
            localData[threadIdx.x].dipole = data.dipole;
+#ifdef INCLUDE_QUADRUPOLES
            localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
            localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
            localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
            localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
            localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
+#endif
            localData[threadIdx.x].inducedDipole = data.inducedDipole;
            localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
            localData[threadIdx.x].thole = data.thole;

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleFixedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleFixedField.cu
@@ -3,8 +3,10 @@
 typedef struct {
    real4 posq;
    real3 field, fieldPolar, dipole;
+#ifdef INCLUDE_QUADRUPOLES
    real quadrupoleXX, quadrupoleXY, quadrupoleXZ;
    real quadrupoleYY, quadrupoleYZ, quadrupoleZZ;
+#endif
    float thole, damp;
 #ifdef USE_GK
    real3 gkField;
@@ -17,12 +19,14 @@ inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __res
    data.dipole.x = labFrameDipole[atom*3];
    data.dipole.y = labFrameDipole[atom*3+1];
    data.dipole.z = labFrameDipole[atom*3+2];
+#ifdef INCLUDE_QUADRUPOLES
    data.quadrupoleXX = labFrameQuadrupole[atom*5];
    data.quadrupoleXY = labFrameQuadrupole[atom*5+1];
    data.quadrupoleXZ = labFrameQuadrupole[atom*5+2];
    data.quadrupoleYY = labFrameQuadrupole[atom*5+3];
    data.quadrupoleYZ = labFrameQuadrupole[atom*5+4];
    data.quadrupoleZZ = -(data.quadrupoleXX+data.quadrupoleYY);
+#endif
    float2 temp = dampingAndThole[atom];
    data.damp = temp.x;
    data.thole = temp.y;
@@ -85,15 +89,15 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
        real prr7 = 15*(1-psc7)/r7;
        real dir = dot(atom1.dipole, deltaR);
+        real dkr = dot(atom2.dipole, deltaR);
+#ifdef INCLUDE_QUADRUPOLES
        real3 qi;
        qi.x = atom1.quadrupoleXX*deltaR.x + atom1.quadrupoleXY*deltaR.y + atom1.quadrupoleXZ*deltaR.z;
        qi.y = atom1.quadrupoleXY*deltaR.x + atom1.quadrupoleYY*deltaR.y + atom1.quadrupoleYZ*deltaR.z;
        qi.z = atom1.quadrupoleXZ*deltaR.x + atom1.quadrupoleYZ*deltaR.y + atom1.quadrupoleZZ*deltaR.z;
        real qir = dot(qi, deltaR);
-        real dkr = dot(atom2.dipole, deltaR);
        real3 qk;
        qk.x = atom2.quadrupoleXX*deltaR.x + atom2.quadrupoleXY*deltaR.y + atom2.quadrupoleXZ*deltaR.z;
        qk.y = atom2.quadrupoleXY*deltaR.x + atom2.quadrupoleYY*deltaR.y + atom2.quadrupoleYZ*deltaR.z;
@@ -106,7 +110,14 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
        real3 fkd = deltaR*(drr3*atom1.posq.w+drr5*dir+drr7*qir) - drr3*atom1.dipole - 2*drr5*qi;
        real3 fip = -deltaR*(prr3*atom2.posq.w-prr5*dkr+prr7*qkr) - prr3*atom2.dipole + 2*prr5*qk;
        real3 fkp = deltaR*(prr3*atom1.posq.w+prr5*dir+prr7*qir) - prr3*atom1.dipole - 2*prr5*qi;
+#else
+        real3 fim = -deltaR*(bn1*atom2.posq.w-bn2*dkr) - bn1*atom2.dipole;
+        real3 fkm = deltaR*(bn1*atom1.posq.w+bn2*dir) - bn1*atom1.dipole;
+        real3 fid = -deltaR*(drr3*atom2.posq.w-drr5*dkr) - drr3*atom2.dipole;
+        real3 fkd = deltaR*(drr3*atom1.posq.w+drr5*dir) - drr3*atom1.dipole;
+        real3 fip = -deltaR*(prr3*atom2.posq.w-prr5*dkr) - prr3*atom2.dipole;
+        real3 fkp = deltaR*(prr3*atom1.posq.w+prr5*dir) - prr3*atom1.dipole;
+#endif
        // increment the field at each site due to this interaction
        fields[0] = fim-fid;
@@ -153,29 +164,34 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
    real rr5_2 = 2*rr5;
-    real3 qDotDelta;
+    real dir = dot(atom1.dipole, deltaR);
-    qDotDelta.x = deltaR.x*atom2.quadrupoleXX + deltaR.y*atom2.quadrupoleXY + deltaR.z*atom2.quadrupoleXZ;
+    real dkr = dot(atom2.dipole, deltaR);
-    qDotDelta.y = deltaR.x*atom2.quadrupoleXY + deltaR.y*atom2.quadrupoleYY + deltaR.z*atom2.quadrupoleYZ;
-    qDotDelta.z = deltaR.x*atom2.quadrupoleXZ + deltaR.y*atom2.quadrupoleYZ + deltaR.z*atom2.quadrupoleZZ;
+#ifdef INCLUDE_QUADRUPOLES
+    real3 qi;
-    real dotdd = dot(deltaR, atom2.dipole);
+    qi.x = atom1.quadrupoleXX*deltaR.x + atom1.quadrupoleXY*deltaR.y + atom1.quadrupoleXZ*deltaR.z;
-    real dotqd = dot(deltaR, qDotDelta);
+    qi.y = atom1.quadrupoleXY*deltaR.x + atom1.quadrupoleYY*deltaR.y + atom1.quadrupoleYZ*deltaR.z;
+    qi.z = atom1.quadrupoleXZ*deltaR.x + atom1.quadrupoleYZ*deltaR.y + atom1.quadrupoleZZ*deltaR.z;
-    real factor = -rr3*atom2.posq.w + rr5*dotdd - rr7*dotqd;
+    real qir = dot(qi, deltaR);
-    real3 field1 = deltaR*factor - rr3*atom2.dipole + rr5_2*qDotDelta;
+    real3 qk;
+    qk.x = atom2.quadrupoleXX*deltaR.x + atom2.quadrupoleXY*deltaR.y + atom2.quadrupoleXZ*deltaR.z;
+    qk.y = atom2.quadrupoleXY*deltaR.x + atom2.quadrupoleYY*deltaR.y + atom2.quadrupoleYZ*deltaR.z;
+    qk.z = atom2.quadrupoleXZ*deltaR.x + atom2.quadrupoleYZ*deltaR.y + atom2.quadrupoleZZ*deltaR.z;
+    real qkr = dot(qk, deltaR);
+    real factor = -rr3*atom2.posq.w + rr5*dkr - rr7*qkr;
+    real3 field1 = deltaR*factor - rr3*atom2.dipole + rr5_2*qk;
+    factor = rr3*atom1.posq.w + rr5*dir + rr7*qir;
+    real3 field2 = deltaR*factor - rr3*atom1.dipole - rr5_2*qi;
+#else
+    real factor = -rr3*atom2.posq.w + rr5*dkr;
+    real3 field1 = deltaR*factor - rr3*atom2.dipole;
+    factor = rr3*atom1.posq.w + rr5*dir;
+    real3 field2 = deltaR*factor - rr3*atom1.dipole;
+#endif
    fields[0] = dScale*field1;
    fields[1] = pScale*field1;
-    qDotDelta.x = deltaR.x*atom1.quadrupoleXX + deltaR.y*atom1.quadrupoleXY + deltaR.z*atom1.quadrupoleXZ;
-    qDotDelta.y = deltaR.x*atom1.quadrupoleXY + deltaR.y*atom1.quadrupoleYY + deltaR.z*atom1.quadrupoleYZ;
-    qDotDelta.z = deltaR.x*atom1.quadrupoleXZ + deltaR.y*atom1.quadrupoleYZ + deltaR.z*atom1.quadrupoleZZ;
-    dotdd = dot(deltaR, atom1.dipole);
-    dotqd = dot(deltaR, qDotDelta);
-    factor = rr3*atom1.posq.w + rr5*dotdd + rr7*dotqd;
-    real3 field2 = deltaR*factor - rr3*atom1.dipole - rr5_2*qDotDelta;
    fields[2] = dScale*field2;
    fields[3] = pScale*field2;
 }
@@ -200,6 +216,7 @@ __device__ void computeOneGkInteraction(AtomData& atom1, AtomData& atom2, real3
    real uyk = atom2.dipole.y;
    real uzk = atom2.dipole.z;
+#ifdef INCLUDE_QUADRUPOLES
    real qxxi = atom1.quadrupoleXX;
    real qxyi = atom1.quadrupoleXY;
    real qxzi = atom1.quadrupoleXZ;
@@ -212,7 +229,20 @@ __device__ void computeOneGkInteraction(AtomData& atom1, AtomData& atom2, real3
    real qyyk = atom2.quadrupoleYY;
    real qyzk = atom2.quadrupoleYZ;
    real qzzk = atom2.quadrupoleZZ;
+#else
+    real qxxi = 0;
+    real qxyi = 0;
+    real qxzi = 0;
+    real qyyi = 0;
+    real qyzi = 0;
+    real qzzi = 0;
+    real qxxk = 0;
+    real qxyk = 0;
+    real qxzk = 0;
+    real qyyk = 0;
+    real qyzk = 0;
+    real qzzk = 0;
+#endif
    real xr2 = delta.x*delta.x;
    real yr2 = delta.y*delta.y;
    real zr2 = delta.z*delta.z;
@@ -438,12 +468,14 @@ extern "C" __global__ void computeFixedField(
            const unsigned int localAtomIndex = threadIdx.x;
            localData[localAtomIndex].posq = data.posq;
            localData[localAtomIndex].dipole = data.dipole;
+#ifdef INCLUDE_QUADRUPOLES
            localData[localAtomIndex].quadrupoleXX = data.quadrupoleXX;
            localData[localAtomIndex].quadrupoleXY = data.quadrupoleXY;
            localData[localAtomIndex].quadrupoleXZ = data.quadrupoleXZ;
            localData[localAtomIndex].quadrupoleYY = data.quadrupoleYY;
            localData[localAtomIndex].quadrupoleYZ = data.quadrupoleYZ;
            localData[localAtomIndex].quadrupoleZZ = data.quadrupoleZZ;
+#endif
            localData[localAtomIndex].thole = data.thole;
            localData[localAtomIndex].damp = data.damp;
 #ifdef USE_GK

--- a/plugins/amoeba/platforms/cuda/src/kernels/pmeElectrostaticPairForceNoQuadrupoles.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/pmeElectrostaticPairForceNoQuadrupoles.cu
+__device__ void
+#ifdef APPLY_SCALE
+computeOneInteractionF1(
+#else
+computeOneInteractionF1NoScale(
+#endif
+        AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, real bn5, float forceFactor,
+#ifdef APPLY_SCALE
+        float dScale, float pScale, float mScale,
+#endif
+        real3& force, real& energy) {
+    real xr = delta.x;
+    real yr = delta.y;
+    real zr = delta.z;
+#ifdef APPLY_SCALE
+    real rr1 = delta.w;
+#endif
+    // set the permanent multipole and induced dipole values;
+    real ci = atom1.q;
+    real di1 = atom1.dipole.x;
+    real di2 = atom1.dipole.y;
+    real di3 = atom1.dipole.z;
+    real ck = atom2.q;
+    real dk1 = atom2.dipole.x;
+    real dk2 = atom2.dipole.y;
+    real dk3 = atom2.dipole.z;
+    real bn1 = bn.x;
+    real bn2 = bn.y;
+    real bn3 = bn.z;
+    real bn4 = bn.w;
+#ifdef APPLY_SCALE
+    real offset = 1-mScale;
+    real rr3 = rr1*rr1*rr1;
+    real gf4 = 2*(bn2 - 3*offset*rr3*rr1*rr1);
+#else
+    real gf4 = 2*bn2;
+#endif
+    real ftm21 = 0;
+    real ftm22 = 0;
+    real ftm23 = 0;
+    // calculate the scalar products for permanent components
+    real gl6 = di1*dk1 + di2*dk2 + di3*dk3;
+    real sc3 = di1*xr + di2*yr + di3*zr;
+    real sc4 = dk1*xr + dk2*yr + dk3*zr;
+    real gl0 = ci*ck;
+    real gl1 = ck*sc3 - ci*sc4;
+    real gl2 = -sc3*sc4;
+#ifdef APPLY_SCALE
+    energy += forceFactor*(-offset*rr1*gl0 + (bn1-offset*rr3)*(gl1+gl6) + (bn2-offset*(3*rr3*rr1*rr1))*gl2);
+#else
+    energy += forceFactor*(bn1*(gl1+gl6) + bn2*gl2);
+#endif
+    real gf1 = bn1*gl0 + bn2*(gl1+gl6) + bn3*gl2;
+#ifdef APPLY_SCALE
+    gf1 -= offset*(rr3*gl0 + (3*rr3*rr1*rr1)*(gl1+gl6) + (15*rr3*rr3*rr1)*gl2);
+#endif
+    ftm21 += gf1*xr;
+    ftm22 += gf1*yr;
+    ftm23 += gf1*zr;
+#ifdef APPLY_SCALE
+    real gf2 = -ck*bn1 + sc4*bn2 - offset*(-ck*rr3 + sc4*(3*rr3*rr1*rr1));
+#else
+    real gf2 = -ck*bn1 + sc4*bn2;
+#endif
+    ftm21 += gf2*di1;
+    ftm22 += gf2*di2;
+    ftm23 += gf2*di3;
+#ifdef APPLY_SCALE
+    real gf3 = ci*bn1 + sc3*bn2 - offset*(ci*rr3 + sc3*(3*rr3*rr1*rr1));
+#else
+    real gf3 = ci*bn1 + sc3*bn2;
+#endif
+    ftm21 += gf3*dk1;
+    ftm22 += gf3*dk2;
+    ftm23 += gf3*dk3;
+    force.x = ftm21;
+    force.y = ftm22;
+    force.z = ftm23;
+}
+__device__ void
+#ifdef APPLY_SCALE
+computeOneInteractionF2(
+#else
+computeOneInteractionF2NoScale(
+#endif
+        AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, float forceFactor,
+#ifdef APPLY_SCALE
+        float dScale, float pScale, float mScale,
+#endif
+        real3& force, real& energy) {
+    const float uScale = 1;
+    real xr = delta.x;
+    real yr = delta.y;
+    real zr = delta.z;
+    real rr1 = delta.w;
+    // set the permanent multipole and induced dipole values;
+    real ci = atom1.q;
+    real di1 = atom1.dipole.x;
+    real di2 = atom1.dipole.y;
+    real di3 = atom1.dipole.z;
+    real bn1 = bn.x;
+    real bn2 = bn.y;
+    real bn3 = bn.z;
+    real bn4 = bn.w;
+    real damp = atom1.damp*atom2.damp;
+    if (damp != 0) {
+        real pgamma = atom1.thole < atom2.thole ? atom1.thole : atom2.thole;
+        real ratio = RECIP(rr1*damp);
+        damp = -pgamma*ratio*ratio*ratio;
+        damp = damp < -50 ? 0 : damp;
+    }
+    real scale5 = (damp == 0) ? 1 : (1 - (1-damp)*EXP(damp));
+    real rr5 = rr1*rr1;
+          rr5 = 3*rr1*rr5*rr5;
+#ifdef APPLY_SCALE
+    real psc5 = rr5*(1 - scale5*pScale);
+    real dsc5 = rr5*(1 - scale5*dScale);
+    real usc5 = rr5*(1 - scale5*uScale);
+#else
+    real psc5 = rr5*(1 - scale5);
+#endif
+    real ftm21 = 0;
+    real ftm22 = 0;
+    real ftm23 = 0;
+    real expdamp = EXP(damp);
+    real scale3 = (damp == 0) ? 1 : (1 - expdamp);
+    real rr3 = rr1*rr1*rr1;
+#ifdef APPLY_SCALE
+    real psc3 = rr3*(1 - scale3*pScale);
+    real dsc3 = rr3*(1 - scale3*dScale);
+    real usc3 = rr3*(1 - scale3*uScale);
+#else
+    real psc3 = rr3*(1 - scale3);
+#endif
+    real scale7 = (damp == 0) ? 1 : (1 - (1-damp+0.6f*damp*damp)*expdamp);
+#ifdef APPLY_SCALE
+    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7*pScale);
+    real dsc7 = (15*rr3*rr3*rr1)*(1 - scale7*dScale);
+#else
+    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7);
+#endif
+    real sc3 = di1*xr + di2*yr + di3*zr;
+    real gfi3 = ci*bn1 + sc3*bn2;
+    real prefactor1;
+    prefactor1 = 0.5f*(ci*psc3 + sc3*psc5 - gfi3);
+    ftm21 -= prefactor1*atom2.inducedDipole.x;
+    ftm22 -= prefactor1*atom2.inducedDipole.y;
+    ftm23 -= prefactor1*atom2.inducedDipole.z;
+#ifdef APPLY_SCALE
+    prefactor1 = 0.5f*(ci*dsc3 + sc3*dsc5 - gfi3);
+#endif
+    ftm21 -= prefactor1*atom2.inducedDipolePolar.x;
+    ftm22 -= prefactor1*atom2.inducedDipolePolar.y;
+    ftm23 -= prefactor1*atom2.inducedDipolePolar.z;
+    real sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
+    energy += forceFactor*0.5f*sci4*((psc3-bn1)*ci + (psc5-bn2)*sc3);
+    real scip4 = atom2.inducedDipolePolar.x*xr + atom2.inducedDipolePolar.y*yr + atom2.inducedDipolePolar.z*zr;
+#ifndef DIRECT_POLARIZATION
+#ifdef APPLY_SCALE
+    prefactor1 = 0.5f*(bn2 - usc5);
+#else
+    prefactor1 = 0.5f*(bn2 - psc5);
+#endif
+    ftm21 += prefactor1*((sci4*atom1.inducedDipolePolar.x + scip4*atom1.inducedDipole.x));
+    ftm22 += prefactor1*((sci4*atom1.inducedDipolePolar.y + scip4*atom1.inducedDipole.y));
+    ftm23 += prefactor1*((sci4*atom1.inducedDipolePolar.z + scip4*atom1.inducedDipole.z));
+#endif
+#ifdef APPLY_SCALE
+    prefactor1 = 0.5f*(bn2*(sci4+scip4) - (sci4*psc5+scip4*dsc5)); 
+#else
+    sci4 += scip4;
+    prefactor1 = 0.5f*sci4*(bn2 - psc5); 
+#endif
+    ftm21 += prefactor1*di1;
+    ftm22 += prefactor1*di2;
+    ftm23 += prefactor1*di3;
+#ifdef APPLY_SCALE
+    real gli1 = -ci*sci4;
+    real gli2 = -sc3*sci4;
+    real glip1 = -ci*scip4;
+    real glip2 = -sc3*scip4;
+#else
+    real gli1 = -ci*sci4;
+    real gli2 = -sc3*sci4;
+#endif
+#ifdef APPLY_SCALE
+    real gfi1 = (bn2*(gli1+glip1) + bn3*(gli2+glip2));
+    gfi1 -= (rr1*rr1)*(3*(gli1*psc3 + glip1*dsc3) + 5*(gli2*psc5 + glip2*dsc5));
+#else
+    real gfi1 = bn2*gli1 + bn3*gli2;
+    gfi1 -= (rr1*rr1)*(3*gli1*psc3 + 5*gli2*psc5);
+#endif
+    gfi1 *= 0.5f;
+    ftm21 += gfi1*xr;
+    ftm22 += gfi1*yr;
+    ftm23 += gfi1*zr;
+    if (damp != 0) {
+        real expdamp = EXP(damp);
+        real temp3 = -1.5f*damp*expdamp*rr1*rr1;
+        real temp5 = -damp;
+        real temp7 = -0.2f - 0.6f*damp;
+        real ddsc31 = temp3*xr;
+        real ddsc32 = temp3*yr;
+        real ddsc33 = temp3*zr;
+        real ddsc51 = temp5*ddsc31;
+        real ddsc52 = temp5*ddsc32;
+        real ddsc53 = temp5*ddsc33;
+        real ddsc71 = temp7*ddsc51;
+        real ddsc72 = temp7*ddsc52;
+        real ddsc73 = temp7*ddsc53;
+        real rr3 = rr1*rr1*rr1;
+#ifdef APPLY_SCALE
+        temp3 = (gli1*pScale + glip1*dScale);
+        temp5 = (3*rr1*rr1)*(gli2*pScale + glip2*dScale);
+#else
+        temp3 = gli1;
+        temp5 = (3*rr1*rr1)*gli2;
+#endif
+        ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51);
+        ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52);
+        ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53);
+    }
+//K
+    real dk1 = atom2.dipole.x;
+    real dk2 = atom2.dipole.y;
+    real dk3 = atom2.dipole.z;
+    real sc4 =  dk1*xr +  dk2*yr +  dk3*zr;
+    real ck = atom2.q;
+    real gfi2 = (-ck*bn1 + sc4*bn2);
+    prefactor1 = 0.5f*(ck*psc3 - sc4*psc5 + gfi2);
+    ftm21 += prefactor1*atom1.inducedDipole.x;
+    ftm22 += prefactor1*atom1.inducedDipole.y;
+    ftm23 += prefactor1*atom1.inducedDipole.z;
+#ifdef APPLY_SCALE
+    prefactor1 = 0.5f*(ck*dsc3 - sc4*dsc5 + gfi2);
+#endif
+    ftm21 += prefactor1*atom1.inducedDipolePolar.x;
+    ftm22 += prefactor1*atom1.inducedDipolePolar.y;
+    ftm23 += prefactor1*atom1.inducedDipolePolar.z;
+    real sci3 = atom1.inducedDipole.x*xr + atom1.inducedDipole.y*yr + atom1.inducedDipole.z*zr;
+    energy += forceFactor*0.5f*sci3*(ck*(bn1-psc3) - sc4*(bn2-psc5));
+    real scip3 = atom1.inducedDipolePolar.x*xr + atom1.inducedDipolePolar.y*yr + atom1.inducedDipolePolar.z*zr;
+#ifndef DIRECT_POLARIZATION
+#ifdef APPLY_SCALE
+    prefactor1 = 0.5f*(bn2 - usc5);
+#else
+    prefactor1 = 0.5f*(bn2 - psc5);
+#endif
+    ftm21 += prefactor1*(sci3*atom2.inducedDipolePolar.x + scip3*atom2.inducedDipole.x);
+    ftm22 += prefactor1*(sci3*atom2.inducedDipolePolar.y + scip3*atom2.inducedDipole.y);
+    ftm23 += prefactor1*(sci3*atom2.inducedDipolePolar.z + scip3*atom2.inducedDipole.z);
+    real sci34;
+    sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
+    scip4 = atom2.inducedDipolePolar.x*xr + atom2.inducedDipolePolar.y*yr + atom2.inducedDipolePolar.z*zr;
+    sci34 = (sci3*scip4+scip3*sci4);
+#ifdef APPLY_SCALE
+    gfi1 = sci34*(usc5*(5*rr1*rr1) -bn3);
+#else
+    gfi1 = sci34*(psc5*(5*rr1*rr1) -bn3);
+#endif
+#else
+    gfi1 = 0;
+#endif
+#ifdef APPLY_SCALE
+    prefactor1 = 0.5f*(bn2*(sci3+scip3) - (sci3*psc5+scip3*dsc5));
+#else
+    sci3 += scip3;
+    prefactor1 = 0.5f*sci3*(bn2 - psc5);
+#endif
+    ftm21 += prefactor1*dk1;
+    ftm22 += prefactor1*dk2;
+    ftm23 += prefactor1*dk3;
+#ifdef APPLY_SCALE
+    real gfi6 = -bn3*(sci3+scip3) + (sci3*psc7+scip3*dsc7);
+#else
+    real gfi6 = sci3*(psc7 - bn3);
+#endif
+    real sci1 = atom1.inducedDipole.x*dk1 + atom1.inducedDipole.y*dk2 + atom1.inducedDipole.z*dk3 + di1*atom2.inducedDipole.x + di2*atom2.inducedDipole.y + di3*atom2.inducedDipole.z;
+    energy += forceFactor*0.5f*(sci1*(bn1-psc3));
+    real scip1 = atom1.inducedDipolePolar.x*dk1 + atom1.inducedDipolePolar.y*dk2 + atom1.inducedDipolePolar.z*dk3 + di1*atom2.inducedDipolePolar.x + di2*atom2.inducedDipolePolar.y + di3*atom2.inducedDipolePolar.z;
+#ifndef APPLY_SCALE
+        sci1 += scip1;
+#endif
+    real scip2 = atom1.inducedDipole.x*atom2.inducedDipolePolar.x +
+                                  atom1.inducedDipole.y*atom2.inducedDipolePolar.y +
+                                  atom1.inducedDipole.z*atom2.inducedDipolePolar.z +
+                                  atom2.inducedDipole.x*atom1.inducedDipolePolar.x +
+                                  atom2.inducedDipole.y*atom1.inducedDipolePolar.y +
+                                  atom2.inducedDipole.z*atom1.inducedDipolePolar.z;
+           gli1 = ck*sci3 + sci1;
+           gli2 = -sci3*sc4;
+#ifdef APPLY_SCALE
+          glip1 = ck*scip3 + scip1;
+          glip2 = -scip3*sc4;
+#endif
+#ifdef APPLY_SCALE
+    gfi1 += (bn2*(gli1+glip1) + bn3*(gli2+glip2));
+    gfi1 -= (rr1*rr1)*(3*(gli1*psc3 + glip1*dsc3) + 5*(gli2*psc5 + glip2*dsc5));
+#else
+    gfi1 += (bn2*gli1 + bn3*gli2);
+    gfi1 -= (rr1*rr1)*(3*gli1*psc3 + 5*gli2*psc5);
+#endif
+#ifndef DIRECT_POLARIZATION
+#ifdef APPLY_SCALE
+    gfi1 += scip2*(bn2 - (3*rr1*rr1)*usc3);
+#else
+    gfi1 += scip2*(bn2 - (3*rr1*rr1)*psc3);
+#endif
+#endif
+    gfi1 *= 0.5f;
+    ftm21 += gfi1*xr;
+    ftm22 += gfi1*yr;
+    ftm23 += gfi1*zr;
+    if (damp != 0) {
+        real expdamp = EXP(damp);
+        real temp3 = -1.5f*damp*expdamp*rr1*rr1;
+        real temp5 = -damp;
+        real temp7 = -0.2f - 0.6f*damp;
+        real ddsc31 = temp3*xr;
+        real ddsc32 = temp3*yr;
+        real ddsc33 = temp3*zr;
+        real ddsc51 = temp5*ddsc31;
+        real ddsc52 = temp5*ddsc32;
+        real ddsc53 = temp5*ddsc33;
+        real ddsc71 = temp7*ddsc51;
+        real ddsc72 = temp7*ddsc52;
+        real ddsc73 = temp7*ddsc53;
+        real rr3 = rr1*rr1*rr1;
+#ifdef APPLY_SCALE
+        temp3 = gli1*pScale + glip1*dScale;
+        temp5 = (3*rr1*rr1)*(gli2*pScale + glip2*dScale);
+#else
+        temp3 = gli1;
+        temp5 = (3*rr1*rr1)*gli2;
+#endif
+        ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51);
+        ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52);
+        ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53);
+#ifndef DIRECT_POLARIZATION
+#ifdef APPLY_SCALE
+        temp3 =  uScale*scip2;
+        temp5 = -(3*rr1*rr1)*uScale*sci34;
+#else
+        temp3 =  scip2;
+        temp5 = -(3*rr1*rr1)*sci34;
+#endif
+        ftm21 -= rr3*(temp3*ddsc31 + temp5*ddsc51);
+        ftm22 -= rr3*(temp3*ddsc32 + temp5*ddsc52);
+        ftm23 -= rr3*(temp3*ddsc33 + temp5*ddsc53);
+#endif
+    }
+    force.x += ftm21;
+    force.y += ftm22;
+    force.z += ftm23;
+}
+__device__ void
+#ifdef APPLY_SCALE
+computeOneInteractionT1(
+#else
+computeOneInteractionT1NoScale(
+#endif
+        AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn
+#ifdef APPLY_SCALE
+        , float dScale, float pScale, float mScale
+#endif
+        ) {
+    real xr = delta.x;
+    real yr = delta.y;
+    real zr = delta.z;
+#ifdef APPLY_SCALE
+    real rr1 = delta.w;
+#endif
+    // set the permanent multipole and induced dipole values;
+    real di1 = atom1.dipole.x;
+    real di2 = atom1.dipole.y;
+    real di3 = atom1.dipole.z;
+    real ck = atom2.q;
+    real dk1 = atom2.dipole.x;
+    real dk2 = atom2.dipole.y;
+    real dk3 = atom2.dipole.z;
+    real bn1 = bn.x;
+    real bn2 = bn.y;
+    real bn3 = bn.z;
+    real bn4 = bn.w;
+    // apply Thole polarization damping to scale factors
+#ifdef APPLY_SCALE
+    real rr2 = rr1*rr1;
+    real rr3 = rr1*rr2;
+    real rr5 = 3*rr3*rr2;
+    real rr7 = 5*rr5*rr2;
+    real rr9 = 7*rr7*rr2;
+    real scale = 1-mScale;
+    real prefactor = scale*rr3 - bn1;
+#else
+    real prefactor = -bn1;
+#endif
+    real dixdk1 = di2*dk3 - di3*dk2;
+    real ttm21 = prefactor*dixdk1;
+    real dixdk2 = di3*dk1 - di1*dk3;
+    real ttm22 = prefactor*dixdk2;
+    real dixdk3 = di1*dk2 - di2*dk1;
+    real ttm23 = prefactor*dixdk3;
+    real sc4 = dk1*xr + dk2*yr + dk3*zr;
+    real sc6 = 0;
+    real gf2 = -ck*bn1 + sc4*bn2;
+#ifdef APPLY_SCALE
+    real gfr2 = -ck*rr3 + sc4*rr5;
+    prefactor = (gf2 - scale*gfr2);
+#else
+    prefactor = gf2;
+#endif
+    ttm21 += prefactor*(di2*zr - di3*yr);
+    ttm22 += prefactor*(di3*xr - di1*zr);
+    ttm23 += prefactor*(di1*yr - di2*xr);
+    atom1.torque.x += ttm21;
+    atom1.torque.y += ttm22;
+    atom1.torque.z += ttm23;
+}
+__device__ void
+#ifdef APPLY_SCALE
+computeOneInteractionT2(
+#else
+computeOneInteractionT2NoScale(
+#endif
+        AtomData& atom1, volatile AtomData& atom2, const real4 delta, const real4 bn
+#ifdef APPLY_SCALE
+        , float dScale, float pScale, float mScale
+#endif
+        ) {
+    real xr = delta.x;
+    real yr = delta.y;
+    real zr = delta.z;
+    real rr1 = delta.w;
+    // set the permanent multipole and induced dipole values;
+    real di1 = atom1.dipole.x;
+    real di2 = atom1.dipole.y;
+    real di3 = atom1.dipole.z;
+    real bn1 = bn.x;
+    real bn2 = bn.y;
+    real bn3 = bn.z;
+    // apply Thole polarization damping to scale factors
+    real scale3 = 1;
+    real scale5 = 1;
+    real scale7 = 1;
+    real damp = atom1.damp*atom2.damp;
+    if (damp != 0) {
+        real pgamma = atom1.thole < atom2.thole ? atom1.thole : atom2.thole;
+        real ratio = RECIP(rr1*damp);
+        damp = -pgamma*ratio*ratio*ratio;
+        real expdamp = EXP(damp);
+        scale3 = 1 - expdamp;
+        scale5 = 1 - (1-damp)*expdamp;
+        scale7 = 1 - (1-damp+0.6f*damp*damp)*expdamp;
+    }
+    real rr3 = rr1*rr1*rr1;
+#ifdef APPLY_SCALE
+    real dsc3 = rr3*(1 - scale3*dScale);
+    real dsc5 = (3*rr3*rr1*rr1)* (1 - scale5*dScale);
+    real dsc7 = (15*rr3*rr3*rr1)*(1 - scale7*dScale);
+    real psc3 = rr3*(1 - scale3*pScale);
+    real psc5 = (3*rr3*rr1*rr1)*(1 - scale5*pScale);
+    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7*pScale);
+#else
+    real psc3 = rr3*(1 - scale3);
+    real psc5 = (3*rr3*rr1*rr1)*(1 - scale5);
+    real psc7 = (15*rr3*rr3*rr1)*(1 - scale7);
+#endif
+    real prefactor1 = 0.5f*(psc3 - bn1);
+#ifdef APPLY_SCALE
+    real prefactor2 = 0.5f*(dsc3 - bn1);
+#endif
+    real dixuk1 = di2*atom2.inducedDipole.z - di3*atom2.inducedDipole.y;
+    real dixukp1 = di2*atom2.inducedDipolePolar.z - di3*atom2.inducedDipolePolar.y;
+#ifdef APPLY_SCALE
+    real ttm2i1 = prefactor1*dixuk1 + prefactor2*dixukp1;
+#else
+    real ttm2i1 = prefactor1*(dixuk1 + dixukp1);
+#endif
+    real dixuk2 = di3*atom2.inducedDipole.x - di1*atom2.inducedDipole.z;
+    real dixukp2 = di3*atom2.inducedDipolePolar.x - di1*atom2.inducedDipolePolar.z;
+#ifdef APPLY_SCALE
+    real ttm2i2 = prefactor1*dixuk2 + prefactor2*dixukp2;
+#else
+    real ttm2i2 = prefactor1*(dixuk2 + dixukp2);
+#endif
+    real dixuk3 = di1*atom2.inducedDipole.y - di2*atom2.inducedDipole.x;
+    real dixukp3 = di1*atom2.inducedDipolePolar.y - di2*atom2.inducedDipolePolar.x;
+#ifdef APPLY_SCALE
+    real ttm2i3 = prefactor1*dixuk3 + prefactor2*dixukp3;
+#else
+    real ttm2i3 = prefactor1*(dixuk3 + dixukp3);
+#endif
+    real sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
+    real scip4 = atom2.inducedDipolePolar.x*xr + atom2.inducedDipolePolar.y*yr + atom2.inducedDipolePolar.z*zr;
+    real gti2 = bn2*(sci4+scip4);
+#ifdef APPLY_SCALE
+    real gtri2 = (sci4*psc5+scip4*dsc5);
+#else
+    real gtri2 = psc5*(sci4+scip4);
+#endif
+    prefactor1 = 0.5f*(gti2 - gtri2);
+    ttm2i1 += prefactor1*(di2*zr - di3*yr);
+    ttm2i2 += prefactor1*(di3*xr - di1*zr);
+    ttm2i3 += prefactor1*(di1*yr - di2*xr);
+    atom1.torque.x += ttm2i1;
+    atom1.torque.y += ttm2i2;
+    atom1.torque.z += ttm2i3;
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
@@ -2,9 +2,12 @@
 typedef struct {
    real3 pos, force, torque, dipole, inducedDipole, inducedDipolePolar;
-    real q, quadrupoleXX, quadrupoleXY, quadrupoleXZ;
+    real q;
-    real quadrupoleYY, quadrupoleYZ;
+    float thole, damp;
-    float thole, damp, padding;
+#ifdef INCLUDE_QUADRUPOLES
+    real quadrupoleXX, quadrupoleXY, quadrupoleXZ, quadrupoleYY, quadrupoleYZ;
+    float padding;
+#endif
 } AtomData;
 __device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom2, real4 delta, real4 bn, real bn5, float forceFactor, float dScale, float pScale, float mScale, real3& force, real& energy);
@@ -24,11 +27,13 @@ inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __res
    data.dipole.x = labFrameDipole[atom*3];
    data.dipole.y = labFrameDipole[atom*3+1];
    data.dipole.z = labFrameDipole[atom*3+2];
+#ifdef INCLUDE_QUADRUPOLES
    data.quadrupoleXX = labFrameQuadrupole[atom*5];
    data.quadrupoleXY = labFrameQuadrupole[atom*5+1];
    data.quadrupoleXZ = labFrameQuadrupole[atom*5+2];
    data.quadrupoleYY = labFrameQuadrupole[atom*5+3];
    data.quadrupoleYZ = labFrameQuadrupole[atom*5+4];
+#endif
    data.inducedDipole.x = inducedDipole[atom*3];
    data.inducedDipole.y = inducedDipole[atom*3+1];
    data.inducedDipole.z = inducedDipole[atom*3+2];
@@ -158,12 +163,16 @@ __device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
    real fterm = -EWALD_ALPHA/SQRT_PI;
    real cii = atom1.q*atom1.q;
    real dii = dot(atom1.dipole, atom1.dipole);
+#ifdef INCLUDE_QUADRUPOLES
    real qii = 2*(atom1.quadrupoleXX*atom1.quadrupoleXX +
                  atom1.quadrupoleYY*atom1.quadrupoleYY +
                  atom1.quadrupoleXX*atom1.quadrupoleYY +
                  atom1.quadrupoleXY*atom1.quadrupoleXY +
                  atom1.quadrupoleXZ*atom1.quadrupoleXZ +
                  atom1.quadrupoleYZ*atom1.quadrupoleYZ);
+#else
+    real qii = 0;
+#endif
    real uii = dot(atom1.dipole, atom1.inducedDipole);
    real selfEnergy = (cii + term*(dii/3 + 2*term*qii/5));
    selfEnergy += term*uii/3;
@@ -216,11 +225,13 @@ extern "C" __global__ void computeElectrostatics(
            localData[threadIdx.x].pos = data.pos;
            localData[threadIdx.x].q = data.q;
            localData[threadIdx.x].dipole = data.dipole;
+#ifdef INCLUDE_QUADRUPOLES
            localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
            localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
            localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
            localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
            localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
+#endif
            localData[threadIdx.x].inducedDipole = data.inducedDipole;
            localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
            localData[threadIdx.x].thole = data.thole;

--- a/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaMultipoleForce.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaMultipoleForce.cpp
@@ -50,6 +50,8 @@
 using namespace OpenMM;
+using namespace std;
 const double TOL = 1e-4;
 extern "C" void registerAmoebaCudaKernelFactories();
@@ -2859,6 +2861,75 @@ static void testMultipoleGridPotential( FILE* log ) {
 }
+/**
+ * Test whether the alternate kernels with no quadrupoles work correctly.
+ */
+static void testNoQuadrupoles(bool usePme) {
+    string testName      = "testNoQuadrupoles";
+    int inputPmeGridDimension = 10;
+    double cutoff             = 0.3;
+    System system;
+    AmoebaMultipoleForce* amoebaMultipoleForce = new AmoebaMultipoleForce();;
+    setupMultipoleAmmonia(system, amoebaMultipoleForce, usePme ? AmoebaMultipoleForce::PME : AmoebaMultipoleForce::NoCutoff, AmoebaMultipoleForce::Direct, 
+                                             cutoff, inputPmeGridDimension);
+    // First compute forces with quadrupoles.  This ensures they will be included in the kernel.
+    LangevinIntegrator integrator(0.0, 0.1, 0.01);
+    Context context(system, integrator, Platform::getPlatformByName("CUDA"));
+    vector<Vec3> forces1;
+    double energy1;
+    getForcesEnergyMultipoleAmmonia(context, forces1, energy1);
+    // Now set all quadrupoles to zero and recalculate.
+    for (int i = 0; i < system.getNumParticles(); i++) {
+        double charge, thole, damping, polarity;
+        int axisType, atomX, atomY, atomZ;
+        vector<double> dipole, quadrupole;
+        amoebaMultipoleForce->getMultipoleParameters(i, charge, dipole, quadrupole, axisType, atomZ, atomX, atomY, thole, damping, polarity);
+        for (int j = 0; j < (int) quadrupole.size(); j++)
+            quadrupole[j] = 0;
+        amoebaMultipoleForce->setMultipoleParameters(i, charge, dipole, quadrupole, axisType, atomZ, atomX, atomY, thole, damping, polarity);
+    }
+    amoebaMultipoleForce->updateParametersInContext(context);
+    vector<Vec3> forces2;
+    double energy2;
+    getForcesEnergyMultipoleAmmonia(context, forces2, energy2);
+    ASSERT(energy1 != energy2);
+    // Create a new context with no quadrupoles from the beginning.
+    LangevinIntegrator integrator2(0.0, 0.1, 0.01);
+    Context context2(system, integrator2, Platform::getPlatformByName("CUDA"));
+    vector<Vec3> forces3;
+    double energy3;
+    getForcesEnergyMultipoleAmmonia(context2, forces3, energy3);
+    double tolerance = 1e-5;
+    compareForcesEnergy(testName, energy2, energy3, forces2, forces3, tolerance, NULL);
+    // If we try to set a quadrupole, that should produce and exception.
+    double charge, thole, damping, polarity;
+    int axisType, atomX, atomY, atomZ;
+    vector<double> dipole, quadrupole;
+    amoebaMultipoleForce->getMultipoleParameters(0, charge, dipole, quadrupole, axisType, atomZ, atomX, atomY, thole, damping, polarity);
+    quadrupole[0] = 1.0;
+    amoebaMultipoleForce->setMultipoleParameters(0, charge, dipole, quadrupole, axisType, atomZ, atomX, atomY, thole, damping, polarity);
+    bool threwException = false;
+    try {
+        amoebaMultipoleForce->updateParametersInContext(context2);
+    }
+    catch (OpenMMException& ex) {
+        threwException = true;
+    }
+    ASSERT(threwException);
+}
 int main(int argc, char* argv[]) {
    try {
        std::cout << "TestCudaAmoebaMultipoleForce running test..." << std::endl;
@@ -2902,6 +2973,9 @@ int main(int argc, char* argv[]) {
        // large box of water
        testPMEMutualPolarizationLargeWater( log );
+        testNoQuadrupoles(false);
+        testNoQuadrupoles(true);
    } catch(const std::exception& e) {
        std::cout << "exception: " << e.what() << std::endl;