Continuing to convert AmoebaMultipoleForce

776f6f22 · Peter Eastman · b186314c · 776f6f22 · 776f6f22 · 776f6f22
Commit 776f6f22 authored Jul 28, 2012 by Peter Eastman
8 changed files
--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
@@ -865,7 +865,7 @@ private:
 CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : 
        CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false),
-        multipoleParticles(NULL), torqueBufferIndices(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL),
+        multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL),
        labFrameDipoles(NULL), labFrameQuadrupoles(NULL), field(NULL), fieldPolar(NULL), dampingAndThole(NULL),
        inducedDipole(NULL), inducedDipolePolar(NULL), currentEpsilon(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
        pmeGrid(NULL) {
@@ -875,8 +875,6 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
    cu.setAsCurrent();
    if (multipoleParticles != NULL)
        delete multipoleParticles;
-    if (torqueBufferIndices != NULL)
-        delete torqueBufferIndices;
    if (molecularDipoles != NULL)
        delete molecularDipoles;
    if (molecularQuadrupoles != NULL)
@@ -933,8 +931,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        multipoleParticlesVec.push_back(make_int4(atomX, atomY, atomZ, axisType));
        for (int j = 0; j < 3; j++)
            molecularDipolesVec.push_back((float) dipole[j]);
-        for (int j = 0; j < 9; j++)
+        molecularQuadrupolesVec.push_back((float) quadrupole[0]);
-            molecularQuadrupolesVec.push_back((float) quadrupole[j]);
+        molecularQuadrupolesVec.push_back((float) quadrupole[1]);
+        molecularQuadrupolesVec.push_back((float) quadrupole[2]);
+        molecularQuadrupolesVec.push_back((float) quadrupole[4]);
+        molecularQuadrupolesVec.push_back((float) quadrupole[5]);
    }
    int paddedNumAtoms = cu.getPaddedNumAtoms();
    for (int i = numMultipoles; i < paddedNumAtoms; i++) {
@@ -943,14 +944,14 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        multipoleParticlesVec.push_back(make_int4(0, 0, 0, 0));
        for (int j = 0; j < 3; j++)
            molecularDipolesVec.push_back(0);
-        for (int j = 0; j < 9; j++)
+        for (int j = 0; j < 5; j++)
            molecularQuadrupolesVec.push_back(0);
    }
    dampingAndThole = CudaArray::create<float2>(cu, paddedNumAtoms, "dampingAndThole");
    polarizability = CudaArray::create<float>(cu, paddedNumAtoms, "polarizability");
    multipoleParticles = CudaArray::create<int4>(cu, paddedNumAtoms, "multipoleParticles");
    molecularDipoles = CudaArray::create<float>(cu, 3*paddedNumAtoms, "molecularDipoles");
-    molecularQuadrupoles = CudaArray::create<float>(cu, 9*paddedNumAtoms, "molecularQuadrupoles");
+    molecularQuadrupoles = CudaArray::create<float>(cu, 5*paddedNumAtoms, "molecularQuadrupoles");
    dampingAndThole->upload(dampingAndTholeVec);
    polarizability->upload(polarizabilityVec);
    multipoleParticles->upload(multipoleParticlesVec);
@@ -962,7 +963,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
    labFrameDipoles = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "labFrameDipoles");
-    labFrameQuadrupoles = new CudaArray(cu, 9*paddedNumAtoms, elementSize, "labFrameQuadrupoles");
+    labFrameQuadrupoles = new CudaArray(cu, 5*paddedNumAtoms, elementSize, "labFrameQuadrupoles");
    field = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "field");
    fieldPolar = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "fieldPolar");
    inducedDipole = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "inducedDipole");
@@ -999,6 +1000,15 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
            polarizationFlagValues.push_back(make_int2(i, atoms[j]));
    }
+    // Record other options.
+    if (force.getPolarizationType() == AmoebaMultipoleForce::Mutual) {
+        maxInducedIterations = force.getMutualInducedMaxIterations();
+        inducedEpsilon = force.getMutualInducedTargetEpsilon();
+    }
+    else
+        maxInducedIterations == 0;
    // Create the kernels.
@@ -1007,13 +1017,29 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(numMultipoles);
    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-    defines["SCALING_DISTANCE_CUTOFF"] = cu.doubleToString(50.0);
    defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
    defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
+    defines["ENERGY_SCALE_FACTOR"] = cu.doubleToString(138.9354558456); // DIVIDE BY INNER DIELECTRIC!!!
+    if (force.getPolarizationType() == AmoebaMultipoleForce::Direct)
+        defines["DIRECT_POLARIZATION"] = "";
    CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoles, defines);
    computeMomentsKernel = cu.getKernel(module, "computeLabFrameMoments");
+    recordInducedDipolesKernel = cu.getKernel(module, "recordInducedDipoles");
    module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleFixedField, defines);
    computeFixedFieldKernel = cu.getKernel(module, "computeFixedField");
+    stringstream electrostaticsSource;
+    electrostaticsSource << CudaKernelSources::vectorOps;
+    electrostaticsSource << CudaAmoebaKernelSources::multipoleElectrostatics;
+    electrostaticsSource << "#define F1\n";
+    electrostaticsSource << CudaAmoebaKernelSources::electrostaticPairForce;
+    electrostaticsSource << "#undef F1\n";
+    electrostaticsSource << "#define T1\n";
+    electrostaticsSource << CudaAmoebaKernelSources::electrostaticPairForce;
+    electrostaticsSource << "#undef T1\n";
+    electrostaticsSource << "#define T2\n";
+    electrostaticsSource << CudaAmoebaKernelSources::electrostaticPairForce;
+    module = cu.createModule(electrostaticsSource.str(), defines);
+    electrostaticsKernel = cu.getKernel(module, "computeElectrostatics");
    // Set up PME.
@@ -1389,30 +1415,34 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
    void* computeMomentsArgs[] = {&cu.getPosq().getDevicePointer(), &multipoleParticles->getDevicePointer(),
        &molecularDipoles->getDevicePointer(), &molecularQuadrupoles->getDevicePointer(),
        &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer()};
-    cu.executeKernel(computeMomentsKernel, computeMomentsArgs, cu.getPaddedNumAtoms());
+    cu.executeKernel(computeMomentsKernel, computeMomentsArgs, cu.getNumAtoms());
-    vector<float> d, q;
-    labFrameDipoles->download(d);
-    labFrameQuadrupoles->download(q);
-    for (int i = 0; i < cu.getNumAtoms(); i++)
-        printf("%d %g %g %g\n", i, d[3*i], d[3*i+1], d[3*i+2]);
-    for (int i = 0; i < cu.getNumAtoms(); i++)
-        printf("%d %g %g %g %g %g %g %g %g %g\n", i, q[9*i], q[9*i+1], q[9*i+2], q[9*i+3], q[9*i+4], q[9*i+5], q[9*i+6], q[9*i+7], q[9*i+8]);
    int startTileIndex = nb.getStartTileIndex();
    int numTileIndices = nb.getNumTiles();
    int numForceThreadBlocks = nb.getNumForceThreadBlocks();
    int forceThreadBlockSize = nb.getForceThreadBlockSize();
    if (pmeGrid == NULL) {
+        // Compute induced dipoles.
        void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
-            &nb.getExclusions().getDevicePointer(), &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
+            &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
            &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
        cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
-        vector<unsigned long long> f;
+        void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
-        field->download(f);
+            &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &polarizability->getDevicePointer()};
-        int pad = cu.getPaddedNumAtoms();
+        cu.executeKernel(recordInducedDipolesKernel, recordInducedDipolesArgs, cu.getNumAtoms());
-        for (int i = 0; i < cu.getNumAtoms(); i++) {
+        for (int i = 0; i < maxInducedIterations; i++) {
-            printf("%d %g %g %g\n", i, f[i]/(double)0xFFFFFFFF, f[i+pad]/(double)0xFFFFFFFF, f[i+pad*2]/(double)0xFFFFFFFF);
        }
+        // Compute electrostatic force.
+        void* electrostaticsArgs[] = {&cu.getForce().getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
+            &cu.getPosq().getDevicePointer(), &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
+            &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
+            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
+            &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
+        cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
    return 0.0;
 }
@@ -1598,28 +1628,22 @@ void CudaCalcAmoebaVdwForceKernel::initialize(const System& system, const Amoeba
        throw OpenMMException("Illegal combining rule for sigma: "+sigmaCombiningRule);
    string epsilonCombiningRule = force.getEpsilonCombiningRule();
    if (epsilonCombiningRule == "ARITHMETIC")
-        replacements["EPILON_COMBINING_RULE"] = "1";
+        replacements["EPSILON_COMBINING_RULE"] = "1";
    else if (epsilonCombiningRule == "GEOMETRIC")
-        replacements["EPILON_COMBINING_RULE"] = "2";
+        replacements["EPSILON_COMBINING_RULE"] = "2";
    else if (epsilonCombiningRule == "HARMONIC")
-        replacements["EPILON_COMBINING_RULE"] = "3";
+        replacements["EPSILON_COMBINING_RULE"] = "3";
    else if (epsilonCombiningRule == "HHG")
-        replacements["EPILON_COMBINING_RULE"] = "4";
+        replacements["EPSILON_COMBINING_RULE"] = "4";
    else
        throw OpenMMException("Illegal combining rule for sigma: "+sigmaCombiningRule);
    double cutoff = force.getCutoff();
    double taperCutoff = cutoff*0.9;
    replacements["CUTOFF_DISTANCE"] = cu.doubleToString(force.getCutoff());
    replacements["TAPER_CUTOFF"] = cu.doubleToString(taperCutoff);
-    double cutoff2 = cutoff*cutoff;
+    replacements["TAPER_C3"] = cu.doubleToString(10/pow(taperCutoff-cutoff, 3.0));
-    double taperCutoff2 = taperCutoff*taperCutoff;
+    replacements["TAPER_C4"] = cu.doubleToString(15/pow(taperCutoff-cutoff, 4.0));
-    double denom = pow(cutoff-taperCutoff, -5.0);
+    replacements["TAPER_C5"] = cu.doubleToString(6/pow(taperCutoff-cutoff, 5.0));
-    replacements["TAPER_C0"] = cu.doubleToString(cutoff*cutoff2 * (cutoff2-5.0*cutoff*taperCutoff+10.0*taperCutoff2)*denom);
-    replacements["TAPER_C1"] = cu.doubleToString(-30.0*cutoff2*taperCutoff2*denom);
-    replacements["TAPER_C2"] = cu.doubleToString(30.0*(cutoff2*taperCutoff+cutoff*taperCutoff2)*denom);
-    replacements["TAPER_C3"] = cu.doubleToString(-10.0*(cutoff2+4.0*cutoff*taperCutoff+taperCutoff2)*denom);
-    replacements["TAPER_C4"] = cu.doubleToString(15.0*(cutoff+taperCutoff)*denom);
-    replacements["TAPER_C5"] = cu.doubleToString(-6.0*denom);
    nonbonded->addInteraction(force.getUseNeighborList(), force.getPBC(), true, force.getCutoff(), exclusions,
        cu.replaceStrings(CudaAmoebaKernelSources::amoebaVdwForce2, replacements), force.getForceGroup());

--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
@@ -374,14 +374,14 @@ public:
 private:
    class ForceInfo;
    void initializeScaleFactors();
-    int numMultipoles;
+    int numMultipoles, maxInducedIterations;
+    double inducedEpsilon;
    bool hasInitializedScaleFactors;
    CudaContext& cu;
    System& system;
    std::vector<int3> covalentFlagValues;
    std::vector<int2> polarizationFlagValues;
    CudaArray* multipoleParticles;
-    CudaArray* torqueBufferIndices;
    CudaArray* molecularDipoles;
    CudaArray* molecularQuadrupoles;
    CudaArray* labFrameDipoles;
@@ -412,7 +412,7 @@ private:
    CudaArray* pmeAtomGridIndex;
    CudaSort* sort;
    cufftHandle fft;
-    CUfunction computeMomentsKernel, computeFixedFieldKernel;
+    CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, electrostaticsKernel;
 };
 /**

--- a/plugins/amoeba/platforms/cuda2/src/kernels/amoebaVdwForce2.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/amoebaVdwForce2.cu
@@ -15,13 +15,11 @@
    real sigma = 2*(sigmaEpsilon1.x*sigma1_2 + sigmaEpsilon2.x*sigma2_2)/(sigma1_2+sigma2_2);
 #endif
 #if EPSILON_COMBINING_RULE == 1
-    real epsilon = sigmaEpsilon1.y + sigmaEpsilon2.y;
+    real epsilon = 0.5f*(sigmaEpsilon1.y + sigmaEpsilon2.y);
 #elif EPSILON_COMBINING_RULE == 2
-    real epsilon = 2*SQRT(sigmaEpsilon1.y*sigmaEpsilon2.y);
+    real epsilon = SQRT(sigmaEpsilon1.y*sigmaEpsilon2.y);
 #elif EPSILON_COMBINING_RULE == 3
-    real epsilon1_2 = sigmaEpsilon1.x*sigmaEpsilon1.x;
+    real epsilon = 2*(sigmaEpsilon1.y*sigmaEpsilon2.y)/(sigmaEpsilon1.y+sigmaEpsilon2.y);
-    real epsilon2_2 = sigmaEpsilon2.x*sigmaEpsilon2.x;
-    real epsilon = 2*(sigmaEpsilon1.x*epsilon1_2 + sigmaEpsilon2.x*epsilon2_2)/(epsilon1_2+epsilon2_2);
 #else
    real epsilon_s = SQRT(sigmaEpsilon1.y) + SQRT(sigmaEpsilon2.y);
    real epsilon = 4*sigmaEpsilon1.y*sigmaEpsilon2.y/(epsilon_s*epsilon_s);
@@ -39,13 +37,14 @@
    real tmp = sigma7*invRho;
    real gTau = epsilon*tau7*r6*1.12f*tmp*tmp;
    real termEnergy = epsilon*sigma7*tau7*((sigma7*1.12f*invRho)-2.0f);
-    real deltaE = (-7.0f*(dTau*termEnergy+gTau))*invR;
+    real deltaE = -7.0f*(dTau*termEnergy+gTau);
    if (r > TAPER_CUTOFF) {
-        real taper = TAPER_C0+r*(TAPER_C1+r*(TAPER_C2+r*(TAPER_C3+r*(TAPER_C4+r*TAPER_C5))));
+        real x = r-TAPER_CUTOFF;
-        real dtaper = TAPER_C1+r*(2*TAPER_C2+r*(3*TAPER_C3+r*(4*TAPER_C4+r*5*TAPER_C5)));
+        real taper = 1+x*x*x*(TAPER_C3+x*(TAPER_C4+x*TAPER_C5));
+        real dtaper = x*x*(3*TAPER_C3+x*(4*TAPER_C4+x*5*TAPER_C5));
        deltaE = termEnergy*dtaper + deltaE*taper;
        termEnergy *= taper;
    }
    tempEnergy += (includeInteraction ? termEnergy : 0);
-    dEdR -= (includeInteraction ? deltaE : 0);
+    dEdR -= (includeInteraction ? deltaE*invR : 0);
 }
--- a/plugins/amoeba/platforms/cuda2/src/kernels/electrostaticPairForce.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/electrostaticPairForce.cu
+/**
+ * This defines three different closely related functions, depending on which constant (F1, T1, or T3) is defined.
+ */
+#if defined F1
+__device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real& energy, real3& outputForce) {
+#elif defined T1
+__device__ void computeOneInteractionT1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce) {
+#else
+__device__ void computeOneInteractionT3(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce) {
+#endif
+#ifdef F1
+    const float uScale = 1;
+    real ddsc3_0 = 0;
+    real ddsc3_1 = 0;
+    real ddsc3_2 = 0;
+    real ddsc5_0 = 0;
+    real ddsc5_1 = 0;
+    real ddsc5_2 = 0;
+    real ddsc7_0 = 0;
+    real ddsc7_1 = 0;
+    real ddsc7_2 = 0;
+#endif
+    real xr = atom2.posq.x - atom1.posq.x;
+    real yr = atom2.posq.y - atom1.posq.y;
+    real zr = atom2.posq.z - atom1.posq.z;
+    real r2 = xr*xr + yr*yr + zr*zr;
+    real r = SQRT(r2);
+    real rr1 = RECIP(r);
+    real rr2 = rr1*rr1;
+    real rr3 = rr1*rr2;
+    real rr5 = 3*rr3*rr2;
+    real rr7 = 5*rr5*rr2;
+    real rr9 = 7*rr7*rr2;
+#ifdef F1
+    real rr11 = 9*rr9*rr2;
+#endif
+    real scale3 = 1;
+    real scale5 = 1;
+    real scale7 = 1;
+    real pdamp = atom1.damp*atom2.damp;
+    if (pdamp != 0) {
+        real ratio = r/pdamp;
+        float pGamma = atom2.thole > atom1.thole ? atom1.thole : atom2.thole;
+        real damp = ratio*ratio*ratio*pGamma;
+        real dampExp = EXP(-damp);
+        real damp1 = damp + 1;
+        real damp2 = damp*damp;
+        scale3 = 1 - dampExp;
+        scale5 = 1 - damp1*dampExp;
+        scale7 = 1 - (damp1 + 0.6f*damp2)*dampExp;
+#ifdef F1
+        real factor = 3*damp*dampExp*rr2;
+        real factor7 = -0.2f + 0.6f*damp;
+        ddsc3_0 = factor*xr;
+        ddsc5_0 = ddsc3_0*damp;
+        ddsc7_0 = ddsc5_0*factor7;
+        ddsc3_1 = factor*yr;
+        ddsc5_1 = ddsc3_1*damp;
+        ddsc7_1 = ddsc5_1*factor7;
+        ddsc3_2 = factor*zr;
+        ddsc5_2 = ddsc3_2*damp;
+        ddsc7_2 = ddsc5_2*factor7;
+#endif
+    }
+#if defined F1
+    real scale3i = rr3*scale3*uScale;
+    real scale5i = rr5*scale5*uScale;
+#endif
+    real dsc3 = rr3*scale3*dScale;
+    real psc3 = rr3*scale3*pScale;
+    real dsc5 = rr5*scale5*dScale;
+    real psc5 = rr5*scale5*pScale;
+    real dsc7 = rr7*scale7*dScale;
+    real psc7 = rr7*scale7*pScale;
+    real atom2quadrupoleZZ = 1-atom2.quadrupoleXX-atom2.quadrupoleYY;
+    real qJr_0 = atom2.quadrupoleXX*xr + atom2.quadrupoleXY*yr + atom2.quadrupoleXZ*zr;
+    real qJr_1 = atom2.quadrupoleXY*xr + atom2.quadrupoleYY*yr + atom2.quadrupoleYZ*zr;
+    real qJr_2 = atom2.quadrupoleXZ*xr + atom2.quadrupoleYZ*yr + atom2quadrupoleZZ*zr;
+    real atom1quadrupoleZZ = 1-atom1.quadrupoleXX-atom1.quadrupoleYY;
+    real qIr_0 = atom1.quadrupoleXX*xr + atom1.quadrupoleXY*yr + atom1.quadrupoleXZ*zr;
+    real qIr_1 = atom1.quadrupoleXY*xr + atom1.quadrupoleYY*yr + atom1.quadrupoleYZ*zr;
+    real qIr_2 = atom1.quadrupoleXZ*xr + atom1.quadrupoleYZ*yr + atom1quadrupoleZZ*zr;
+#if defined F1
+    real sc2 = atom1.dipole.x*atom2.dipole.x + atom1.dipole.y*atom2.dipole.y + atom1.dipole.z*atom2.dipole.z;
+#endif
+#if defined F1 || defined T1
+    real sc4 = atom2.dipole.x*xr + atom2.dipole.y*yr + atom2.dipole.z*zr;
+    real sc6 = qJr_0*xr + qJr_1*yr + qJr_2*zr;
+#endif
+#if defined F1 || defined T3
+    real sc3 = atom1.dipole.x*xr + atom1.dipole.y*yr + atom1.dipole.z*zr;
+    real sc5 = qIr_0*xr + qIr_1*yr + qIr_2*zr;
+#endif
+#if defined F1
+    real sc7 = qIr_0*atom2.dipole.x + qIr_1*atom2.dipole.y + qIr_2*atom2.dipole.z;
+    real sc8 = qJr_0*atom1.dipole.x + qJr_1*atom1.dipole.y + qJr_2*atom1.dipole.z;
+    real sc9 = qIr_0*qJr_0 + qIr_1*qJr_1 + qIr_2*qJr_2;
+    real sc10 = atom1.quadrupoleXX*atom2.quadrupoleXX + atom1.quadrupoleXY*atom2.quadrupoleXY + atom1.quadrupoleXZ*atom2.quadrupoleXZ +
+                atom1.quadrupoleXY*atom2.quadrupoleXY + atom1.quadrupoleYY*atom2.quadrupoleYY + atom1.quadrupoleYZ*atom2.quadrupoleYZ +
+                atom1.quadrupoleXZ*atom2.quadrupoleXZ + atom1.quadrupoleYZ*atom2.quadrupoleYZ + atom1quadrupoleZZ*atom2quadrupoleZZ;
+    real sci1 = atom1.inducedDipole.x*atom2.dipole.x + atom1.inducedDipole.y*atom2.dipole.y + atom1.inducedDipole.z*atom2.dipole.z +
+                atom2.inducedDipole.x*atom1.dipole.x + atom2.inducedDipole.y*atom1.dipole.y + atom2.inducedDipole.z*atom1.dipole.z;
+#endif
+#if defined F1 || defined T3
+    real sci3 = atom1.inducedDipole.x*xr + atom1.inducedDipole.y*yr + atom1.inducedDipole.z*zr;
+#endif
+#if defined F1
+    real sci7 = qIr_0*atom2.inducedDipole.x + qIr_1*atom2.inducedDipole.y + qIr_2*atom2.inducedDipole.z;
+    real sci8 = qJr_0*atom1.inducedDipole.x + qJr_1*atom1.inducedDipole.y + qJr_2*atom1.inducedDipole.z;
+#endif
+#if defined F1 || defined T1
+    real sci4 = atom2.inducedDipole.x*xr + atom2.inducedDipole.y*yr + atom2.inducedDipole.z*zr;
+#endif
+#if defined F1
+    real scip1 = atom1.inducedDipolePolar.x*atom2.dipole.x + atom1.inducedDipolePolar.y*atom2.dipole.y + atom1.inducedDipolePolar.z*atom2.dipole.z +
+                 atom2.inducedDipolePolar.x*atom1.dipole.x + atom2.inducedDipolePolar.y*atom1.dipole.y + atom2.inducedDipolePolar.z*atom1.dipole.z;
+    real scip2 = atom1.inducedDipole.x*atom2.inducedDipolePolar.x + atom1.inducedDipole.y*atom2.inducedDipolePolar.y + atom1.inducedDipole.z*atom2.inducedDipolePolar.z +
+                 atom2.inducedDipole.x*atom1.inducedDipolePolar.x + atom2.inducedDipole.y*atom1.inducedDipolePolar.y + atom2.inducedDipole.z*atom1.inducedDipolePolar.z;
+#endif
+#if defined F1 || defined T3
+    real scip3 = ((atom1.inducedDipolePolar.x)*(xr) + (atom1.inducedDipolePolar.y)*(yr) + (atom1.inducedDipolePolar.z)*(zr));
+#endif
+#if defined F1 || defined T1
+    real scip4 = ((atom2.inducedDipolePolar.x)*(xr) + (atom2.inducedDipolePolar.y)*(yr) + (atom2.inducedDipolePolar.z)*(zr));
+#endif
+#ifdef F1
+    real scip7 = ((qIr_0)*(atom2.inducedDipolePolar.x) + (qIr_1)*(atom2.inducedDipolePolar.y) + (qIr_2)*(atom2.inducedDipolePolar.z));
+    real scip8 = ((qJr_0)*(atom1.inducedDipolePolar.x) + (qJr_1)*(atom1.inducedDipolePolar.y) + (qJr_2)*(atom1.inducedDipolePolar.z));
+    real gli1 = atom2.posq.w*sci3 - atom1.posq.w*sci4;
+    real gli6 = sci1;
+    real glip1 = atom2.posq.w*scip3 - atom1.posq.w*scip4;
+    real glip6 = scip1;
+    real gli2 = -sc3*sci4 - sci3*sc4;
+    real gli3 = sci3*sc6 - sci4*sc5;
+    real gli7 = 2*(sci7-sci8);
+    real glip2 = -sc3*scip4 - scip3*sc4;
+    real glip3 = scip3*sc6 - scip4*sc5;
+    real glip7 = 2*(scip7-scip8);
+    real factor3 = rr3*((gli1  +  gli6)*pScale + (glip1  + glip6)*dScale);
+    real factor5 = rr5*((gli2  +  gli7)*pScale + (glip2  + glip7)*dScale);
+    real factor7 = rr7*(gli3*pScale + glip3*dScale);
+    real ftm2i_0 = -0.5f*(factor3*ddsc3_0 + factor5*ddsc5_0 + factor7*ddsc7_0);
+    real ftm2i_1 = -0.5f*(factor3*ddsc3_1 + factor5*ddsc5_1 + factor7*ddsc7_1);
+    real ftm2i_2 = -0.5f*(factor3*ddsc3_2 + factor5*ddsc5_2 + factor7*ddsc7_2);
+    real gl0 = atom1.posq.w*atom2.posq.w;
+    real gl1 = atom2.posq.w*sc3 - atom1.posq.w*sc4;
+    real gl2 = atom1.posq.w*sc6 + atom2.posq.w*sc5 - sc3*sc4;
+    real gl3 = sc3*sc6 - sc4*sc5;
+    real gl4 = sc5*sc6;
+    real gl6 = sc2;
+    real gl7 = 2*(sc7-sc8);
+    real gl8 = 2*sc10;
+    real gl5 = -4*sc9;
+    real gf1 = rr3*gl0 + rr5*(gl1+gl6) + rr7*(gl2+gl7+gl8) + rr9*(gl3+gl5) + rr11*gl4;
+#endif
+#if defined F1 || defined T1
+    real gf2 = -atom2.posq.w*rr3 + sc4*rr5 - sc6*rr7;
+    real gf5 = 2*(-atom2.posq.w*rr5+sc4*rr7-sc6*rr9);
+#endif
+#if defined F1 || defined T3
+    real gf3 =  atom1.posq.w*rr3 + sc3*rr5 + sc5*rr7;
+    real gf6 = 2*(-atom1.posq.w*rr5-sc3*rr7-sc5*rr9);
+#endif
+#ifdef F1
+    real em = mScale*(rr1*gl0 + rr3*(gl1+gl6) + rr5*(gl2+gl7+gl8) + rr7*(gl3+gl5) + rr9*gl4);
+    real ei = 0.5f*((gli1+gli6)*psc3 + (gli2+gli7)*psc5 + gli3*psc7);
+    energy = em+ei;
+#endif
+#if defined F1 || defined T1
+    real qIdJ_0 = atom1.quadrupoleXX*atom2.dipole.x + atom1.quadrupoleXY*atom2.dipole.y + atom1.quadrupoleXZ*atom2.dipole.z;
+    real qIdJ_1 = atom1.quadrupoleXY*atom2.dipole.x + atom1.quadrupoleYY*atom2.dipole.y + atom1.quadrupoleYZ*atom2.dipole.z;
+    real qIdJ_2 = atom1.quadrupoleXZ*atom2.dipole.x + atom1.quadrupoleYZ*atom2.dipole.y + atom1quadrupoleZZ*atom2.dipole.z;
+    real qIqJr_0 = atom1.quadrupoleXX*qJr_0 + atom1.quadrupoleXY*qJr_1 + atom1.quadrupoleXZ*qJr_2;
+    real qIqJr_1 = atom1.quadrupoleXY*qJr_0 + atom1.quadrupoleYY*qJr_1 + atom1.quadrupoleYZ*qJr_2;
+    real qIqJr_2 = atom1.quadrupoleXZ*qJr_0 + atom1.quadrupoleYZ*qJr_1 + atom1quadrupoleZZ*qJr_2;
+#endif
+#ifdef F1
+    real qkqir_0 = atom2.quadrupoleXX*qIr_0 + atom2.quadrupoleXY*qIr_1 + atom2.quadrupoleXZ*qIr_2;
+    real qkqir_1 = atom2.quadrupoleXY*qIr_0 + atom2.quadrupoleYY*qIr_1 + atom2.quadrupoleYZ*qIr_2;
+    real qkqir_2 = atom2.quadrupoleXZ*qIr_0 + atom2.quadrupoleYZ*qIr_1 + atom2quadrupoleZZ*qIr_2;
+    real qkdi_0 = atom2.quadrupoleXX*atom1.dipole.x + atom2.quadrupoleXY*atom1.dipole.y + atom2.quadrupoleXZ*atom1.dipole.z;
+    real qkdi_1 = atom2.quadrupoleXY*atom1.dipole.x + atom2.quadrupoleYY*atom1.dipole.y + atom2.quadrupoleYZ*atom1.dipole.z;
+    real qkdi_2 = atom2.quadrupoleXZ*atom1.dipole.x + atom2.quadrupoleYZ*atom1.dipole.y + atom2quadrupoleZZ*atom1.dipole.z;
+    real ftm2_0 = mScale*(gf1*xr + gf2*atom1.dipole.x + gf3*atom2.dipole.x + 2*rr5*(qkdi_0 - qIdJ_0) + gf5*qIr_0 + gf6*qJr_0 + 4*rr7*(qIqJr_0 + qkqir_0));
+    real ftm2_1 = mScale*(gf1*yr + gf2*atom1.dipole.y + gf3*atom2.dipole.y + 2*rr5*(qkdi_1 - qIdJ_1) + gf5*qIr_1 + gf6*qJr_1 + 4*rr7*(qIqJr_1 + qkqir_1));
+    real ftm2_2 = mScale*(gf1*zr + gf2*atom1.dipole.z + gf3*atom2.dipole.z + 2*rr5*(qkdi_2 - qIdJ_2) + gf5*qIr_2 + gf6*qJr_2 + 4*rr7*(qIqJr_2 + qkqir_2));
+    real gfi1 = rr2*(1.5f*((gli1+gli6)*psc3 + (glip1+glip6)*dsc3 + scip2*scale3i) + 2.5f*((gli7+gli2)*psc5 + (glip7+glip2)*dsc5 - (sci3*scip4+scip3*sci4)*scale5i) + 3.5f*(gli3*psc7+glip3*dsc7));
+    ftm2i_0 += gfi1*xr;
+    ftm2i_1 += gfi1*yr;
+    ftm2i_2 += gfi1*zr;
+#endif
+#if defined F1 || defined T1
+    real gfi5 = (sci4*psc7 + scip4*dsc7);
+#endif
+#if defined F1 || defined T3
+    real gfi6 = -(sci3*psc7 + scip3*dsc7);
+#endif
+#if defined F1 || defined T1
+    real qIuJ_0 = atom1.quadrupoleXX*atom2.inducedDipole.x   + atom1.quadrupoleXY*atom2.inducedDipole.y  + atom1.quadrupoleXZ*atom2.inducedDipole.z;
+    real qIuJ_1 = atom1.quadrupoleXY*atom2.inducedDipole.x   + atom1.quadrupoleYY*atom2.inducedDipole.y  + atom1.quadrupoleYZ*atom2.inducedDipole.z;
+    real qIuJ_2 = atom1.quadrupoleXZ*atom2.inducedDipole.x   + atom1.quadrupoleYZ*atom2.inducedDipole.y  + atom1quadrupoleZZ*atom2.inducedDipole.z;
+    real qIuJp_0 = atom1.quadrupoleXX*atom2.inducedDipolePolar.x + atom1.quadrupoleXY*atom2.inducedDipolePolar.y + atom1.quadrupoleXZ*atom2.inducedDipolePolar.z;
+    real qIuJp_1 = atom1.quadrupoleXY*atom2.inducedDipolePolar.x + atom1.quadrupoleYY*atom2.inducedDipolePolar.y + atom1.quadrupoleYZ*atom2.inducedDipolePolar.z;
+    real qIuJp_2 = atom1.quadrupoleXZ*atom2.inducedDipolePolar.x + atom1.quadrupoleYZ*atom2.inducedDipolePolar.y + atom1quadrupoleZZ*atom2.inducedDipolePolar.z;
+#endif
+#if defined T3
+    real qJuIp_0 = atom2.quadrupoleXX*atom1.inducedDipolePolar.x + atom2.quadrupoleXY*atom1.inducedDipolePolar.y + atom2.quadrupoleXZ*atom1.inducedDipolePolar.z;
+    real qJuIp_1 = atom2.quadrupoleXY*atom1.inducedDipolePolar.x + atom2.quadrupoleYY*atom1.inducedDipolePolar.y + atom2.quadrupoleYZ*atom1.inducedDipolePolar.z;
+    real qJuIp_2 = atom2.quadrupoleXZ*atom1.inducedDipolePolar.x + atom2.quadrupoleYZ*atom1.inducedDipolePolar.y + atom2quadrupoleZZ*atom1.inducedDipolePolar.z;
+     real qJuI_0 = atom2.quadrupoleXX*atom1.inducedDipole.x + atom2.quadrupoleXY*atom1.inducedDipole.y + atom2.quadrupoleXZ*atom1.inducedDipole.z;
+     real qJuI_1 = atom2.quadrupoleXY*atom1.inducedDipole.x + atom2.quadrupoleYY*atom1.inducedDipole.y + atom2.quadrupoleYZ*atom1.inducedDipole.z;
+     real qJuI_2 = atom2.quadrupoleXZ*atom1.inducedDipole.x + atom2.quadrupoleYZ*atom1.inducedDipole.y + atom2quadrupoleZZ*atom1.inducedDipole.z;
+#endif
+#ifdef F1
+    real qkui_0 = atom2.quadrupoleXX*atom1.inducedDipole.x + atom2.quadrupoleXY*atom1.inducedDipole.y + atom2.quadrupoleXZ*atom1.inducedDipole.z;
+    real qkui_1 = atom2.quadrupoleXY*atom1.inducedDipole.x + atom2.quadrupoleYY*atom1.inducedDipole.y + atom2.quadrupoleYZ*atom1.inducedDipole.z;
+    real qkui_2 = atom2.quadrupoleXZ*atom1.inducedDipole.x + atom2.quadrupoleYZ*atom1.inducedDipole.y + atom2quadrupoleZZ*atom1.inducedDipole.z;
+    real qkuip_0 = atom2.quadrupoleXX*atom1.inducedDipolePolar.x + atom2.quadrupoleXY*atom1.inducedDipolePolar.y + atom2.quadrupoleXZ*atom1.inducedDipolePolar.z;
+    real qkuip_1 = atom2.quadrupoleXY*atom1.inducedDipolePolar.x + atom2.quadrupoleYY*atom1.inducedDipolePolar.y + atom2.quadrupoleYZ*atom1.inducedDipolePolar.z;
+    real qkuip_2 = atom2.quadrupoleXZ*atom1.inducedDipolePolar.x + atom2.quadrupoleYZ*atom1.inducedDipolePolar.y + atom2quadrupoleZZ*atom1.inducedDipolePolar.z;
+    ftm2i_0 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.x*psc3 + atom1.inducedDipolePolar.x*dsc3) +
+               sc4*(atom1.inducedDipole.x*psc5 + atom1.inducedDipolePolar.x*dsc5) -
+               sc6*(atom1.inducedDipole.x*psc7 + atom1.inducedDipolePolar.x*dsc7)) +
+               0.5f*(atom1.posq.w*(atom2.inducedDipole.x*psc3+atom2.inducedDipolePolar.x*dsc3) +
+               sc3*(atom2.inducedDipole.x*psc5 +atom2.inducedDipolePolar.x*dsc5) +
+               sc5*(atom2.inducedDipole.x*psc7 +atom2.inducedDipolePolar.x*dsc7)) +
+               scale5i*(sci4*atom1.inducedDipolePolar.x+scip4*atom1.inducedDipole.x +
+                        sci3*atom2.inducedDipolePolar.x+scip3*atom2.inducedDipole.x)*0.5f +
+               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.x +
+               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.x +
+               ((qkui_0-qIuJ_0)*psc5 + (qkuip_0-qIuJp_0)*dsc5) +
+               gfi5*qIr_0 + gfi6*qJr_0;
+    ftm2i_1 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.y*psc3 + atom1.inducedDipolePolar.y*dsc3) +
+               sc4*(atom1.inducedDipole.y*psc5 + atom1.inducedDipolePolar.y*dsc5) -
+               sc6*(atom1.inducedDipole.y*psc7 + atom1.inducedDipolePolar.y*dsc7)) +
+               (atom1.posq.w*(atom2.inducedDipole.y*psc3+atom2.inducedDipolePolar.y*dsc3) +
+                    sc3*(atom2.inducedDipole.y*psc5+atom2.inducedDipolePolar.y*dsc5) +
+                    sc5*(atom2.inducedDipole.y*psc7+atom2.inducedDipolePolar.y*dsc7))*0.5f +
+                    scale5i*(sci4*atom1.inducedDipolePolar.y+scip4*atom1.inducedDipole.y + sci3*atom2.inducedDipolePolar.y+scip3*atom2.inducedDipole.y)*0.5f +
+               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.y +
+               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.y +
+               ((qkui_1-qIuJ_1)*psc5 + (qkuip_1-qIuJp_1)*dsc5) +
+               gfi5*qIr_1 + gfi6*qJr_1;
+    ftm2i_2 += 0.5f*(-atom2.posq.w*(atom1.inducedDipole.z*psc3 + atom1.inducedDipolePolar.z*dsc3) +
+               sc4*(atom1.inducedDipole.z*psc5 + atom1.inducedDipolePolar.z*dsc5) -
+               sc6*(atom1.inducedDipole.z*psc7 + atom1.inducedDipolePolar.z*dsc7)) +
+               (atom1.posq.w*(atom2.inducedDipole.z*psc3+atom2.inducedDipolePolar.z*dsc3) +
+                    sc3*(atom2.inducedDipole.z*psc5+atom2.inducedDipolePolar.z*dsc5) +
+                    sc5*(atom2.inducedDipole.z*psc7+atom2.inducedDipolePolar.z*dsc7))*0.5f +
+                    scale5i*(sci4*atom1.inducedDipolePolar.z+scip4*atom1.inducedDipole.z +
+                    sci3*atom2.inducedDipolePolar.z+scip3*atom2.inducedDipole.z)*0.5f +
+               0.5f*(sci4*psc5+scip4*dsc5)*atom1.dipole.z +
+               0.5f*(sci3*psc5+scip3*dsc5)*atom2.dipole.z +
+               ((qkui_2-qIuJ_2)*psc5 + (qkuip_2-qIuJp_2)*dsc5) +
+               gfi5*qIr_2 + gfi6*qJr_2;
+#ifdef DIRECT_POLARIZATION
+    real gfd = 0.5*(3*rr2*scip2*scale3i - 5*rr2*(scip3*sci4+sci3*scip4)*scale5i);
+    real temp5 = 0.5*scale5i;
+    real fdir_0 = gfd*xr + temp5*(sci4*atom1.inducedDipolePolar.x + scip4*atom1.inducedDipole.x + sci3*atom2.inducedDipolePolar.x + scip3*atom2.inducedDipole.x);
+    real fdir_1 = gfd*yr + temp5*(sci4*atom1.inducedDipolePolar.y + scip4*atom1.inducedDipole.y + sci3*atom2.inducedDipolePolar.y + scip3*atom2.inducedDipole.y);
+    real fdir_2 = gfd*zr + temp5*(sci4*atom1.inducedDipolePolar.z + scip4*atom1.inducedDipole.z + sci3*atom2.inducedDipolePolar.z + scip3*atom2.inducedDipole.z);
+    ftm2i_0 -= fdir_0;
+    ftm2i_1 -= fdir_1;
+    ftm2i_2 -= fdir_2;
+#else
+    real scaleF = 0.5f*uScale;
+    real inducedFactor3 = scip2*rr3*scaleF;
+    real inducedFactor5 = (sci3*scip4+scip3*sci4)*rr5*scaleF;
+    real findmp_0 = inducedFactor3*ddsc3_0 - inducedFactor5*ddsc5_0;
+    real findmp_1 = inducedFactor3*ddsc3_1 - inducedFactor5*ddsc5_1;
+    real findmp_2 = inducedFactor3*ddsc3_2 - inducedFactor5*ddsc5_2;
+    ftm2i_0 -= findmp_0;
+    ftm2i_1 -= findmp_1;
+    ftm2i_2 -= findmp_2;
+#endif
+#endif
+#if defined T1
+    real gti2 = 0.5f*(sci4*psc5+scip4*dsc5);
+    real gti5 = gfi5;
+#endif
+#if defined T3
+    real gti3 = 0.5f*(sci3*psc5+scip3*dsc5);
+    real gti6 = gfi6;
+#endif
+#if defined T1 || defined T3
+    real dixdk_0 = atom1.dipole.y*atom2.dipole.z - atom1.dipole.z*atom2.dipole.y;
+    real dixdk_1 = atom1.dipole.z*atom2.dipole.x - atom1.dipole.x*atom2.dipole.z;
+    real dixdk_2 = atom1.dipole.x*atom2.dipole.y - atom1.dipole.y*atom2.dipole.x;
+#if defined T1
+    real dixuk_0 = atom1.dipole.y*atom2.inducedDipole.z - atom1.dipole.z*atom2.inducedDipole.y;
+    real dixuk_1 = atom1.dipole.z*atom2.inducedDipole.x - atom1.dipole.x*atom2.inducedDipole.z;
+    real dixuk_2 = atom1.dipole.x*atom2.inducedDipole.y - atom1.dipole.y*atom2.inducedDipole.x;
+#endif
+#endif
+#ifdef T1
+    real dixukp_0 = atom1.dipole.y*atom2.inducedDipolePolar.z - atom1.dipole.z*atom2.inducedDipolePolar.y;
+    real dixukp_1 = atom1.dipole.z*atom2.inducedDipolePolar.x - atom1.dipole.x*atom2.inducedDipolePolar.z;
+    real dixukp_2 = atom1.dipole.x*atom2.inducedDipolePolar.y - atom1.dipole.y*atom2.inducedDipolePolar.x;
+#endif
+#ifdef T1
+    real dixr_0 = atom1.dipole.y*zr - atom1.dipole.z*yr;
+    real dixr_1 = atom1.dipole.z*xr - atom1.dipole.x*zr;
+    real dixr_2 = atom1.dipole.x*yr - atom1.dipole.y*xr;
+#endif
+#ifdef T1
+    real rxqiukp_0 = yr*qIuJp_2 - zr*qIuJp_1;
+    real rxqiukp_1 = zr*qIuJp_0 - xr*qIuJp_2;
+    real rxqiukp_2 = xr*qIuJp_1 - yr*qIuJp_0;
+    real rxqir_0 = yr*qIr_2 - zr*qIr_1;
+    real rxqir_1 = zr*qIr_0 - xr*qIr_2;
+    real rxqir_2 = xr*qIr_1 - yr*qIr_0;
+    real rxqiuk_0 = yr*qIuJ_2 - zr*qIuJ_1;
+    real rxqiuk_1 = zr*qIuJ_0 - xr*qIuJ_2;
+    real rxqiuk_2 = xr*qIuJ_1 - yr*qIuJ_0;
+    real ukxqir_0 = atom2.inducedDipole.y*qIr_2 - atom2.inducedDipole.z*qIr_1;
+    real ukxqir_1 = atom2.inducedDipole.z*qIr_0 - atom2.inducedDipole.x*qIr_2;
+    real ukxqir_2 = atom2.inducedDipole.x*qIr_1 - atom2.inducedDipole.y*qIr_0;
+    real ukxqirp_0 = atom2.inducedDipolePolar.y*qIr_2 - atom2.inducedDipolePolar.z*qIr_1;
+    real ukxqirp_1 = atom2.inducedDipolePolar.z*qIr_0 - atom2.inducedDipolePolar.x*qIr_2;
+    real ukxqirp_2 = atom2.inducedDipolePolar.x*qIr_1 - atom2.inducedDipolePolar.y*qIr_0;
+    real dixqkr_0 = atom1.dipole.y*qJr_2 - atom1.dipole.z*qJr_1;
+    real dixqkr_1 = atom1.dipole.z*qJr_0 - atom1.dipole.x*qJr_2;
+    real dixqkr_2 = atom1.dipole.x*qJr_1 - atom1.dipole.y*qJr_0;
+    real dkxqir_0 = atom2.dipole.y*qIr_2 - atom2.dipole.z*qIr_1;
+    real dkxqir_1 = atom2.dipole.z*qIr_0 - atom2.dipole.x*qIr_2;
+    real dkxqir_2 = atom2.dipole.x*qIr_1 - atom2.dipole.y*qIr_0;
+    real rxqikr_0 = yr*qIqJr_2 - zr*qIqJr_1;
+    real rxqikr_1 = zr*qIqJr_0 - xr*qIqJr_2;
+    real rxqikr_2 = xr*qIqJr_1 - yr*qIqJr_0;
+    real rxqidk_0 = yr*qIdJ_2 - zr*qIdJ_1;
+    real rxqidk_1 = zr*qIdJ_0 - xr*qIdJ_2;
+    real rxqidk_2 = xr*qIdJ_1 - yr*qIdJ_0;
+    real qkrxqir_0 = qJr_1*qIr_2 - qJr_2*qIr_1;
+    real qkrxqir_1 = qJr_2*qIr_0 - qJr_0*qIr_2;
+    real qkrxqir_2 = qJr_0*qIr_1 - qJr_1*qIr_0;
+#endif
+#if defined T1 || defined T3
+    real qixqk_0 = atom1.quadrupoleXY*atom2.quadrupoleXZ + atom1.quadrupoleYY*atom2.quadrupoleYZ + atom1.quadrupoleYZ*atom2quadrupoleZZ -
+                   atom1.quadrupoleXZ*atom2.quadrupoleXY - atom1.quadrupoleYZ*atom2.quadrupoleYY - atom1quadrupoleZZ*atom2.quadrupoleYZ;
+    real qixqk_1 = atom1.quadrupoleXZ*atom2.quadrupoleXX + atom1.quadrupoleYZ*atom2.quadrupoleXY + atom1quadrupoleZZ*atom2.quadrupoleXZ -
+                   atom1.quadrupoleXX*atom2.quadrupoleXZ - atom1.quadrupoleXY*atom2.quadrupoleYZ - atom1.quadrupoleXZ*atom2quadrupoleZZ;
+    real qixqk_2 = atom1.quadrupoleXX*atom2.quadrupoleXY + atom1.quadrupoleXY*atom2.quadrupoleYY + atom1.quadrupoleXZ*atom2.quadrupoleYZ -
+                   atom1.quadrupoleXY*atom2.quadrupoleXX - atom1.quadrupoleYY*atom2.quadrupoleXY - atom1.quadrupoleYZ*atom2.quadrupoleXZ;
+#endif
+#ifdef T1
+    real ttm2_0 = -rr3*dixdk_0 + gf2*dixr_0-gf5*rxqir_0 + 2*rr5*(dixqkr_0 + dkxqir_0 + rxqidk_0-2*qixqk_0) - 4*rr7*(rxqikr_0 + qkrxqir_0);
+    real ttm2_1 = -rr3*dixdk_1 + gf2*dixr_1-gf5*rxqir_1 + 2*rr5*(dixqkr_1 + dkxqir_1 + rxqidk_1-2*qixqk_1) - 4*rr7*(rxqikr_1 + qkrxqir_1);
+    real ttm2_2 = -rr3*dixdk_2 + gf2*dixr_2-gf5*rxqir_2 + 2*rr5*(dixqkr_2 + dkxqir_2 + rxqidk_2-2*qixqk_2) - 4*rr7*(rxqikr_2 + qkrxqir_2);
+    real ttm2i_0 = -(dixuk_0*psc3+dixukp_0*dsc3)*0.5f + gti2*dixr_0 + ((ukxqir_0+ rxqiuk_0)*psc5 + (ukxqirp_0 + rxqiukp_0)*dsc5) - gti5*rxqir_0;
+    real ttm2i_1 = -(dixuk_1*psc3+dixukp_1*dsc3)*0.5f + gti2*dixr_1 + ((ukxqir_1+ rxqiuk_1)*psc5 + (ukxqirp_1 + rxqiukp_1)*dsc5) - gti5*rxqir_1;
+    real ttm2i_2 = -(dixuk_2*psc3+dixukp_2*dsc3)*0.5f + gti2*dixr_2 + ((ukxqir_2+ rxqiuk_2)*psc5 + (ukxqirp_2 + rxqiukp_2)*dsc5) - gti5*rxqir_2;
+#endif
+#ifdef T3
+    real qJqIr_0 = atom2.quadrupoleXX*qIr_0 + atom2.quadrupoleXY*qIr_1 + atom2.quadrupoleXZ*qIr_2;
+    real qJqIr_1 = atom2.quadrupoleXY*qIr_0 + atom2.quadrupoleYY*qIr_1 + atom2.quadrupoleYZ*qIr_2;
+    real qJqIr_2 = atom2.quadrupoleXZ*qIr_0 + atom2.quadrupoleYZ*qIr_1 + atom2quadrupoleZZ*qIr_2;
+    real qJdI_0 = atom2.quadrupoleXX*atom1.dipole.x + atom2.quadrupoleXY*atom1.dipole.y + atom2.quadrupoleXZ*atom1.dipole.z;
+    real qJdI_1 = atom2.quadrupoleXY*atom1.dipole.x + atom2.quadrupoleYY*atom1.dipole.y + atom2.quadrupoleYZ*atom1.dipole.z;
+    real qJdI_2 = atom2.quadrupoleXZ*atom1.dipole.x + atom2.quadrupoleYZ*atom1.dipole.y + atom2quadrupoleZZ*atom1.dipole.z;
+    real dkxr_0 = atom2.dipole.y*zr - atom2.dipole.z*yr;
+    real dkxr_1 = atom2.dipole.z*xr - atom2.dipole.x*zr;
+    real dkxr_2 = atom2.dipole.x*yr - atom2.dipole.y*xr;
+    real rxqkr_0 = yr*qJr_2 - zr*qJr_1;
+    real rxqkr_1 = zr*qJr_0 - xr*qJr_2;
+    real rxqkr_2 = xr*qJr_1 - yr*qJr_0;
+    real dixqkr_0 = atom1.dipole.y*qJr_2 - atom1.dipole.z*qJr_1;
+    real dixqkr_1 = atom1.dipole.z*qJr_0 - atom1.dipole.x*qJr_2;
+    real dixqkr_2 = atom1.dipole.x*qJr_1 - atom1.dipole.y*qJr_0;
+    real dkxqir_0 = atom2.dipole.y*qIr_2 - atom2.dipole.z*qIr_1;
+    real dkxqir_1 = atom2.dipole.z*qIr_0 - atom2.dipole.x*qIr_2;
+    real dkxqir_2 = atom2.dipole.x*qIr_1 - atom2.dipole.y*qIr_0;
+    real rxqkdi_0 = yr*qJdI_2 - zr*qJdI_1;
+    real rxqkdi_1 = zr*qJdI_0 - xr*qJdI_2;
+    real rxqkdi_2 = xr*qJdI_1 - yr*qJdI_0;
+    real rxqkir_0 = yr*qJqIr_2 - zr*qJqIr_1;
+    real rxqkir_1 = zr*qJqIr_0 - xr*qJqIr_2;
+    real rxqkir_2 = xr*qJqIr_1 - yr*qJqIr_0;
+    real qkrxqir_0 = qJr_1*qIr_2 - qJr_2*qIr_1;
+    real qkrxqir_1 = qJr_2*qIr_0 - qJr_0*qIr_2;
+    real qkrxqir_2 = qJr_0*qIr_1 - qJr_1*qIr_0;
+    real dkxui_0 = atom2.dipole.y*atom1.inducedDipole.z - atom2.dipole.z*atom1.inducedDipole.y;
+    real dkxui_1 = atom2.dipole.z*atom1.inducedDipole.x - atom2.dipole.x*atom1.inducedDipole.z; 
+    real dkxui_2 = atom2.dipole.x*atom1.inducedDipole.y - atom2.dipole.y*atom1.inducedDipole.x;
+    real dkxuip_0 = atom2.dipole.y*atom1.inducedDipolePolar.z - atom2.dipole.z*atom1.inducedDipolePolar.y;
+    real dkxuip_1 = atom2.dipole.z*atom1.inducedDipolePolar.x - atom2.dipole.x*atom1.inducedDipolePolar.z;
+    real dkxuip_2 = atom2.dipole.x*atom1.inducedDipolePolar.y - atom2.dipole.y*atom1.inducedDipolePolar.x;
+    real uixqkrp_0 = atom1.inducedDipolePolar.y*qJr_2 - atom1.inducedDipolePolar.z*qJr_1;
+    real uixqkrp_1 = atom1.inducedDipolePolar.z*qJr_0 - atom1.inducedDipolePolar.x*qJr_2;
+    real uixqkrp_2 = atom1.inducedDipolePolar.x*qJr_1 - atom1.inducedDipolePolar.y*qJr_0;
+    real uixqkr_0 = atom1.inducedDipole.y*qJr_2 - atom1.inducedDipole.z*qJr_1;
+    real uixqkr_1 = atom1.inducedDipole.z*qJr_0 - atom1.inducedDipole.x*qJr_2;
+    real uixqkr_2 = atom1.inducedDipole.x*qJr_1 - atom1.inducedDipole.y*qJr_0;
+    real rxqkuip_0 = yr*qJuIp_2 - zr*qJuIp_1;
+    real rxqkuip_1 = zr*qJuIp_0 - xr*qJuIp_2;
+    real rxqkuip_2 = xr*qJuIp_1 - yr*qJuIp_0;
+    real rxqkui_0 = yr*qJuI_2 - zr*qJuI_1;
+    real rxqkui_1 = zr*qJuI_0 - xr*qJuI_2;
+    real rxqkui_2 = xr*qJuI_1 - yr*qJuI_0;
+    real ttm3_0 =  rr3*dixdk_0 + gf3*dkxr_0 - gf6*rxqkr_0 - 2*rr5*(dixqkr_0 + dkxqir_0 + rxqkdi_0 - 2*qixqk_0) - 4*rr7*(rxqkir_0 - qkrxqir_0);
+    real ttm3_1 =  rr3*dixdk_1 + gf3*dkxr_1 - gf6*rxqkr_1 - 2*rr5*(dixqkr_1 + dkxqir_1 + rxqkdi_1 - 2*qixqk_1) - 4*rr7*(rxqkir_1 - qkrxqir_1);
+    real ttm3_2 =  rr3*dixdk_2 + gf3*dkxr_2 - gf6*rxqkr_2 - 2*rr5*(dixqkr_2 + dkxqir_2 + rxqkdi_2 - 2*qixqk_2) - 4*rr7*(rxqkir_2 - qkrxqir_2);
+    real ttm3i_0 = -(dkxui_0*psc3+ dkxuip_0*dsc3)*0.5f + gti3*dkxr_0 - ((uixqkr_0 + rxqkui_0)*psc5 + (uixqkrp_0 + rxqkuip_0)*dsc5) - gti6*rxqkr_0;
+    real ttm3i_1 = -(dkxui_1*psc3+ dkxuip_1*dsc3)*0.5f + gti3*dkxr_1 - ((uixqkr_1 + rxqkui_1)*psc5 + (uixqkrp_1 + rxqkuip_1)*dsc5) - gti6*rxqkr_1;
+    real ttm3i_2 = -(dkxui_2*psc3+ dkxuip_2*dsc3)*0.5f + gti3*dkxr_2 - ((uixqkr_2 + rxqkui_2)*psc5 + (uixqkrp_2 + rxqkuip_2)*dsc5) - gti6*rxqkr_2;
+#endif
+    if (mScale < 1) {
+#ifdef T1
+        ttm2_0 *= mScale;
+        ttm2_1 *= mScale;
+        ttm2_2 *= mScale;
+#endif
+#ifdef T3
+        ttm3_0 *= mScale;
+        ttm3_1 *= mScale;
+        ttm3_2 *= mScale;
+#endif
+    }
+#ifdef F1
+    outputForce.x = -(ftm2_0+ftm2i_0);
+    outputForce.y = -(ftm2_1+ftm2i_1);
+    outputForce.z = -(ftm2_2+ftm2i_2);
+#endif
+#ifdef T1
+    outputForce.x = (ttm2_0 + ttm2i_0);
+    outputForce.y = (ttm2_1 + ttm2i_1);
+    outputForce.z = (ttm2_2 + ttm2i_2);
+#endif
+#ifdef T3
+    outputForce.x = (ttm3_0 + ttm3i_0);
+    outputForce.y = (ttm3_1 + ttm3i_1);
+    outputForce.z = (ttm3_2 + ttm3i_2);
+#endif
+}
--- a/plugins/amoeba/platforms/cuda2/src/kernels/multipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/multipoleElectrostatics.cu
+#define TILE_SIZE 32
+#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
+typedef struct {
+    real4 posq;
+    real3 force, dipole, inducedDipole, inducedDipolePolar;
+    real quadrupoleXX, quadrupoleXY, quadrupoleXZ;
+    real quadrupoleYY, quadrupoleYZ;
+    float thole, damp;
+} AtomData;
+__device__ void computeOneInteractionF1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real& energy, real3& outputForce);
+__device__ void computeOneInteractionT1(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce);
+__device__ void computeOneInteractionT3(AtomData& atom1, volatile AtomData& atom2, float dScale, float pScale, float mScale, real3& outputForce);
+inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ labFrameDipole,
+        const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
+    data.posq = posq[atom];
+    data.dipole.x = labFrameDipole[atom*3];
+    data.dipole.y = labFrameDipole[atom*3+1];
+    data.dipole.z = labFrameDipole[atom*3+2];
+    data.quadrupoleXX = labFrameQuadrupole[atom*5];
+    data.quadrupoleXY = labFrameQuadrupole[atom*5+1];
+    data.quadrupoleXZ = labFrameQuadrupole[atom*5+2];
+    data.quadrupoleYY = labFrameQuadrupole[atom*5+3];
+    data.quadrupoleYZ = labFrameQuadrupole[atom*5+4];
+    data.inducedDipole.x = inducedDipole[atom*3];
+    data.inducedDipole.y = inducedDipole[atom*3+1];
+    data.inducedDipole.z = inducedDipole[atom*3+2];
+    data.inducedDipolePolar.x = inducedDipolePolar[atom*3];
+    data.inducedDipolePolar.y = inducedDipolePolar[atom*3+1];
+    data.inducedDipolePolar.z = inducedDipolePolar[atom*3+2];
+    float2 temp = dampingAndThole[atom];
+    data.damp = temp.x;
+    data.thole = temp.y;
+}
+__device__ real computeDScaleFactor(unsigned int polarizationGroup) {
+    return (polarizationGroup & 1 ? 0 : 1);
+}
+__device__ float computeMScaleFactor(uint2 covalent) {
+    bool x = (covalent.x & 1);
+    bool y = (covalent.y & 1);
+    return (x ? (y ? 0.0f : 0.4f) : (y ? 0.8f : 1.0f));
+}
+__device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGroup) {
+    bool x = (covalent.x & 1);
+    bool y = (covalent.y & 1);
+    bool p = (polarizationGroup & 1);
+    return (x && y ? 0.0f : (x && p ? 0.5f : 1.0f));
+}
+/**
+ * Compute electrostatic interactions.
+ */
+extern "C" __global__ void computeElectrostatics(
+        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
+        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
+        const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices,
+#ifdef USE_CUTOFF
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
+#endif
+        const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
+        const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
+    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+#ifdef USE_CUTOFF
+    const unsigned int numTiles = interactionCount[0];
+    unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
+    unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
+#else
+    const unsigned int numTiles = numTileIndices;
+    unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
+    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
+#endif
+    real energy = 0;
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
+    __shared__ int exclusionIndex[WARPS_PER_GROUP];
+#ifndef ENABLE_SHUFFLE
+    __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
+#endif
+    do {
+        // Extract the coordinates of this tile
+        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+        const unsigned int tbx = threadIdx.x - tgx;
+        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
+        unsigned int x, y;
+        AtomData data;
+        if (pos < end) {
+#ifdef USE_CUTOFF
+            if (numTiles <= maxTiles) {
+                ushort2 tileIndices = tiles[pos];
+                x = tileIndices.x;
+                y = tileIndices.y;
+            }
+            else
+#endif
+            {
+                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                    y += (x < y ? -1 : 1);
+                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                }
+            }
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+            data.force = make_real3(0);
+            // Locate the exclusion data for this tile.
+            if (tgx < 2)
+                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
+            if (tgx == 0)
+                exclusionIndex[localGroupIndex] = -1;
+            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
+                if (exclusionIndices[i] == y)
+                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
+            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
+            if (pos >= end)
+                ; // This warp is done.
+            else if (x == y) {
+                // This tile is on the diagonal.
+                localData[threadIdx.x].posq = data.posq;
+                localData[threadIdx.x].dipole = data.dipole;
+                localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
+                localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
+                localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
+                localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
+                localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
+                localData[threadIdx.x].inducedDipole = data.inducedDipole;
+                localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
+                localData[threadIdx.x].thole = data.thole; // IS THIS CORRECT?
+                localData[threadIdx.x].damp = data.damp; // IS THIS CORRECT?
+                uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
+                unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
+                // Compute forces.
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = y*TILE_SIZE+j;
+                    if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                        real3 tempForce;
+                        real tempEnergy;
+                        float d = computeDScaleFactor(polarizationGroup);
+                        float p = computePScaleFactor(covalent, polarizationGroup);
+                        float m = computeMScaleFactor(covalent);
+                        computeOneInteractionF1(data, localData[tbx+j], d, p, m, tempEnergy, tempForce);
+                        data.force += tempForce;
+                        energy += 0.5f*tempEnergy;
+                    }
+                    covalent.x >>= 1;
+                    covalent.y >>= 1;
+                    polarizationGroup >>= 1;
+                }
+                data.force *= ENERGY_SCALE_FACTOR;
+                atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0xFFFFFFFF)));
+                atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0xFFFFFFFF)));
+                atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0xFFFFFFFF)));
+                // Compute torques.
+                data.force = make_real3(0);
+                covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
+                polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = y*TILE_SIZE+j;
+                    if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                        real3 tempForce;
+                        float d = computeDScaleFactor(polarizationGroup);
+                        float p = computePScaleFactor(covalent, polarizationGroup);
+                        float m = computeMScaleFactor(covalent);
+                        computeOneInteractionT1(data, localData[tbx+j], d, p, m, tempForce);
+                        data.force += tempForce;
+                    }
+                    covalent.x >>= 1;
+                    covalent.y >>= 1;
+                    polarizationGroup >>= 1;
+                }
+                data.force *= ENERGY_SCALE_FACTOR;
+                atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0xFFFFFFFF)));
+                atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0xFFFFFFFF)));
+                atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0xFFFFFFFF)));
+            }
+            else {
+                // This is an off-diagonal tile.
+                unsigned int j = y*TILE_SIZE + tgx;
+                loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
+                localData[threadIdx.x].force = make_real3(0);
+#ifdef USE_CUTOFF
+                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+                if (!hasExclusions && flags != 0xFFFFFFFF) {
+                    if (flags == 0) {
+                        // No interactions in this tile.
+                    }
+                    else {
+                        // Compute only a subset of the interactions in this tile.
+                        for (j = 0; j < TILE_SIZE; j++) {
+                            if ((flags&(1<<j)) != 0) {
+                                int atom2 = tbx+j;
+                                int bufferIndex = 3*threadIdx.x;
+                                real3 dEdR1 = make_real3(0);
+                                real3 dEdR2 = make_real3(0);
+                                real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
+#ifdef USE_PERIODIC
+                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                                if (r2 < CUTOFF_SQUARED) {
+#endif
+                                    real invR = RSQRT(r2);
+                                    real r = RECIP(invR);
+                                    LOAD_ATOM2_PARAMETERS
+                                    atom2 = y*TILE_SIZE+j;
+                                    COMPUTE_INTERACTION
+#ifdef USE_CUTOFF
+                                }
+#endif
+#ifdef ENABLE_SHUFFLE
+                                force.x -= dEdR1.x;
+                                force.y -= dEdR1.y;
+                                force.z -= dEdR1.z;
+                                for (int i = 16; i >= 1; i /= 2) {
+                                    dEdR2.x += __shfl_xor(dEdR2.x, i, 32);
+                                    dEdR2.y += __shfl_xor(dEdR2.y, i, 32);
+                                    dEdR2.z += __shfl_xor(dEdR2.z, i, 32);
+                                }
+                                if (tgx == 0) {
+                                    localData[tbx+j].fx += dEdR2.x;
+                                    localData[tbx+j].fy += dEdR2.y;
+                                    localData[tbx+j].fz += dEdR2.z;
+                                }
+#else
+                                force.x -= dEdR1.x;
+                                force.y -= dEdR1.y;
+                                force.z -= dEdR1.z;
+                                tempBuffer[bufferIndex] = dEdR2.x;
+                                tempBuffer[bufferIndex+1] = dEdR2.y;
+                                tempBuffer[bufferIndex+2] = dEdR2.z;
+                                // Sum the forces on atom2.
+                                if (tgx % 4 == 0) {
+                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
+                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
+                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
+                                }
+                                if (tgx == 0) {
+                                    localData[tbx+j].fx += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
+                                    localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
+                                    localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
+                                }
+#endif
+                            }
+                        }
+                    }
+                }
+                else
+#endif
+                {
+                    // Compute the full set of interactions in this tile.
+                    uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
+                    unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
+                    covalent.x = (covalent.x >> tgx) | (covalent.x << (TILE_SIZE - tgx));
+                    covalent.y = (covalent.y >> tgx) | (covalent.y << (TILE_SIZE - tgx));
+                    polarizationGroup = (polarizationGroup >> tgx) | (polarizationGroup << (TILE_SIZE - tgx));
+                    // Compute forces.
+                    unsigned int tj = tgx;
+                    for (j = 0; j < TILE_SIZE; j++) {
+                        int atom2 = y*TILE_SIZE+tj;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            real3 tempForce;
+                            real tempEnergy;
+                            float d = computeDScaleFactor(polarizationGroup);
+                            float p = computePScaleFactor(covalent, polarizationGroup);
+                            float m = computeMScaleFactor(covalent);
+                            computeOneInteractionF1(data, localData[tbx+tj], d, p, m, tempEnergy, tempForce);
+                            data.force += tempForce;
+                            localData[tbx+tj].force -= tempForce;
+                            energy += tempEnergy;
+                        }
+                        covalent.x >>= 1;
+                        covalent.y >>= 1;
+                        polarizationGroup >>= 1;
+                        tj = (tj + 1) & (TILE_SIZE - 1);
+                    }
+                    data.force *= ENERGY_SCALE_FACTOR;
+                    localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
+                    if (pos < end) {
+                        unsigned int offset = x*TILE_SIZE + tgx;
+                        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0xFFFFFFFF)));
+                        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0xFFFFFFFF)));
+                        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0xFFFFFFFF)));
+                        offset = y*TILE_SIZE + tgx;
+                        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0xFFFFFFFF)));
+                        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0xFFFFFFFF)));
+                        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0xFFFFFFFF)));
+                    }
+                    // Compute torques.
+                    covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
+                    polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
+                    covalent.x = (covalent.x >> tgx) | (covalent.x << (TILE_SIZE - tgx));
+                    covalent.y = (covalent.y >> tgx) | (covalent.y << (TILE_SIZE - tgx));
+                    polarizationGroup = (polarizationGroup >> tgx) | (polarizationGroup << (TILE_SIZE - tgx));
+                    for (j = 0; j < TILE_SIZE; j++) {
+                        int atom2 = y*TILE_SIZE+tj;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            real3 tempForce;
+                            float d = computeDScaleFactor(polarizationGroup);
+                            float p = computePScaleFactor(covalent, polarizationGroup);
+                            float m = computeMScaleFactor(covalent);
+                            computeOneInteractionT1(data, localData[tbx+tj], d, p, m, tempForce);
+                            data.force += tempForce;
+                            computeOneInteractionT3(data, localData[tbx+tj], d, p, m, tempForce);
+                            localData[tbx+tj].force += tempForce;
+                        }
+                        covalent.x >>= 1;
+                        covalent.y >>= 1;
+                        polarizationGroup >>= 1;
+                        tj = (tj + 1) & (TILE_SIZE - 1);
+                    }
+                    data.force *= ENERGY_SCALE_FACTOR;
+                    localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
+                    if (pos < end) {
+                        unsigned int offset = x*TILE_SIZE + tgx;
+                        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0xFFFFFFFF)));
+                        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0xFFFFFFFF)));
+                        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0xFFFFFFFF)));
+                        offset = y*TILE_SIZE + tgx;
+                        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0xFFFFFFFF)));
+                        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0xFFFFFFFF)));
+                        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0xFFFFFFFF)));
+                    }
+                }
+            }
+        }
+        pos++;
+    } while (pos < end);
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
--- a/plugins/amoeba/platforms/cuda2/src/kernels/multipoleFixedField.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/multipoleFixedField.cu
@@ -14,12 +14,12 @@ inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __res
    data.dipole.x = labFrameDipole[atom*3];
    data.dipole.y = labFrameDipole[atom*3+1];
    data.dipole.z = labFrameDipole[atom*3+2];
-    data.quadrupoleXX = labFrameQuadrupole[atom*9];
+    data.quadrupoleXX = labFrameQuadrupole[atom*5];
-    data.quadrupoleXY = labFrameQuadrupole[atom*9+1];
+    data.quadrupoleXY = labFrameQuadrupole[atom*5+1];
-    data.quadrupoleXZ = labFrameQuadrupole[atom*9+2];
+    data.quadrupoleXZ = labFrameQuadrupole[atom*5+2];
-    data.quadrupoleYY = labFrameQuadrupole[atom*9+4];
+    data.quadrupoleYY = labFrameQuadrupole[atom*5+3];
-    data.quadrupoleYZ = labFrameQuadrupole[atom*9+5];
+    data.quadrupoleYZ = labFrameQuadrupole[atom*5+4];
-    data.quadrupoleZZ = labFrameQuadrupole[atom*9+8];
+    data.quadrupoleZZ = 1-data.quadrupoleXX-data.quadrupoleYY;
    float2 temp = dampingAndThole[atom];
    data.damp = temp.x;
    data.thole = temp.y;
@@ -38,7 +38,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
    float damp = atom1.damp*atom2.damp;
    real dampExp;
-    if (damp != 0 && r < SCALING_DISTANCE_CUTOFF) {
+    if (damp != 0) {
        // get scaling factors
@@ -83,7 +83,7 @@ __device__ real computeDScaleFactor(unsigned int polarizationGroup) {
    return (polarizationGroup & 1 ? 0 : 1);
 }
-__device__ float computeDScaleFactor(uint2 covalent) {
+__device__ float computeMScaleFactor(uint2 covalent) {
 //        int f1 = (value == 0 || value == 1 ? 1 : 0);
 //        int f2 = (value == 0 || value == 2 ? 1 : 0);
    // 0 = 12 or 13: x and y: 0
@@ -106,7 +106,7 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
 */
 extern "C" __global__ void computeFixedField(
        unsigned long long* __restrict__ fieldBuffers, unsigned long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
-        const unsigned int* __restrict__ exclusions, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
+        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
        const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices,
 #ifdef USE_CUTOFF
        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
@@ -185,11 +185,9 @@ extern "C" __global__ void computeFixedField(
                localData[localAtomIndex].quadrupoleZZ = data.quadrupoleZZ;
                localData[localAtomIndex].thole = data.thole; // IS THIS CORRECT?
                localData[localAtomIndex].damp = data.damp; // IS THIS CORRECT?
-                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
                uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
                unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    bool isExcluded = !(excl & 0x1);
                    int atom2 = tbx+j;
                    real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
 #ifdef USE_PERIODIC
@@ -200,13 +198,13 @@ extern "C" __global__ void computeFixedField(
                    real3 field1;
                    real3 field2;
                    computeOneInteraction(data, localData[atom2], delta, field1, field2);
-                    if (!isExcluded) {
+                    atom2 = y*TILE_SIZE+j;
-                        float d = computeDScaleFactor(covalent);
+                    if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                        float d = computeDScaleFactor(polarizationGroup);
                        data.field += d*field1;
                        float p = computePScaleFactor(covalent, polarizationGroup);
                        data.fieldPolar += p*field1;
                    }
-                    excl >>= 1;
                    covalent.x >>= 1;
                    covalent.y >>= 1;
                    polarizationGroup >>= 1;
@@ -231,7 +229,6 @@ extern "C" __global__ void computeFixedField(
                        for (j = 0; j < TILE_SIZE; j++) {
                            if ((flags&(1<<j)) != 0) {
-                                bool isExcluded = false;
                                int atom2 = tbx+j;
                                int bufferIndex = 3*threadIdx.x;
                                real3 dEdR1 = make_real3(0);
@@ -298,16 +295,13 @@ extern "C" __global__ void computeFixedField(
                {
                    // Compute the full set of interactions in this tile.
-                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
                    uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
                    unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
-                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
                    covalent.x = (covalent.x >> tgx) | (covalent.x << (TILE_SIZE - tgx));
                    covalent.y = (covalent.y >> tgx) | (covalent.y << (TILE_SIZE - tgx));
                    polarizationGroup = (polarizationGroup >> tgx) | (polarizationGroup << (TILE_SIZE - tgx));
                    unsigned int tj = tgx;
                    for (j = 0; j < TILE_SIZE; j++) {
-                        bool isExcluded = !(excl & 0x1);
                        int atom2 = tbx+tj;
                        real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
 #ifdef USE_PERIODIC
@@ -318,15 +312,14 @@ extern "C" __global__ void computeFixedField(
                        real3 field1;
                        real3 field2;
                        computeOneInteraction(data, localData[atom2], delta, field1, field2);
-                        if (!isExcluded) {
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                            float d = computeDScaleFactor(covalent);
+                            float d = computeDScaleFactor(polarizationGroup);
                            data.field += d*field1;
                            localData[atom2].field += d*field2;
                            float p = computePScaleFactor(covalent, polarizationGroup);
                            data.fieldPolar += p*field1;
                            localData[atom2].fieldPolar += p*field2;
                        }
-                        excl >>= 1;
                        covalent.x >>= 1;
                        covalent.y >>= 1;
                        polarizationGroup >>= 1;

--- a/plugins/amoeba/platforms/cuda2/src/kernels/multipoles.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/multipoles.cu
@@ -8,10 +8,10 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
    // code common to ZThenX and Bisector
-    for (int particleIndex = blockIdx.x*blockDim.x+threadIdx.x; particleIndex < NUM_ATOMS; particleIndex += gridDim.x*blockDim.x) {
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += gridDim.x*blockDim.x) {
-        int4 particles = multipoleParticles[particleIndex];
+        int4 particles = multipoleParticles[atom];
        if (particles.x >= 0 && particles.z >= 0) {
-            real4 thisParticlePos = posq[particleIndex];
+            real4 thisParticlePos = posq[atom];
            real4 posZ = posq[particles.z];
            real3 vectorZ = make_real3(posZ.x-thisParticlePos.x, posZ.y-thisParticlePos.y, posZ.z-thisParticlePos.z);
            real4 posX = posq[particles.x];
@@ -149,7 +149,7 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
            // Transform the dipole
-            unsigned int offset = 3*particleIndex;
+            unsigned int offset = 3*atom;
            real molDipole[3];
            molDipole[0] = molecularDipoles[offset];
            molDipole[1] = molecularDipoles[offset+1];
@@ -164,55 +164,47 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
            // Transform the quadrupole
-            real mPole[3][3];
+            offset = 5*atom;
-            offset = 9*particleIndex;
+            real mPoleXX = molecularQuadrupoles[offset];
-            mPole[0][0] = molecularQuadrupoles[offset];
+            real mPoleXY = molecularQuadrupoles[offset+1];
-            mPole[0][1] = molecularQuadrupoles[offset+1];
+            real mPoleXZ = molecularQuadrupoles[offset+2];
-            mPole[0][2] = molecularQuadrupoles[offset+2];
+            real mPoleYY = molecularQuadrupoles[offset+3];
+            real mPoleYZ = molecularQuadrupoles[offset+4];
-            mPole[1][0] = molecularQuadrupoles[offset+3];
+            real mPoleZZ = 1-mPoleXX-mPoleYY;
-            mPole[1][1] = molecularQuadrupoles[offset+4];
-            mPole[1][2] = molecularQuadrupoles[offset+5];
-            mPole[2][0] = molecularQuadrupoles[offset+6];
-            mPole[2][1] = molecularQuadrupoles[offset+7];
-            mPole[2][2] = molecularQuadrupoles[offset+8];
            if (reverse) {
-                mPole[0][1] *= -1;
+                mPoleXY *= -1;
-                mPole[1][0] *= -1;
+                mPoleYZ *= -1;
-                mPole[1][2] *= -1;
-                mPole[2][1] *= -1;
            }
-            labFrameQuadrupoles[offset+8] = vectorX.z*(vectorX.z*mPole[0][0] + vectorY.z*mPole[0][1] + vectorZ.z*mPole[0][2]);
+            labFrameQuadrupoles[offset] = vectorX.x*(vectorX.x*mPoleXX + vectorY.x*mPoleXY + vectorZ.x*mPoleXZ);
-            labFrameQuadrupoles[offset+8] += vectorY.z*(vectorX.z*mPole[1][0] + vectorY.z*mPole[1][1] + vectorZ.z*mPole[1][2]);
+                                        + vectorY.x*(vectorX.x*mPoleXY + vectorY.x*mPoleYY + vectorZ.x*mPoleYZ);
-            labFrameQuadrupoles[offset+8] += vectorZ.z*(vectorX.z*mPole[2][0] + vectorY.z*mPole[2][1] + vectorZ.z*mPole[2][2]);
+                                        + vectorZ.x*(vectorX.x*mPoleXZ + vectorY.x*mPoleYZ + vectorZ.x*mPoleZZ);
+            labFrameQuadrupoles[offset+1] = vectorX.x*(vectorX.y*mPoleXX + vectorY.y*mPoleXY + vectorZ.y*mPoleXZ);
-            labFrameQuadrupoles[offset+4] = vectorX.y*(vectorX.y*mPole[0][0] + vectorY.y*mPole[0][1] + vectorZ.y*mPole[0][2]);
+                                        + vectorY.x*(vectorX.y*mPoleXY + vectorY.y*mPoleYY + vectorZ.y*mPoleYZ);
-            labFrameQuadrupoles[offset+4] += vectorY.y*(vectorX.y*mPole[1][0] + vectorY.y*mPole[1][1] + vectorZ.y*mPole[1][2]);
+                                        + vectorZ.x*(vectorX.y*mPoleXZ + vectorY.y*mPoleYZ + vectorZ.y*mPoleZZ);
-            labFrameQuadrupoles[offset+4] += vectorZ.y*(vectorX.y*mPole[2][0] + vectorY.y*mPole[2][1] + vectorZ.y*mPole[2][2]);
+            labFrameQuadrupoles[offset+2] = vectorX.x*(vectorX.z*mPoleXX + vectorY.z*mPoleXY + vectorZ.z*mPoleXZ);
+                                        + vectorY.x*(vectorX.z*mPoleXY + vectorY.z*mPoleYY + vectorZ.z*mPoleYZ);
-            labFrameQuadrupoles[offset+5] = vectorX.y*(vectorX.z*mPole[0][0] + vectorY.z*mPole[0][1] + vectorZ.z*mPole[0][2]);
+                                        + vectorZ.x*(vectorX.z*mPoleXZ + vectorY.z*mPoleYZ + vectorZ.z*mPoleZZ);
-            labFrameQuadrupoles[offset+5] += vectorY.y*(vectorX.z*mPole[1][0] + vectorY.z*mPole[1][1] + vectorZ.z*mPole[1][2]);
+            labFrameQuadrupoles[offset+3] = vectorX.y*(vectorX.y*mPoleXX + vectorY.y*mPoleXY + vectorZ.y*mPoleXZ);
-            labFrameQuadrupoles[offset+5] += vectorZ.y*(vectorX.z*mPole[2][0] + vectorY.z*mPole[2][1] + vectorZ.z*mPole[2][2]);
+                                        + vectorY.y*(vectorX.y*mPoleXY + vectorY.y*mPoleYY + vectorZ.y*mPoleYZ);
+                                        + vectorZ.y*(vectorX.y*mPoleXZ + vectorY.y*mPoleYZ + vectorZ.y*mPoleZZ);
-            labFrameQuadrupoles[offset] = vectorX.x*(vectorX.x*mPole[0][0] + vectorY.x*mPole[0][1] + vectorZ.x*mPole[0][2]);
+            labFrameQuadrupoles[offset+4] = vectorX.y*(vectorX.z*mPoleXX + vectorY.z*mPoleXY + vectorZ.z*mPoleXZ);
-            labFrameQuadrupoles[offset] += vectorY.x*(vectorX.x*mPole[1][0] + vectorY.x*mPole[1][1] + vectorZ.x*mPole[1][2]);
+                                        + vectorY.y*(vectorX.z*mPoleXY + vectorY.z*mPoleYY + vectorZ.z*mPoleYZ);
-            labFrameQuadrupoles[offset] += vectorZ.x*(vectorX.x*mPole[2][0] + vectorY.x*mPole[2][1] + vectorZ.x*mPole[2][2]);
+                                        + vectorZ.y*(vectorX.z*mPoleXZ + vectorY.z*mPoleYZ + vectorZ.z*mPoleZZ);
-            labFrameQuadrupoles[offset+1] = vectorX.x*(vectorX.y*mPole[0][0] + vectorY.y*mPole[0][1] + vectorZ.y*mPole[0][2]);
-            labFrameQuadrupoles[offset+1] += vectorY.x*(vectorX.y*mPole[1][0] + vectorY.y*mPole[1][1] + vectorZ.y*mPole[1][2]);
-            labFrameQuadrupoles[offset+1] += vectorZ.x*(vectorX.y*mPole[2][0] + vectorY.y*mPole[2][1] + vectorZ.y*mPole[2][2]);
-            labFrameQuadrupoles[offset+2] = vectorX.x*(vectorX.z*mPole[0][0] + vectorY.z*mPole[0][1] + vectorZ.z*mPole[0][2]);
-            labFrameQuadrupoles[offset+2] += vectorY.x*(vectorX.z*mPole[1][0] + vectorY.z*mPole[1][1] + vectorZ.z*mPole[1][2]);
-            labFrameQuadrupoles[offset+2] += vectorZ.x*(vectorX.z*mPole[2][0] + vectorY.z*mPole[2][1] + vectorZ.z*mPole[2][2]);
-            labFrameQuadrupoles[offset+3] = labFrameQuadrupoles[offset+1];
-            labFrameQuadrupoles[offset+6] = labFrameQuadrupoles[offset+2];
-            labFrameQuadrupoles[offset+7] = labFrameQuadrupoles[offset+5];
        }
    }
 }
+extern "C" __global__ void recordInducedDipoles(const long long* __restrict__ fieldBuffers, const long long* __restrict__ fieldPolarBuffers,
+        real* __restrict__ inducedDipole, real* __restrict__ inducedDipolePolar, const float* __restrict__ polarizability) {
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += gridDim.x*blockDim.x) {
+        real scale = polarizability[atom]/(real) 0xFFFFFFFF;
+        inducedDipole[3*atom] = scale*fieldBuffers[atom];
+        inducedDipole[3*atom+1] = scale*fieldBuffers[atom+PADDED_NUM_ATOMS];
+        inducedDipole[3*atom+2] = scale*fieldBuffers[atom+PADDED_NUM_ATOMS*2];
+        inducedDipolePolar[3*atom] = scale*fieldPolarBuffers[atom];
+        inducedDipolePolar[3*atom+1] = scale*fieldPolarBuffers[atom+PADDED_NUM_ATOMS];
+        inducedDipolePolar[3*atom+2] = scale*fieldPolarBuffers[atom+PADDED_NUM_ATOMS*2];
+    }
+}
\ No newline at end of file
--- a/plugins/amoeba/platforms/cuda2/tests/TestCudaAmoebaVdwForce.cpp
+++ b/plugins/amoeba/platforms/cuda2/tests/TestCudaAmoebaVdwForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
 * Authors: Mark Friedrichs                                                   *
 * Contributors:                                                              *
 *                                                                            *
@@ -42,6 +42,13 @@
 #include "openmm/LangevinIntegrator.h"
 #include <iostream>
 #include <vector>
+#include <stdlib.h>
+#include <stdio.h>
+#define ASSERT_EQUAL_TOL_MOD(expected, found, tol, testname) {double _scale_ = std::abs(expected) > 1.0 ? std::abs(expected) : 1.0; if (!(std::abs((expected)-(found))/_scale_ <= (tol))) {std::stringstream details; details << testname << " Expected "<<(expected)<<", found "<<(found); throwException(__FILE__, __LINE__, details.str());}};
+#define ASSERT_EQUAL_VEC_MOD(expected, found, tol,testname) {ASSERT_EQUAL_TOL_MOD((expected)[0], (found)[0], (tol),(testname)); ASSERT_EQUAL_TOL_MOD((expected)[1], (found)[1], (tol),(testname)); ASSERT_EQUAL_TOL_MOD((expected)[2], (found)[2], (tol),(testname));};
 using namespace OpenMM;
 const double TOL = 1e-4;
@@ -56,22 +63,20 @@ void testVdw( FILE* log ) {
    std::string epsilonCombiningRule = std::string("HHG");
    amoebaVdwForce->setEpsilonCombiningRule( epsilonCombiningRule );
+    int classIndex = 0;
    for( int ii = 0; ii < numberOfParticles; ii++ ){
-        int indexIV, indexClass;
+        int indexIV;
        double mass, sigma, epsilon, reduction;
        std::vector< int > exclusions;
        if( ii == 0 || ii == 3 ){
            mass        = 16.0;
            indexIV     = ii;
-            indexClass  = 70;
            sigma       = 1.70250E+00;
            epsilon     = 1.10000E-01;
            reduction   = 0.0;
        } else {
            mass        = 1.0;
            indexIV     = ii < 3 ? 0 : 3;
-            indexClass  = 71;
            sigma       = 1.32750E+00;
            epsilon     = 1.35000E-02;
            reduction   = 0.91;
@@ -89,7 +94,7 @@ void testVdw( FILE* log ) {
            exclusions.push_back ( 5 );
        }
        system.addParticle(mass);
-        amoebaVdwForce->addParticle( indexIV, indexClass, sigma, epsilon, reduction );
+        amoebaVdwForce->addParticle( indexIV, classIndex, sigma, epsilon, reduction );
        amoebaVdwForce->setParticleExclusions( ii, exclusions );
    }
    LangevinIntegrator integrator(0.0, 0.1, 0.01);
@@ -130,12 +135,13 @@ void testVdw( FILE* log ) {
        positions[ii][2] *= AngstromToNm;
    }
    for( int ii = 0; ii < amoebaVdwForce->getNumParticles();  ii++ ){
-        int indexIV, indexClass;
+        int indexIV;
+        int classIndex;
        double sigma, epsilon, reduction;
-        amoebaVdwForce->getParticleParameters( ii, indexIV, indexClass, sigma, epsilon, reduction );
+        amoebaVdwForce->getParticleParameters( ii, indexIV, classIndex, sigma, epsilon, reduction );
        sigma        *= AngstromToNm;
        epsilon      *= CalToJoule;
-        amoebaVdwForce->setParticleParameters( ii, indexIV, indexClass, sigma, epsilon, reduction );
+        amoebaVdwForce->setParticleParameters( ii, indexIV, classIndex, sigma, epsilon, reduction );
    }
    platformName = "CUDA";
    Context context(system, integrator, Platform::getPlatformByName( platformName ) );
@@ -170,6 +176,342 @@ void testVdw( FILE* log ) {
    ASSERT_EQUAL_TOL( expectedEnergy, state.getPotentialEnergy(), tolerance );
 }
+void setupAndGetForcesEnergyVdwAmmonia( const std::string& sigmaCombiningRule, const std::string& epsilonCombiningRule, double cutoff,
+                                        double boxDimension, std::vector<Vec3>& forces, double& energy, FILE* log ){
+    // beginning of Vdw setup
+    System system;
+    AmoebaVdwForce* amoebaVdwForce        = new AmoebaVdwForce();;
+    int numberOfParticles                 = 8;
+    amoebaVdwForce->setSigmaCombiningRule( sigmaCombiningRule );
+    amoebaVdwForce->setEpsilonCombiningRule( epsilonCombiningRule );
+    amoebaVdwForce->setUseNeighborList( 1 );
+    amoebaVdwForce->setCutoff( cutoff );
+    if( boxDimension > 0.0 ){
+        Vec3 a( boxDimension, 0.0, 0.0 );
+        Vec3 b( 0.0, boxDimension, 0.0 );
+        Vec3 c( 0.0, 0.0, boxDimension );
+        system.setDefaultPeriodicBoxVectors( a, b, c );
+        amoebaVdwForce->setPBC( 1 );
+    } else {
+        amoebaVdwForce->setPBC( 0 );
+    }
+    // addParticle: ivIndex, radius, epsilon, reductionFactor
+    int classIndex = 0;
+    system.addParticle(   1.4007000e+01 );
+    amoebaVdwForce->addParticle( 0, classIndex,   1.8550000e-01,   4.3932000e-01,   0.0000000e+00 );
+    system.addParticle(   1.0080000e+00 );
+    amoebaVdwForce->addParticle( 0, classIndex,   1.3500000e-01,   8.3680000e-02,   9.1000000e-01 );
+    system.addParticle(   1.0080000e+00 );
+    amoebaVdwForce->addParticle( 0, classIndex,   1.3500000e-01,   8.3680000e-02,   9.1000000e-01 );
+    system.addParticle(   1.0080000e+00 );
+    amoebaVdwForce->addParticle( 0, classIndex,   1.3500000e-01,   8.3680000e-02,   9.1000000e-01 );
+    system.addParticle(   1.4007000e+01 );
+    amoebaVdwForce->addParticle( 4, classIndex,   1.8550000e-01,   4.3932000e-01,   0.0000000e+00 );
+    system.addParticle(   1.0080000e+00 );
+    amoebaVdwForce->addParticle( 4, classIndex,   1.3500000e-01,   8.3680000e-02,   9.1000000e-01 );
+    system.addParticle(   1.0080000e+00 );
+    amoebaVdwForce->addParticle( 4, classIndex,   1.3500000e-01,   8.3680000e-02,   9.1000000e-01 );
+    system.addParticle(   1.0080000e+00 );
+    amoebaVdwForce->addParticle( 4, classIndex,   1.3500000e-01,   8.3680000e-02,   9.1000000e-01 );
+    // ParticleExclusions
+    std::vector< int > exclusions;
+    exclusions.resize(0);
+    exclusions.push_back( 0 );
+    exclusions.push_back( 1 );
+    exclusions.push_back( 2 );
+    exclusions.push_back( 3 );
+    amoebaVdwForce->setParticleExclusions( 0, exclusions );
+    exclusions.resize(0);
+    exclusions.push_back( 1 );
+    exclusions.push_back( 0 );
+    exclusions.push_back( 2 );
+    exclusions.push_back( 3 );
+    amoebaVdwForce->setParticleExclusions( 1, exclusions );
+    exclusions.resize(0);
+    exclusions.push_back( 2 );
+    exclusions.push_back( 0 );
+    exclusions.push_back( 1 );
+    exclusions.push_back( 3 );
+    amoebaVdwForce->setParticleExclusions( 2, exclusions );
+    exclusions.resize(0);
+    exclusions.push_back( 3 );
+    exclusions.push_back( 0 );
+    exclusions.push_back( 1 );
+    exclusions.push_back( 2 );
+    amoebaVdwForce->setParticleExclusions( 3, exclusions );
+    exclusions.resize(0);
+    exclusions.push_back( 4 );
+    exclusions.push_back( 5 );
+    exclusions.push_back( 6 );
+    exclusions.push_back( 7 );
+    amoebaVdwForce->setParticleExclusions( 4, exclusions );
+    exclusions.resize(0);
+    exclusions.push_back( 5 );
+    exclusions.push_back( 4 );
+    exclusions.push_back( 6 );
+    exclusions.push_back( 7 );
+    amoebaVdwForce->setParticleExclusions( 5, exclusions );
+    exclusions.resize(0);
+    exclusions.push_back( 6 );
+    exclusions.push_back( 4 );
+    exclusions.push_back( 5 );
+    exclusions.push_back( 7 );
+    amoebaVdwForce->setParticleExclusions( 6, exclusions );
+    exclusions.resize(0);
+    exclusions.push_back( 7 );
+    exclusions.push_back( 4 );
+    exclusions.push_back( 5 );
+    exclusions.push_back( 6 );
+    amoebaVdwForce->setParticleExclusions( 7, exclusions );
+    // end of Vdw setup
+    std::vector<Vec3> positions(numberOfParticles);
+    positions[0]              = Vec3(   1.5927280e-01,   1.7000000e-06,    1.6491000e-03 );
+    positions[1]              = Vec3(   2.0805540e-01,  -8.1258800e-02,    3.7282500e-02 );
+    positions[2]              = Vec3(   2.0843610e-01,   8.0953200e-02,    3.7462200e-02 );
+    positions[3]              = Vec3(   1.7280780e-01,   2.0730000e-04,   -9.8741700e-02 );
+    positions[4]              = Vec3(  -1.6743680e-01,   1.5900000e-05,   -6.6149000e-03 );
+    positions[5]              = Vec3(  -2.0428260e-01,   8.1071500e-02,    4.1343900e-02 );
+    positions[6]              = Vec3(  -6.7308300e-02,   1.2800000e-05,    1.0623300e-02 );
+    positions[7]              = Vec3(  -2.0426290e-01,  -8.1231400e-02,    4.1033500e-02 );
+    system.addForce(amoebaVdwForce);
+    std::string platformName;
+    platformName = "CUDA";
+    LangevinIntegrator integrator(0.0, 0.1, 0.01);
+    Context context(system, integrator, Platform::getPlatformByName( platformName ) );
+    context.setPositions(positions);
+    State state                      = context.getState(State::Forces | State::Energy);
+    forces                           = state.getForces();
+    energy                           = state.getPotentialEnergy();
+}
+void compareForcesEnergy( std::string& testName, double expectedEnergy, double energy,
+                          std::vector<Vec3>& expectedForces,
+                          std::vector<Vec3>& forces, double tolerance, FILE* log ) {
+#ifdef AMOEBA_DEBUG
+    if( log ){
+        (void) fprintf( log, "%s: expected energy=%14.7e %14.7e\n", testName.c_str(), expectedEnergy, state.getPotentialEnergy() );
+        for( unsigned int ii = 0; ii < forces.size(); ii++ ){
+            (void) fprintf( log, "%6u [%14.7e %14.7e %14.7e]   [%14.7e %14.7e %14.7e]\n", ii,
+                            expectedForces[ii][0], expectedForces[ii][1], expectedForces[ii][2], forces[ii][0], forces[ii][1], forces[ii][2] );
+        }
+        (void) fflush( log );
+    }
+#endif
+    for( unsigned int ii = 0; ii < forces.size(); ii++ ){
+        ASSERT_EQUAL_VEC_MOD( expectedForces[ii], forces[ii], tolerance, testName );
+    }
+    ASSERT_EQUAL_TOL_MOD( expectedEnergy, energy, tolerance, testName );
+}
+// test VDW w/ sigmaRule=CubicMean and epsilonRule=HHG
+void testVdwAmmoniaCubicMeanHhg( FILE* log ) {
+    std::string testName      = "testVdwAmmoniaCubicMeanHhg";
+    int numberOfParticles     = 8;
+    double boxDimension       = -1.0;
+    double cutoff             = 9000000.0;
+    std::vector<Vec3> forces;
+    double energy;
+    setupAndGetForcesEnergyVdwAmmonia( "CUBIC-MEAN", "HHG", cutoff, boxDimension, forces, energy, log );
+    std::vector<Vec3> expectedForces(numberOfParticles);
+    double expectedEnergy     =  4.8012258e+00;
+    expectedForces[0]         = Vec3(   2.9265247e+02,  -1.4507808e-02,  -6.9562123e+00 );
+    expectedForces[1]         = Vec3(  -2.2451693e+00,   4.8143073e-01,  -2.0041494e-01 );
+    expectedForces[2]         = Vec3(  -2.2440698e+00,  -4.7905450e-01,  -2.0125284e-01 );
+    expectedForces[3]         = Vec3(  -1.0840394e+00,  -5.8531253e-04,   2.6934135e-01 );
+    expectedForces[4]         = Vec3(  -5.6305662e+01,   1.4733908e-03,  -1.8083306e-01 );
+    expectedForces[5]         = Vec3(   1.6750145e+00,  -3.2448374e-01,  -1.8030914e-01 );
+    expectedForces[6]         = Vec3(  -2.3412420e+02,   1.0754069e-02,   7.6287492e+00 );
+    expectedForces[7]         = Vec3(   1.6756544e+00,   3.2497316e-01,  -1.7906832e-01 );
+    double tolerance          = 1.0e-04;
+    compareForcesEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
+}
+// test VDW w/ sigmaRule=Arithmetic and epsilonRule=Arithmetic
+void testVdwAmmoniaArithmeticArithmetic( FILE* log ) {
+    std::string testName      = "testVdwAmmoniaArithmeticArithmetic";
+    int numberOfParticles     = 8;
+    double boxDimension       = -1.0;
+    double cutoff             = 9000000.0;
+    std::vector<Vec3> forces;
+    double energy;
+    setupAndGetForcesEnergyVdwAmmonia( "ARITHMETIC", "ARITHMETIC", cutoff, boxDimension, forces, energy, log );
+    std::vector<Vec3> expectedForces(numberOfParticles);
+    double expectedEnergy     =  4.2252403e+00;
+    expectedForces[0]         = Vec3(   3.0603839e+02,  -1.5550310e-02,  -7.2661707e+00 );
+    expectedForces[1]         = Vec3(  -2.7801357e+00,   5.8805051e-01,  -2.5907269e-01 );
+    expectedForces[2]         = Vec3(  -2.7753968e+00,  -5.8440732e-01,  -2.5969111e-01 );
+    expectedForces[3]         = Vec3(  -2.2496416e+00,  -1.1797440e-03,   5.5501757e-01 );
+    expectedForces[4]         = Vec3(  -5.5077629e+01,   8.3417114e-04,  -3.3668921e-01 );
+    expectedForces[5]         = Vec3(   2.3752452e+00,  -4.6788669e-01,  -2.4907764e-01 );
+    expectedForces[6]         = Vec3(  -2.4790697e+02,   1.1419770e-02,   8.0629999e+00 );
+    expectedForces[7]         = Vec3(   2.3761408e+00,   4.6871961e-01,  -2.4731607e-01 );
+    double tolerance          = 1.0e-04;
+    compareForcesEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
+}
+// test VDW w/ sigmaRule=Geometric and epsilonRule=Geometric
+void testVdwAmmoniaGeometricGeometric( FILE* log ) {
+    std::string testName      = "testVdwAmmoniaGeometricGeometric";
+    int numberOfParticles     = 8;
+    double boxDimension       = -1.0;
+    double cutoff             = 9000000.0;
+    std::vector<Vec3> forces;
+    double energy;
+    setupAndGetForcesEnergyVdwAmmonia( "GEOMETRIC", "GEOMETRIC", cutoff, boxDimension, forces, energy, log );
+    std::vector<Vec3> expectedForces(numberOfParticles);
+    double expectedEnergy     =  2.5249914e+00;
+    expectedForces[0]         = Vec3(   2.1169631e+02,  -1.0710925e-02,  -4.3728025e+00 );
+    expectedForces[1]         = Vec3(  -2.2585621e+00,   4.8409995e-01,  -2.0188344e-01 );
+    expectedForces[2]         = Vec3(  -2.2551351e+00,  -4.8124855e-01,  -2.0246986e-01 );
+    expectedForces[3]         = Vec3(  -1.7178028e+00,  -9.0851787e-04,   4.2466975e-01 );
+    expectedForces[4]         = Vec3(  -4.8302147e+01,   9.6603376e-04,  -5.7972068e-01 );
+    expectedForces[5]         = Vec3(   1.8100634e+00,  -3.5214093e-01,  -1.9357207e-01 );
+    expectedForces[6]         = Vec3(  -1.6078365e+02,   7.2117601e-03,   5.3180261e+00 );
+    expectedForces[7]         = Vec3(   1.8109211e+00,   3.5273117e-01,  -1.9224723e-01 );
+    double tolerance          = 1.0e-04;
+    compareForcesEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
+}
+void testVdwAmmoniaCubicMeanHarmonic( FILE* log ) {
+    std::string testName      = "testVdwAmmoniaCubicMeanHarmonic";
+    int numberOfParticles     = 8;
+    double boxDimension       = -1.0;
+    double cutoff             = 9000000.0;
+    std::vector<Vec3> forces;
+    double energy;
+    setupAndGetForcesEnergyVdwAmmonia( "CUBIC-MEAN", "HARMONIC", cutoff, boxDimension, forces, energy, log );
+    std::vector<Vec3> expectedForces(numberOfParticles);
+    double expectedEnergy     =  4.1369069e+00;
+    expectedForces[0]         = Vec3(   2.5854436e+02,  -1.2779529e-02,  -5.9041148e+00 );
+    expectedForces[1]         = Vec3(  -2.0832419e+00,   4.4915831e-01,  -1.8266000e-01 );
+    expectedForces[2]         = Vec3(  -2.0823991e+00,  -4.4699804e-01,  -1.8347141e-01 );
+    expectedForces[3]         = Vec3(  -9.5914714e-01,  -5.2162026e-04,   2.3873165e-01 );
+    expectedForces[4]         = Vec3(  -5.3724787e+01,   1.4838241e-03,  -2.8089191e-01 );
+    expectedForces[5]         = Vec3(   1.5074325e+00,  -2.9016397e-01,  -1.6385118e-01 );
+    expectedForces[6]         = Vec3(  -2.0271029e+02,   9.2367947e-03,   6.6389988e+00 );
+    expectedForces[7]         = Vec3(   1.5080748e+00,   2.9058422e-01,  -1.6274118e-01 );
+    double tolerance          = 1.0e-04;
+    compareForcesEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
+}
+// test w/ cutoff=0.25 nm; single ixn between two particles (0 and 6); force nonzero on
+// particle 4 due to reduction applied to NH
+// the distance between 0 and 6 is ~ 0.235 so the ixn is in the tapered region
+void testVdwTaper( FILE* log ) {
+    std::string testName      = "testVdwTaper";
+    int numberOfParticles     = 8;
+    double boxDimension       = -1.0;
+    double cutoff             = 0.25;
+    std::vector<Vec3> forces;
+    double energy;
+    setupAndGetForcesEnergyVdwAmmonia( "CUBIC-MEAN", "HHG", cutoff, boxDimension, forces, energy, log );
+    std::vector<Vec3> expectedForces(numberOfParticles);
+    double expectedEnergy     =  3.5478444e+00;
+    expectedForces[0]         = Vec3(   5.6710779e+02,  -2.7391004e-02,  -1.7867730e+01 );
+    expectedForces[1]         = Vec3(  -0.0000000e+00,  -0.0000000e+00,  -0.0000000e+00 );
+    expectedForces[2]         = Vec3(  -0.0000000e+00,  -0.0000000e+00,  -0.0000000e+00 );
+    expectedForces[3]         = Vec3(  -0.0000000e+00,  -0.0000000e+00,  -0.0000000e+00 );
+    expectedForces[4]         = Vec3(  -5.1039701e+01,   2.4651903e-03,   1.6080957e+00 );
+    expectedForces[5]         = Vec3(  -0.0000000e+00,  -0.0000000e+00,  -0.0000000e+00 );
+    expectedForces[6]         = Vec3(  -5.1606809e+02,   2.4925813e-02,   1.6259634e+01 );
+    expectedForces[7]         = Vec3(  -0.0000000e+00,  -0.0000000e+00,  -0.0000000e+00 );
+    double tolerance          = 1.0e-04;
+    compareForcesEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
+}
+// test PBC
+void testVdwPBC( FILE* log ) {
+    std::string testName      = "testVdwPBC";
+    int numberOfParticles     = 8;
+    double boxDimension       = 0.6;
+    double cutoff             = 0.25;
+    std::vector<Vec3> forces;
+    double energy;
+    setupAndGetForcesEnergyVdwAmmonia( "CUBIC-MEAN", "HHG", cutoff, boxDimension, forces, energy, log );
+    std::vector<Vec3> expectedForces(numberOfParticles);
+    double expectedEnergy     =  8.4385405e+00;
+    expectedForces[0]         = Vec3(   5.1453069e+02,   4.9751912e-01,  -1.2759570e+01 );
+    expectedForces[1]         = Vec3(  -2.5622586e+02,  -4.6524265e+01,   2.4281465e+01 );
+    expectedForces[2]         = Vec3(  -2.7538705e+02,   5.1831690e+01,   2.7367710e+01 );
+    expectedForces[3]         = Vec3(  -0.0000000e+00,  -0.0000000e+00,  -0.0000000e+00 );
+    expectedForces[4]         = Vec3(   3.0883034e+02,  -5.8876974e+00,  -5.8286122e+01 );
+    expectedForces[5]         = Vec3(   1.1319359e+02,  -3.2047069e-01,   1.6181231e+00 );
+    expectedForces[6]         = Vec3(  -5.1606809e+02,   2.4925813e-02,   1.6259634e+01 );
+    expectedForces[7]         = Vec3(   1.1112638e+02,   3.7829857e-01,   1.5187587e+00 );
+    // tolerance is higher here due to interpolation used in setting tapering coefficients;
+    // if tapering turned off, then absolute difference < 2.0e-05
+    double tolerance          = 5.0e-04;
+    compareForcesEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
+}
 int main( int numberOfArguments, char* argv[] ) {
@@ -178,7 +520,37 @@ int main( int numberOfArguments, char* argv[] ) {
        registerAmoebaCudaKernelFactories();
        FILE* log = NULL;
        testVdw( log );
+        // tests using two ammonia molecules
+        // test VDW w/ sigmaRule=CubicMean and epsilonRule=HHG
+        testVdwAmmoniaCubicMeanHhg( log );
+        // test VDW w/ sigmaRule=Arithmetic and epsilonRule=Arithmetic
+        testVdwAmmoniaArithmeticArithmetic( log );
+        // test VDW w/ sigmaRule=Geometric and epsilonRule=Geometric
+        testVdwAmmoniaGeometricGeometric( log );
+        // test VDW w/ sigmaRule=CubicMean and epsilonRule=Harmonic
+        testVdwAmmoniaCubicMeanHarmonic( log );
+        // test w/ cutoff=0.25 nm; single ixn between two particles (0 and 6); force nonzero on
+        // particle 4 due to reduction applied to NH
+        // the distance between 0 and 6 is ~ 0.235 so the ixn is in the tapered region
+        testVdwTaper( log );
+        // test PBC
+        testVdwPBC( log );
    } catch(const std::exception& e) {
        std::cout << "exception: " << e.what() << std::endl;
        std::cout << "FAIL - ERROR.  Test failed." << std::endl;