Continuing to convert AmoebaMultipoleForce

776f6f22 · Peter Eastman · b186314c · 776f6f22 · 776f6f22 · 776f6f22
Commit 776f6f22 authored Jul 28, 2012 by Peter Eastman
8 changed files
--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
@@ -865,7 +865,7 @@ private:
 CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : 
        CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false),
-        multipoleParticles(NULL), torqueBufferIndices(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL),
+        multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL),
        labFrameDipoles(NULL), labFrameQuadrupoles(NULL), field(NULL), fieldPolar(NULL), dampingAndThole(NULL),
        inducedDipole(NULL), inducedDipolePolar(NULL), currentEpsilon(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
        pmeGrid(NULL) {
@@ -875,8 +875,6 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
    cu.setAsCurrent();
    if (multipoleParticles != NULL)
        delete multipoleParticles;
-    if (torqueBufferIndices != NULL)
-        delete torqueBufferIndices;
    if (molecularDipoles != NULL)
        delete molecularDipoles;
    if (molecularQuadrupoles != NULL)
@@ -933,8 +931,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        multipoleParticlesVec.push_back(make_int4(atomX, atomY, atomZ, axisType));
        for (int j = 0; j < 3; j++)
            molecularDipolesVec.push_back((float) dipole[j]);
-        for (int j = 0; j < 9; j++)
+        molecularQuadrupolesVec.push_back((float) quadrupole[0]);
-            molecularQuadrupolesVec.push_back((float) quadrupole[j]);
+        molecularQuadrupolesVec.push_back((float) quadrupole[1]);
+        molecularQuadrupolesVec.push_back((float) quadrupole[2]);
+        molecularQuadrupolesVec.push_back((float) quadrupole[4]);
+        molecularQuadrupolesVec.push_back((float) quadrupole[5]);
    }
    int paddedNumAtoms = cu.getPaddedNumAtoms();
    for (int i = numMultipoles; i < paddedNumAtoms; i++) {
@@ -943,14 +944,14 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        multipoleParticlesVec.push_back(make_int4(0, 0, 0, 0));
        for (int j = 0; j < 3; j++)
            molecularDipolesVec.push_back(0);
-        for (int j = 0; j < 9; j++)
+        for (int j = 0; j < 5; j++)
            molecularQuadrupolesVec.push_back(0);
    }
    dampingAndThole = CudaArray::create<float2>(cu, paddedNumAtoms, "dampingAndThole");
    polarizability = CudaArray::create<float>(cu, paddedNumAtoms, "polarizability");
    multipoleParticles = CudaArray::create<int4>(cu, paddedNumAtoms, "multipoleParticles");
    molecularDipoles = CudaArray::create<float>(cu, 3*paddedNumAtoms, "molecularDipoles");
-    molecularQuadrupoles = CudaArray::create<float>(cu, 9*paddedNumAtoms, "molecularQuadrupoles");
+    molecularQuadrupoles = CudaArray::create<float>(cu, 5*paddedNumAtoms, "molecularQuadrupoles");
    dampingAndThole->upload(dampingAndTholeVec);
    polarizability->upload(polarizabilityVec);
    multipoleParticles->upload(multipoleParticlesVec);
@@ -962,7 +963,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
    labFrameDipoles = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "labFrameDipoles");
-    labFrameQuadrupoles = new CudaArray(cu, 9*paddedNumAtoms, elementSize, "labFrameQuadrupoles");
+    labFrameQuadrupoles = new CudaArray(cu, 5*paddedNumAtoms, elementSize, "labFrameQuadrupoles");
    field = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "field");
    fieldPolar = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "fieldPolar");
    inducedDipole = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "inducedDipole");
@@ -999,6 +1000,15 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
            polarizationFlagValues.push_back(make_int2(i, atoms[j]));
    }
+    // Record the induced-dipole iteration settings; only mutual polarization iterates.
+    if (force.getPolarizationType() == AmoebaMultipoleForce::Mutual) {
+        maxInducedIterations = force.getMutualInducedMaxIterations();
+        inducedEpsilon = force.getMutualInducedTargetEpsilon();
+    }
+    else
+        maxInducedIterations = 0; // assignment (was a no-op '=='), so the iteration loop in execute() has a defined bound of zero
    // Create the kernels.
@@ -1007,13 +1017,29 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(numMultipoles);
    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-    defines["SCALING_DISTANCE_CUTOFF"] = cu.doubleToString(50.0);
    defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
    defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
+    defines["ENERGY_SCALE_FACTOR"] = cu.doubleToString(138.9354558456); // DIVIDE BY INNER DIELECTRIC!!!
+    if (force.getPolarizationType() == AmoebaMultipoleForce::Direct)
+        defines["DIRECT_POLARIZATION"] = "";
    CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoles, defines);
    computeMomentsKernel = cu.getKernel(module, "computeLabFrameMoments");
+    recordInducedDipolesKernel = cu.getKernel(module, "recordInducedDipoles");
    module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleFixedField, defines);
    computeFixedFieldKernel = cu.getKernel(module, "computeFixedField");
+    stringstream electrostaticsSource;
+    electrostaticsSource << CudaKernelSources::vectorOps;
+    electrostaticsSource << CudaAmoebaKernelSources::multipoleElectrostatics;
+    electrostaticsSource << "#define F1\n";
+    electrostaticsSource << CudaAmoebaKernelSources::electrostaticPairForce;
+    electrostaticsSource << "#undef F1\n";
+    electrostaticsSource << "#define T1\n";
+    electrostaticsSource << CudaAmoebaKernelSources::electrostaticPairForce;
+    electrostaticsSource << "#undef T1\n";
+    electrostaticsSource << "#define T2\n";
+    electrostaticsSource << CudaAmoebaKernelSources::electrostaticPairForce;
+    module = cu.createModule(electrostaticsSource.str(), defines);
+    electrostaticsKernel = cu.getKernel(module, "computeElectrostatics");
    // Set up PME.
@@ -1389,30 +1415,34 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
    void* computeMomentsArgs[] = {&cu.getPosq().getDevicePointer(), &multipoleParticles->getDevicePointer(),
        &molecularDipoles->getDevicePointer(), &molecularQuadrupoles->getDevicePointer(),
        &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer()};
-    cu.executeKernel(computeMomentsKernel, computeMomentsArgs, cu.getPaddedNumAtoms());
+    cu.executeKernel(computeMomentsKernel, computeMomentsArgs, cu.getNumAtoms());
-    vector<float> d, q;
-    labFrameDipoles->download(d);
-    labFrameQuadrupoles->download(q);
-    for (int i = 0; i < cu.getNumAtoms(); i++)
-        printf("%d %g %g %g\n", i, d[3*i], d[3*i+1], d[3*i+2]);
-    for (int i = 0; i < cu.getNumAtoms(); i++)
-        printf("%d %g %g %g %g %g %g %g %g %g\n", i, q[9*i], q[9*i+1], q[9*i+2], q[9*i+3], q[9*i+4], q[9*i+5], q[9*i+6], q[9*i+7], q[9*i+8]);
    int startTileIndex = nb.getStartTileIndex();
    int numTileIndices = nb.getNumTiles();
    int numForceThreadBlocks = nb.getNumForceThreadBlocks();
    int forceThreadBlockSize = nb.getForceThreadBlockSize();
    if (pmeGrid == NULL) {
+        // Compute induced dipoles.
        void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
-            &nb.getExclusions().getDevicePointer(), &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
+            &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
            &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
        cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
-        vector<unsigned long long> f;
+        void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
-        field->download(f);
+            &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &polarizability->getDevicePointer()};
-        int pad = cu.getPaddedNumAtoms();
+        cu.executeKernel(recordInducedDipolesKernel, recordInducedDipolesArgs, cu.getNumAtoms());
-        for (int i = 0; i < cu.getNumAtoms(); i++) {
+        for (int i = 0; i < maxInducedIterations; i++) {
-            printf("%d %g %g %g\n", i, f[i]/(double)0xFFFFFFFF, f[i+pad]/(double)0xFFFFFFFF, f[i+pad*2]/(double)0xFFFFFFFF);
        }
+        // Compute electrostatic force.
+        void* electrostaticsArgs[] = {&cu.getForce().getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
+            &cu.getPosq().getDevicePointer(), &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
+            &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
+            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
+            &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
+        cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
    return 0.0;
 }
@@ -1598,28 +1628,22 @@ void CudaCalcAmoebaVdwForceKernel::initialize(const System& system, const Amoeba
        throw OpenMMException("Illegal combining rule for sigma: "+sigmaCombiningRule);
    string epsilonCombiningRule = force.getEpsilonCombiningRule();
    if (epsilonCombiningRule == "ARITHMETIC")
-        replacements["EPILON_COMBINING_RULE"] = "1";
+        replacements["EPSILON_COMBINING_RULE"] = "1";
    else if (epsilonCombiningRule == "GEOMETRIC")
-        replacements["EPILON_COMBINING_RULE"] = "2";
+        replacements["EPSILON_COMBINING_RULE"] = "2";
    else if (epsilonCombiningRule == "HARMONIC")
-        replacements["EPILON_COMBINING_RULE"] = "3";
+        replacements["EPSILON_COMBINING_RULE"] = "3";
    else if (epsilonCombiningRule == "HHG")
-        replacements["EPILON_COMBINING_RULE"] = "4";
+        replacements["EPSILON_COMBINING_RULE"] = "4";
    else
        throw OpenMMException("Illegal combining rule for sigma: "+sigmaCombiningRule);
    double cutoff = force.getCutoff();
    double taperCutoff = cutoff*0.9;
    replacements["CUTOFF_DISTANCE"] = cu.doubleToString(force.getCutoff());
    replacements["TAPER_CUTOFF"] = cu.doubleToString(taperCutoff);
-    double cutoff2 = cutoff*cutoff;
+    replacements["TAPER_C3"] = cu.doubleToString(10/pow(taperCutoff-cutoff, 3.0));
-    double taperCutoff2 = taperCutoff*taperCutoff;
+    replacements["TAPER_C4"] = cu.doubleToString(15/pow(taperCutoff-cutoff, 4.0));
-    double denom = pow(cutoff-taperCutoff, -5.0);
+    replacements["TAPER_C5"] = cu.doubleToString(6/pow(taperCutoff-cutoff, 5.0));
-    replacements["TAPER_C0"] = cu.doubleToString(cutoff*cutoff2 * (cutoff2-5.0*cutoff*taperCutoff+10.0*taperCutoff2)*denom);
-    replacements["TAPER_C1"] = cu.doubleToString(-30.0*cutoff2*taperCutoff2*denom);
-    replacements["TAPER_C2"] = cu.doubleToString(30.0*(cutoff2*taperCutoff+cutoff*taperCutoff2)*denom);
-    replacements["TAPER_C3"] = cu.doubleToString(-10.0*(cutoff2+4.0*cutoff*taperCutoff+taperCutoff2)*denom);
-    replacements["TAPER_C4"] = cu.doubleToString(15.0*(cutoff+taperCutoff)*denom);
-    replacements["TAPER_C5"] = cu.doubleToString(-6.0*denom);
    nonbonded->addInteraction(force.getUseNeighborList(), force.getPBC(), true, force.getCutoff(), exclusions,
        cu.replaceStrings(CudaAmoebaKernelSources::amoebaVdwForce2, replacements), force.getForceGroup());

--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
@@ -374,14 +374,14 @@ public:
 private:
    class ForceInfo;
    void initializeScaleFactors();
-    int numMultipoles;
+    int numMultipoles, maxInducedIterations;
+    double inducedEpsilon;
    bool hasInitializedScaleFactors;
    CudaContext& cu;
    System& system;
    std::vector<int3> covalentFlagValues;
    std::vector<int2> polarizationFlagValues;
    CudaArray* multipoleParticles;
-    CudaArray* torqueBufferIndices;
    CudaArray* molecularDipoles;
    CudaArray* molecularQuadrupoles;
    CudaArray* labFrameDipoles;
@@ -412,7 +412,7 @@ private:
    CudaArray* pmeAtomGridIndex;
    CudaSort* sort;
    cufftHandle fft;
-    CUfunction computeMomentsKernel, computeFixedFieldKernel;
+    CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, electrostaticsKernel;
 };
 /**

--- a/plugins/amoeba/platforms/cuda2/src/kernels/amoebaVdwForce2.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/amoebaVdwForce2.cu
@@ -15,13 +15,11 @@
    real sigma = 2*(sigmaEpsilon1.x*sigma1_2 + sigmaEpsilon2.x*sigma2_2)/(sigma1_2+sigma2_2);
 #endif
 #if EPSILON_COMBINING_RULE == 1
-    real epsilon = sigmaEpsilon1.y + sigmaEpsilon2.y;
+    real epsilon = 0.5f*(sigmaEpsilon1.y + sigmaEpsilon2.y);
 #elif EPSILON_COMBINING_RULE == 2
-    real epsilon = 2*SQRT(sigmaEpsilon1.y*sigmaEpsilon2.y);
+    real epsilon = SQRT(sigmaEpsilon1.y*sigmaEpsilon2.y);
 #elif EPSILON_COMBINING_RULE == 3
-    real epsilon1_2 = sigmaEpsilon1.x*sigmaEpsilon1.x;
+    real epsilon = 2*(sigmaEpsilon1.y*sigmaEpsilon2.y)/(sigmaEpsilon1.y+sigmaEpsilon2.y);
-    real epsilon2_2 = sigmaEpsilon2.x*sigmaEpsilon2.x;
-    real epsilon = 2*(sigmaEpsilon1.x*epsilon1_2 + sigmaEpsilon2.x*epsilon2_2)/(epsilon1_2+epsilon2_2);
 #else
    real epsilon_s = SQRT(sigmaEpsilon1.y) + SQRT(sigmaEpsilon2.y);
    real epsilon = 4*sigmaEpsilon1.y*sigmaEpsilon2.y/(epsilon_s*epsilon_s);
@@ -39,13 +37,14 @@
    real tmp = sigma7*invRho;
    real gTau = epsilon*tau7*r6*1.12f*tmp*tmp;
    real termEnergy = epsilon*sigma7*tau7*((sigma7*1.12f*invRho)-2.0f);
-    real deltaE = (-7.0f*(dTau*termEnergy+gTau))*invR;
+    real deltaE = -7.0f*(dTau*termEnergy+gTau);
    if (r > TAPER_CUTOFF) {
-        real taper = TAPER_C0+r*(TAPER_C1+r*(TAPER_C2+r*(TAPER_C3+r*(TAPER_C4+r*TAPER_C5))));
+        real x = r-TAPER_CUTOFF;
-        real dtaper = TAPER_C1+r*(2*TAPER_C2+r*(3*TAPER_C3+r*(4*TAPER_C4+r*5*TAPER_C5)));
+        real taper = 1+x*x*x*(TAPER_C3+x*(TAPER_C4+x*TAPER_C5));
+        real dtaper = x*x*(3*TAPER_C3+x*(4*TAPER_C4+x*5*TAPER_C5));
        deltaE = termEnergy*dtaper + deltaE*taper;
        termEnergy *= taper;
    }
    tempEnergy += (includeInteraction ? termEnergy : 0);
-    dEdR -= (includeInteraction ? deltaE : 0);
+    dEdR -= (includeInteraction ? deltaE*invR : 0);
 }
--- a/plugins/amoeba/platforms/cuda2/src/kernels/electrostaticPairForce.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/electrostaticPairForce.cu
--- a/plugins/amoeba/platforms/cuda2/src/kernels/multipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/multipoleElectrostatics.cu
--- a/plugins/amoeba/platforms/cuda2/src/kernels/multipoleFixedField.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/multipoleFixedField.cu
@@ -14,12 +14,12 @@ inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __res
    data.dipole.x = labFrameDipole[atom*3];
    data.dipole.y = labFrameDipole[atom*3+1];
    data.dipole.z = labFrameDipole[atom*3+2];
-    data.quadrupoleXX = labFrameQuadrupole[atom*9];
+    data.quadrupoleXX = labFrameQuadrupole[atom*5]; // packed layout: five values per atom in the order XX, XY, XZ, YY, YZ
-    data.quadrupoleXY = labFrameQuadrupole[atom*9+1];
+    data.quadrupoleXY = labFrameQuadrupole[atom*5+1];
-    data.quadrupoleXZ = labFrameQuadrupole[atom*9+2];
+    data.quadrupoleXZ = labFrameQuadrupole[atom*5+2];
-    data.quadrupoleYY = labFrameQuadrupole[atom*9+4];
+    data.quadrupoleYY = labFrameQuadrupole[atom*5+3];
-    data.quadrupoleYZ = labFrameQuadrupole[atom*9+5];
+    data.quadrupoleYZ = labFrameQuadrupole[atom*5+4];
-    data.quadrupoleZZ = labFrameQuadrupole[atom*9+8];
+    data.quadrupoleZZ = -(data.quadrupoleXX+data.quadrupoleYY); // ZZ is not stored; a traceless tensor gives ZZ = -(XX+YY), not 1-XX-YY
    float2 temp = dampingAndThole[atom];
    data.damp = temp.x;
    data.thole = temp.y;
@@ -38,7 +38,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
    float damp = atom1.damp*atom2.damp;
    real dampExp;
-    if (damp != 0 && r < SCALING_DISTANCE_CUTOFF) {
+    if (damp != 0) {
        // get scaling factors
@@ -83,7 +83,7 @@ __device__ real computeDScaleFactor(unsigned int polarizationGroup) {
    return (polarizationGroup & 1 ? 0 : 1);
 }
-__device__ float computeDScaleFactor(uint2 covalent) {
+__device__ float computeMScaleFactor(uint2 covalent) {
 //        int f1 = (value == 0 || value == 1 ? 1 : 0);
 //        int f2 = (value == 0 || value == 2 ? 1 : 0);
    // 0 = 12 or 13: x and y: 0
@@ -106,7 +106,7 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
 */
 extern "C" __global__ void computeFixedField(
        unsigned long long* __restrict__ fieldBuffers, unsigned long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
-        const unsigned int* __restrict__ exclusions, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
+        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
        const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices,
 #ifdef USE_CUTOFF
        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
@@ -185,11 +185,9 @@ extern "C" __global__ void computeFixedField(
                localData[localAtomIndex].quadrupoleZZ = data.quadrupoleZZ;
                localData[localAtomIndex].thole = data.thole; // IS THIS CORRECT?
                localData[localAtomIndex].damp = data.damp; // IS THIS CORRECT?
-                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
                uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
                unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    bool isExcluded = !(excl & 0x1);
                    int atom2 = tbx+j;
                    real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
 #ifdef USE_PERIODIC
@@ -200,13 +198,13 @@ extern "C" __global__ void computeFixedField(
                    real3 field1;
                    real3 field2;
                    computeOneInteraction(data, localData[atom2], delta, field1, field2);
-                    if (!isExcluded) {
+                    atom2 = y*TILE_SIZE+j;
-                        float d = computeDScaleFactor(covalent);
+                    if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                        float d = computeDScaleFactor(polarizationGroup);
                        data.field += d*field1;
                        float p = computePScaleFactor(covalent, polarizationGroup);
                        data.fieldPolar += p*field1;
                    }
-                    excl >>= 1;
                    covalent.x >>= 1;
                    covalent.y >>= 1;
                    polarizationGroup >>= 1;
@@ -231,7 +229,6 @@ extern "C" __global__ void computeFixedField(
                        for (j = 0; j < TILE_SIZE; j++) {
                            if ((flags&(1<<j)) != 0) {
-                                bool isExcluded = false;
                                int atom2 = tbx+j;
                                int bufferIndex = 3*threadIdx.x;
                                real3 dEdR1 = make_real3(0);
@@ -298,16 +295,13 @@ extern "C" __global__ void computeFixedField(
                {
                    // Compute the full set of interactions in this tile.
-                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
                    uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
                    unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
-                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
                    covalent.x = (covalent.x >> tgx) | (covalent.x << (TILE_SIZE - tgx));
                    covalent.y = (covalent.y >> tgx) | (covalent.y << (TILE_SIZE - tgx));
                    polarizationGroup = (polarizationGroup >> tgx) | (polarizationGroup << (TILE_SIZE - tgx));
                    unsigned int tj = tgx;
                    for (j = 0; j < TILE_SIZE; j++) {
-                        bool isExcluded = !(excl & 0x1);
                        int atom2 = tbx+tj;
                        real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
 #ifdef USE_PERIODIC
@@ -318,15 +312,14 @@ extern "C" __global__ void computeFixedField(
                        real3 field1;
                        real3 field2;
                        computeOneInteraction(data, localData[atom2], delta, field1, field2);
-                        if (!isExcluded) {
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                            float d = computeDScaleFactor(covalent);
+                            float d = computeDScaleFactor(polarizationGroup);
                            data.field += d*field1;
                            localData[atom2].field += d*field2;
                            float p = computePScaleFactor(covalent, polarizationGroup);
                            data.fieldPolar += p*field1;
                            localData[atom2].fieldPolar += p*field2;
                        }
-                        excl >>= 1;
                        covalent.x >>= 1;
                        covalent.y >>= 1;
                        polarizationGroup >>= 1;

--- a/plugins/amoeba/platforms/cuda2/src/kernels/multipoles.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/multipoles.cu
@@ -8,10 +8,10 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
    // code common to ZThenX and Bisector
-    for (int particleIndex = blockIdx.x*blockDim.x+threadIdx.x; particleIndex < NUM_ATOMS; particleIndex += gridDim.x*blockDim.x) {
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += gridDim.x*blockDim.x) {
-        int4 particles = multipoleParticles[particleIndex];
+        int4 particles = multipoleParticles[atom];
        if (particles.x >= 0 && particles.z >= 0) {
-            real4 thisParticlePos = posq[particleIndex];
+            real4 thisParticlePos = posq[atom];
            real4 posZ = posq[particles.z];
            real3 vectorZ = make_real3(posZ.x-thisParticlePos.x, posZ.y-thisParticlePos.y, posZ.z-thisParticlePos.z);
            real4 posX = posq[particles.x];
@@ -149,7 +149,7 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
            // Transform the dipole
-            unsigned int offset = 3*particleIndex;
+            unsigned int offset = 3*atom;
            real molDipole[3];
            molDipole[0] = molecularDipoles[offset];
            molDipole[1] = molecularDipoles[offset+1];
@@ -164,55 +164,47 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
            // Transform the quadrupole
-            real mPole[3][3];
+            offset = 5*atom;
-            offset = 9*particleIndex;
+            real mPoleXX = molecularQuadrupoles[offset];
-            mPole[0][0] = molecularQuadrupoles[offset];
+            real mPoleXY = molecularQuadrupoles[offset+1];
-            mPole[0][1] = molecularQuadrupoles[offset+1];
+            real mPoleXZ = molecularQuadrupoles[offset+2];
-            mPole[0][2] = molecularQuadrupoles[offset+2];
+            real mPoleYY = molecularQuadrupoles[offset+3];
+            real mPoleYZ = molecularQuadrupoles[offset+4];
-            mPole[1][0] = molecularQuadrupoles[offset+3];
+            // ZZ is not stored; a traceless tensor gives ZZ = -(XX+YY), not 1-XX-YY.
+            real mPoleZZ = -(mPoleXX+mPoleYY);
-            mPole[1][1] = molecularQuadrupoles[offset+4];
-            mPole[1][2] = molecularQuadrupoles[offset+5];
-            mPole[2][0] = molecularQuadrupoles[offset+6];
-            mPole[2][1] = molecularQuadrupoles[offset+7];
-            mPole[2][2] = molecularQuadrupoles[offset+8];
            if (reverse) {
-                mPole[0][1] *= -1;
+                mPoleXY *= -1;
-                mPole[1][0] *= -1;
+                mPoleYZ *= -1;
-                mPole[1][2] *= -1;
-                mPole[2][1] *= -1;
            }
+            // Each lab-frame component is ONE three-term sum. The first two lines of every sum must
+            // not end in ';', otherwise the remaining '+ ...' lines become discarded expression statements.
-            labFrameQuadrupoles[offset+8] = vectorX.z*(vectorX.z*mPole[0][0] + vectorY.z*mPole[0][1] + vectorZ.z*mPole[0][2]);
+            labFrameQuadrupoles[offset] = vectorX.x*(vectorX.x*mPoleXX + vectorY.x*mPoleXY + vectorZ.x*mPoleXZ)
-            labFrameQuadrupoles[offset+8] += vectorY.z*(vectorX.z*mPole[1][0] + vectorY.z*mPole[1][1] + vectorZ.z*mPole[1][2]);
+                                        + vectorY.x*(vectorX.x*mPoleXY + vectorY.x*mPoleYY + vectorZ.x*mPoleYZ)
-            labFrameQuadrupoles[offset+8] += vectorZ.z*(vectorX.z*mPole[2][0] + vectorY.z*mPole[2][1] + vectorZ.z*mPole[2][2]);
+                                        + vectorZ.x*(vectorX.x*mPoleXZ + vectorY.x*mPoleYZ + vectorZ.x*mPoleZZ);
+            labFrameQuadrupoles[offset+1] = vectorX.x*(vectorX.y*mPoleXX + vectorY.y*mPoleXY + vectorZ.y*mPoleXZ)
-            labFrameQuadrupoles[offset+4] = vectorX.y*(vectorX.y*mPole[0][0] + vectorY.y*mPole[0][1] + vectorZ.y*mPole[0][2]);
+                                        + vectorY.x*(vectorX.y*mPoleXY + vectorY.y*mPoleYY + vectorZ.y*mPoleYZ)
-            labFrameQuadrupoles[offset+4] += vectorY.y*(vectorX.y*mPole[1][0] + vectorY.y*mPole[1][1] + vectorZ.y*mPole[1][2]);
+                                        + vectorZ.x*(vectorX.y*mPoleXZ + vectorY.y*mPoleYZ + vectorZ.y*mPoleZZ);
-            labFrameQuadrupoles[offset+4] += vectorZ.y*(vectorX.y*mPole[2][0] + vectorY.y*mPole[2][1] + vectorZ.y*mPole[2][2]);
+            labFrameQuadrupoles[offset+2] = vectorX.x*(vectorX.z*mPoleXX + vectorY.z*mPoleXY + vectorZ.z*mPoleXZ)
+                                        + vectorY.x*(vectorX.z*mPoleXY + vectorY.z*mPoleYY + vectorZ.z*mPoleYZ)
-            labFrameQuadrupoles[offset+5] = vectorX.y*(vectorX.z*mPole[0][0] + vectorY.z*mPole[0][1] + vectorZ.z*mPole[0][2]);
+                                        + vectorZ.x*(vectorX.z*mPoleXZ + vectorY.z*mPoleYZ + vectorZ.z*mPoleZZ);
-            labFrameQuadrupoles[offset+5] += vectorY.y*(vectorX.z*mPole[1][0] + vectorY.z*mPole[1][1] + vectorZ.z*mPole[1][2]);
+            labFrameQuadrupoles[offset+3] = vectorX.y*(vectorX.y*mPoleXX + vectorY.y*mPoleXY + vectorZ.y*mPoleXZ)
-            labFrameQuadrupoles[offset+5] += vectorZ.y*(vectorX.z*mPole[2][0] + vectorY.z*mPole[2][1] + vectorZ.z*mPole[2][2]);
+                                        + vectorY.y*(vectorX.y*mPoleXY + vectorY.y*mPoleYY + vectorZ.y*mPoleYZ)
+                                        + vectorZ.y*(vectorX.y*mPoleXZ + vectorY.y*mPoleYZ + vectorZ.y*mPoleZZ);
-            labFrameQuadrupoles[offset] = vectorX.x*(vectorX.x*mPole[0][0] + vectorY.x*mPole[0][1] + vectorZ.x*mPole[0][2]);
+            labFrameQuadrupoles[offset+4] = vectorX.y*(vectorX.z*mPoleXX + vectorY.z*mPoleXY + vectorZ.z*mPoleXZ)
-            labFrameQuadrupoles[offset] += vectorY.x*(vectorX.x*mPole[1][0] + vectorY.x*mPole[1][1] + vectorZ.x*mPole[1][2]);
+                                        + vectorY.y*(vectorX.z*mPoleXY + vectorY.z*mPoleYY + vectorZ.z*mPoleYZ)
-            labFrameQuadrupoles[offset] += vectorZ.x*(vectorX.x*mPole[2][0] + vectorY.x*mPole[2][1] + vectorZ.x*mPole[2][2]);
+                                        + vectorZ.y*(vectorX.z*mPoleXZ + vectorY.z*mPoleYZ + vectorZ.z*mPoleZZ);
-            labFrameQuadrupoles[offset+1] = vectorX.x*(vectorX.y*mPole[0][0] + vectorY.y*mPole[0][1] + vectorZ.y*mPole[0][2]);
-            labFrameQuadrupoles[offset+1] += vectorY.x*(vectorX.y*mPole[1][0] + vectorY.y*mPole[1][1] + vectorZ.y*mPole[1][2]);
-            labFrameQuadrupoles[offset+1] += vectorZ.x*(vectorX.y*mPole[2][0] + vectorY.y*mPole[2][1] + vectorZ.y*mPole[2][2]);
-            labFrameQuadrupoles[offset+2] = vectorX.x*(vectorX.z*mPole[0][0] + vectorY.z*mPole[0][1] + vectorZ.z*mPole[0][2]);
-            labFrameQuadrupoles[offset+2] += vectorY.x*(vectorX.z*mPole[1][0] + vectorY.z*mPole[1][1] + vectorZ.z*mPole[1][2]);
-            labFrameQuadrupoles[offset+2] += vectorZ.x*(vectorX.z*mPole[2][0] + vectorY.z*mPole[2][1] + vectorZ.z*mPole[2][2]);
-            labFrameQuadrupoles[offset+3] = labFrameQuadrupoles[offset+1];
-            labFrameQuadrupoles[offset+6] = labFrameQuadrupoles[offset+2];
-            labFrameQuadrupoles[offset+7] = labFrameQuadrupoles[offset+5];
        }
    }
 }
+extern "C" __global__ void recordInducedDipoles(const long long* __restrict__ fieldBuffers, const long long* __restrict__ fieldPolarBuffers, // Kernel: fills both induced-dipole arrays from the two accumulated field buffers.
+        real* __restrict__ inducedDipole, real* __restrict__ inducedDipolePolar, const float* __restrict__ polarizability) { // inputs are read-only; the two dipole arrays are the outputs
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += gridDim.x*blockDim.x) { // grid-stride loop over atoms [0, NUM_ATOMS); entries past NUM_ATOMS are not written
+        real scale = polarizability[atom]/(real) 0xFFFFFFFF; // per-atom polarizability over 0xFFFFFFFF; presumably undoes a fixed-point scaling of the field buffers -- TODO confirm
+        inducedDipole[3*atom] = scale*fieldBuffers[atom]; // field x component is read from index atom; output is interleaved (3 values per atom)
+        inducedDipole[3*atom+1] = scale*fieldBuffers[atom+PADDED_NUM_ATOMS]; // y component is offset by PADDED_NUM_ATOMS
+        inducedDipole[3*atom+2] = scale*fieldBuffers[atom+PADDED_NUM_ATOMS*2]; // z component is offset by 2*PADDED_NUM_ATOMS
+        inducedDipolePolar[3*atom] = scale*fieldPolarBuffers[atom]; // same indexing and scale applied to the polar field buffer
+        inducedDipolePolar[3*atom+1] = scale*fieldPolarBuffers[atom+PADDED_NUM_ATOMS];
+        inducedDipolePolar[3*atom+2] = scale*fieldPolarBuffers[atom+PADDED_NUM_ATOMS*2];
+    }
+}
\ No newline at end of file
--- a/plugins/amoeba/platforms/cuda2/tests/TestCudaAmoebaVdwForce.cpp
+++ b/plugins/amoeba/platforms/cuda2/tests/TestCudaAmoebaVdwForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
 * Authors: Mark Friedrichs                                                   *
 * Contributors:                                                              *
 *                                                                            *
@@ -42,6 +42,13 @@
 #include "openmm/LangevinIntegrator.h"
 #include <iostream>
 #include <vector>
+#include <stdlib.h>
+#include <stdio.h>
+#define ASSERT_EQUAL_TOL_MOD(expected, found, tol, testname) {double _scale_ = std::abs(expected) > 1.0 ? std::abs(expected) : 1.0; if (!(std::abs((expected)-(found))/_scale_ <= (tol))) {std::stringstream details; details << testname << " Expected "<<(expected)<<", found "<<(found); throwException(__FILE__, __LINE__, details.str());}};
+#define ASSERT_EQUAL_VEC_MOD(expected, found, tol,testname) {ASSERT_EQUAL_TOL_MOD((expected)[0], (found)[0], (tol),(testname)); ASSERT_EQUAL_TOL_MOD((expected)[1], (found)[1], (tol),(testname)); ASSERT_EQUAL_TOL_MOD((expected)[2], (found)[2], (tol),(testname));};
 using namespace OpenMM;
 const double TOL = 1e-4;
@@ -56,22 +63,20 @@ void testVdw( FILE* log ) {
    std::string epsilonCombiningRule = std::string("HHG");
    amoebaVdwForce->setEpsilonCombiningRule( epsilonCombiningRule );
+    int classIndex = 0;
    for( int ii = 0; ii < numberOfParticles; ii++ ){
-        int indexIV, indexClass;
+        int indexIV;
        double mass, sigma, epsilon, reduction;
        std::vector< int > exclusions;
        if( ii == 0 || ii == 3 ){
            mass        = 16.0;
            indexIV     = ii;
-            indexClass  = 70;
            sigma       = 1.70250E+00;
            epsilon     = 1.10000E-01;
            reduction   = 0.0;
        } else {
            mass        = 1.0;
            indexIV     = ii < 3 ? 0 : 3;
-            indexClass  = 71;
            sigma       = 1.32750E+00;
            epsilon     = 1.35000E-02;
            reduction   = 0.91;
@@ -89,7 +94,7 @@ void testVdw( FILE* log ) {
            exclusions.push_back ( 5 );
        }
        system.addParticle(mass);
-        amoebaVdwForce->addParticle( indexIV, indexClass, sigma, epsilon, reduction );
+        amoebaVdwForce->addParticle( indexIV, classIndex, sigma, epsilon, reduction );
        amoebaVdwForce->setParticleExclusions( ii, exclusions );
    }
    LangevinIntegrator integrator(0.0, 0.1, 0.01);
@@ -130,12 +135,13 @@ void testVdw( FILE* log ) {
        positions[ii][2] *= AngstromToNm;
    }
    for( int ii = 0; ii < amoebaVdwForce->getNumParticles();  ii++ ){
-        int indexIV, indexClass;
+        int indexIV;
+        int classIndex;
        double sigma, epsilon, reduction;
-        amoebaVdwForce->getParticleParameters( ii, indexIV, indexClass, sigma, epsilon, reduction );
+        amoebaVdwForce->getParticleParameters( ii, indexIV, classIndex, sigma, epsilon, reduction );
        sigma        *= AngstromToNm;
        epsilon      *= CalToJoule;
-        amoebaVdwForce->setParticleParameters( ii, indexIV, indexClass, sigma, epsilon, reduction );
+        amoebaVdwForce->setParticleParameters( ii, indexIV, classIndex, sigma, epsilon, reduction );
    }
    platformName = "CUDA";
    Context context(system, integrator, Platform::getPlatformByName( platformName ) );
@@ -170,6 +176,342 @@ void testVdw( FILE* log ) {
    ASSERT_EQUAL_TOL( expectedEnergy, state.getPotentialEnergy(), tolerance );
 }
+/**
+ * Build the 8-particle system used by the ammonia tests (two groups of four particles:
+ * one heavy particle followed by three light ones), evaluate it on the CUDA platform and
+ * return the resulting forces and potential energy.
+ *
+ * @param sigmaCombiningRule    passed to AmoebaVdwForce::setSigmaCombiningRule()
+ * @param epsilonCombiningRule  passed to AmoebaVdwForce::setEpsilonCombiningRule()
+ * @param cutoff                passed to AmoebaVdwForce::setCutoff()
+ * @param boxDimension          if > 0, edge length of a cubic periodic box and PBC is enabled;
+ *                              otherwise PBC is disabled
+ * @param forces                output: forces reported by the Context
+ * @param energy                output: potential energy reported by the Context
+ * @param log                   unused here
+ */
+void setupAndGetForcesEnergyVdwAmmonia( const std::string& sigmaCombiningRule, const std::string& epsilonCombiningRule, double cutoff,
+                                        double boxDimension, std::vector<Vec3>& forces, double& energy, FILE* log ){
+    const int particlesPerGroup = 4;
+    const int numberOfParticles = 8;
+    const int classIndex        = 0;
+
+    // force configuration
+    System system;
+    AmoebaVdwForce* vdwForce = new AmoebaVdwForce();
+    vdwForce->setSigmaCombiningRule( sigmaCombiningRule );
+    vdwForce->setEpsilonCombiningRule( epsilonCombiningRule );
+    vdwForce->setUseNeighborList( 1 );
+    vdwForce->setCutoff( cutoff );
+
+    // periodic box is only installed for a positive box dimension
+    const bool periodic = boxDimension > 0.0;
+    if( periodic ){
+        Vec3 boxX( boxDimension, 0.0, 0.0 );
+        Vec3 boxY( 0.0, boxDimension, 0.0 );
+        Vec3 boxZ( 0.0, 0.0, boxDimension );
+        system.setDefaultPeriodicBoxVectors( boxX, boxY, boxZ );
+    }
+    vdwForce->setPBC( periodic ? 1 : 0 );
+
+    // addParticle arguments: ivIndex, classIndex, sigma, epsilon, reductionFactor
+    // the first particle of each group is the heavy one and is the ivIndex of the whole group
+    for( int particle = 0; particle < numberOfParticles; particle++ ){
+        const int groupStart = (particle/particlesPerGroup)*particlesPerGroup;
+        if( particle == groupStart ){
+            system.addParticle(   1.4007000e+01 );
+            vdwForce->addParticle( groupStart, classIndex,   1.8550000e-01,   4.3932000e-01,   0.0000000e+00 );
+        } else {
+            system.addParticle(   1.0080000e+00 );
+            vdwForce->addParticle( groupStart, classIndex,   1.3500000e-01,   8.3680000e-02,   9.1000000e-01 );
+        }
+    }
+
+    // exclusions: each particle lists itself first, then the remaining members of its
+    // group in increasing index order
+    for( int particle = 0; particle < numberOfParticles; particle++ ){
+        const int groupStart = (particle/particlesPerGroup)*particlesPerGroup;
+        std::vector< int > exclusions;
+        exclusions.push_back( particle );
+        for( int other = groupStart; other < groupStart + particlesPerGroup; other++ ){
+            if( other != particle ){
+                exclusions.push_back( other );
+            }
+        }
+        vdwForce->setParticleExclusions( particle, exclusions );
+    }
+    system.addForce( vdwForce );
+
+    // particle coordinates
+    std::vector<Vec3> positions;
+    positions.reserve( numberOfParticles );
+    positions.push_back( Vec3(   1.5927280e-01,   1.7000000e-06,    1.6491000e-03 ) );
+    positions.push_back( Vec3(   2.0805540e-01,  -8.1258800e-02,    3.7282500e-02 ) );
+    positions.push_back( Vec3(   2.0843610e-01,   8.0953200e-02,    3.7462200e-02 ) );
+    positions.push_back( Vec3(   1.7280780e-01,   2.0730000e-04,   -9.8741700e-02 ) );
+    positions.push_back( Vec3(  -1.6743680e-01,   1.5900000e-05,   -6.6149000e-03 ) );
+    positions.push_back( Vec3(  -2.0428260e-01,   8.1071500e-02,    4.1343900e-02 ) );
+    positions.push_back( Vec3(  -6.7308300e-02,   1.2800000e-05,    1.0623300e-02 ) );
+    positions.push_back( Vec3(  -2.0426290e-01,  -8.1231400e-02,    4.1033500e-02 ) );
+
+    // evaluate on the CUDA platform and hand the results back through the output arguments
+    LangevinIntegrator integrator(0.0, 0.1, 0.01);
+    Context context(system, integrator, Platform::getPlatformByName( "CUDA" ) );
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    forces      = state.getForces();
+    energy      = state.getPotentialEnergy();
+}
+/**
+ * Compare computed forces and energy with reference values using ASSERT_EQUAL_VEC_MOD /
+ * ASSERT_EQUAL_TOL_MOD; a mismatch raises an exception whose message starts with testName.
+ *
+ * @param testName        label included in failure messages (and in the debug log)
+ * @param expectedEnergy  reference energy
+ * @param energy          computed energy
+ * @param expectedForces  reference forces, one entry per particle
+ * @param forces          computed forces; must have the same length as expectedForces
+ * @param tolerance       tolerance passed to the assertion macros
+ * @param log             optional log file; only used when AMOEBA_DEBUG is defined
+ */
+void compareForcesEnergy( std::string& testName, double expectedEnergy, double energy,
+                          std::vector<Vec3>& expectedForces,
+                          std::vector<Vec3>& forces, double tolerance, FILE* log ) {
+    // the loops below index both vectors with the same counter, so a length mismatch must
+    // be reported as a failure before any element is read
+    ASSERT_EQUAL_TOL_MOD( (double) expectedForces.size(), (double) forces.size(), 0.0, testName );
+#ifdef AMOEBA_DEBUG
+    if( log ){
+        // print the computed energy passed in by the caller (there is no State object in this function)
+        (void) fprintf( log, "%s: expected energy=%14.7e %14.7e\n", testName.c_str(), expectedEnergy, energy );
+        for( unsigned int ii = 0; ii < forces.size(); ii++ ){
+            (void) fprintf( log, "%6u [%14.7e %14.7e %14.7e]   [%14.7e %14.7e %14.7e]\n", ii,
+                            expectedForces[ii][0], expectedForces[ii][1], expectedForces[ii][2], forces[ii][0], forces[ii][1], forces[ii][2] );
+        }
+        (void) fflush( log );
+    }
+#endif
+    for( unsigned int ii = 0; ii < forces.size(); ii++ ){
+        ASSERT_EQUAL_VEC_MOD( expectedForces[ii], forces[ii], tolerance, testName );
+    }
+    ASSERT_EQUAL_TOL_MOD( expectedEnergy, energy, tolerance, testName );
+}
+// test VDW w/ sigmaRule=CubicMean and epsilonRule=HHG
+// (no periodic box; cutoff large enough to be effectively disabled)
+void testVdwAmmoniaCubicMeanHhg( FILE* log ) {
+    std::string testName = "testVdwAmmoniaCubicMeanHhg";
+
+    // reference values
+    double expectedEnergy =  4.8012258e+00;
+    std::vector<Vec3> expectedForces;
+    expectedForces.reserve( 8 );
+    expectedForces.push_back( Vec3(   2.9265247e+02,  -1.4507808e-02,  -6.9562123e+00 ) );
+    expectedForces.push_back( Vec3(  -2.2451693e+00,   4.8143073e-01,  -2.0041494e-01 ) );
+    expectedForces.push_back( Vec3(  -2.2440698e+00,  -4.7905450e-01,  -2.0125284e-01 ) );
+    expectedForces.push_back( Vec3(  -1.0840394e+00,  -5.8531253e-04,   2.6934135e-01 ) );
+    expectedForces.push_back( Vec3(  -5.6305662e+01,   1.4733908e-03,  -1.8083306e-01 ) );
+    expectedForces.push_back( Vec3(   1.6750145e+00,  -3.2448374e-01,  -1.8030914e-01 ) );
+    expectedForces.push_back( Vec3(  -2.3412420e+02,   1.0754069e-02,   7.6287492e+00 ) );
+    expectedForces.push_back( Vec3(   1.6756544e+00,   3.2497316e-01,  -1.7906832e-01 ) );
+
+    // computed values: cutoff = 9000000.0, boxDimension = -1.0 (non-periodic)
+    std::vector<Vec3> forces;
+    double energy;
+    setupAndGetForcesEnergyVdwAmmonia( "CUBIC-MEAN", "HHG", 9000000.0, -1.0, forces, energy, log );
+
+    double tolerance = 1.0e-04;
+    compareForcesEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
+}
+// test VDW w/ sigmaRule=Arithmetic and epsilonRule=Arithmetic
+// (no periodic box; cutoff large enough to be effectively disabled)
+void testVdwAmmoniaArithmeticArithmetic( FILE* log ) {
+    std::string testName = "testVdwAmmoniaArithmeticArithmetic";
+
+    // reference values
+    double expectedEnergy =  4.2252403e+00;
+    std::vector<Vec3> expectedForces;
+    expectedForces.reserve( 8 );
+    expectedForces.push_back( Vec3(   3.0603839e+02,  -1.5550310e-02,  -7.2661707e+00 ) );
+    expectedForces.push_back( Vec3(  -2.7801357e+00,   5.8805051e-01,  -2.5907269e-01 ) );
+    expectedForces.push_back( Vec3(  -2.7753968e+00,  -5.8440732e-01,  -2.5969111e-01 ) );
+    expectedForces.push_back( Vec3(  -2.2496416e+00,  -1.1797440e-03,   5.5501757e-01 ) );
+    expectedForces.push_back( Vec3(  -5.5077629e+01,   8.3417114e-04,  -3.3668921e-01 ) );
+    expectedForces.push_back( Vec3(   2.3752452e+00,  -4.6788669e-01,  -2.4907764e-01 ) );
+    expectedForces.push_back( Vec3(  -2.4790697e+02,   1.1419770e-02,   8.0629999e+00 ) );
+    expectedForces.push_back( Vec3(   2.3761408e+00,   4.6871961e-01,  -2.4731607e-01 ) );
+
+    // computed values: cutoff = 9000000.0, boxDimension = -1.0 (non-periodic)
+    std::vector<Vec3> forces;
+    double energy;
+    setupAndGetForcesEnergyVdwAmmonia( "ARITHMETIC", "ARITHMETIC", 9000000.0, -1.0, forces, energy, log );
+
+    double tolerance = 1.0e-04;
+    compareForcesEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
+}
+// test VDW w/ sigmaRule=Geometric and epsilonRule=Geometric
+// (no periodic box; cutoff large enough to be effectively disabled)
+void testVdwAmmoniaGeometricGeometric( FILE* log ) {
+    std::string testName = "testVdwAmmoniaGeometricGeometric";
+
+    // reference values
+    double expectedEnergy =  2.5249914e+00;
+    std::vector<Vec3> expectedForces;
+    expectedForces.reserve( 8 );
+    expectedForces.push_back( Vec3(   2.1169631e+02,  -1.0710925e-02,  -4.3728025e+00 ) );
+    expectedForces.push_back( Vec3(  -2.2585621e+00,   4.8409995e-01,  -2.0188344e-01 ) );
+    expectedForces.push_back( Vec3(  -2.2551351e+00,  -4.8124855e-01,  -2.0246986e-01 ) );
+    expectedForces.push_back( Vec3(  -1.7178028e+00,  -9.0851787e-04,   4.2466975e-01 ) );
+    expectedForces.push_back( Vec3(  -4.8302147e+01,   9.6603376e-04,  -5.7972068e-01 ) );
+    expectedForces.push_back( Vec3(   1.8100634e+00,  -3.5214093e-01,  -1.9357207e-01 ) );
+    expectedForces.push_back( Vec3(  -1.6078365e+02,   7.2117601e-03,   5.3180261e+00 ) );
+    expectedForces.push_back( Vec3(   1.8109211e+00,   3.5273117e-01,  -1.9224723e-01 ) );
+
+    // computed values: cutoff = 9000000.0, boxDimension = -1.0 (non-periodic)
+    std::vector<Vec3> forces;
+    double energy;
+    setupAndGetForcesEnergyVdwAmmonia( "GEOMETRIC", "GEOMETRIC", 9000000.0, -1.0, forces, energy, log );
+
+    double tolerance = 1.0e-04;
+    compareForcesEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
+}
+// test VDW w/ sigmaRule=CubicMean and epsilonRule=Harmonic
+// (no periodic box; cutoff large enough to be effectively disabled)
+void testVdwAmmoniaCubicMeanHarmonic( FILE* log ) {
+    std::string testName = "testVdwAmmoniaCubicMeanHarmonic";
+
+    // reference values
+    double expectedEnergy =  4.1369069e+00;
+    std::vector<Vec3> expectedForces;
+    expectedForces.reserve( 8 );
+    expectedForces.push_back( Vec3(   2.5854436e+02,  -1.2779529e-02,  -5.9041148e+00 ) );
+    expectedForces.push_back( Vec3(  -2.0832419e+00,   4.4915831e-01,  -1.8266000e-01 ) );
+    expectedForces.push_back( Vec3(  -2.0823991e+00,  -4.4699804e-01,  -1.8347141e-01 ) );
+    expectedForces.push_back( Vec3(  -9.5914714e-01,  -5.2162026e-04,   2.3873165e-01 ) );
+    expectedForces.push_back( Vec3(  -5.3724787e+01,   1.4838241e-03,  -2.8089191e-01 ) );
+    expectedForces.push_back( Vec3(   1.5074325e+00,  -2.9016397e-01,  -1.6385118e-01 ) );
+    expectedForces.push_back( Vec3(  -2.0271029e+02,   9.2367947e-03,   6.6389988e+00 ) );
+    expectedForces.push_back( Vec3(   1.5080748e+00,   2.9058422e-01,  -1.6274118e-01 ) );
+
+    // computed values: cutoff = 9000000.0, boxDimension = -1.0 (non-periodic)
+    std::vector<Vec3> forces;
+    double energy;
+    setupAndGetForcesEnergyVdwAmmonia( "CUBIC-MEAN", "HARMONIC", 9000000.0, -1.0, forces, energy, log );
+
+    double tolerance = 1.0e-04;
+    compareForcesEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
+}
+// test w/ cutoff=0.25 nm: only one interaction remains, between particles 0 and 6;
+// the force on particle 4 is nonzero because of the reduction applied to NH.
+// Particles 0 and 6 are ~0.235 apart, so the interaction falls in the tapered region.
+void testVdwTaper( FILE* log ) {
+    std::string testName = "testVdwTaper";
+
+    // reference values
+    double expectedEnergy =  3.5478444e+00;
+    std::vector<Vec3> expectedForces;
+    expectedForces.reserve( 8 );
+    expectedForces.push_back( Vec3(   5.6710779e+02,  -2.7391004e-02,  -1.7867730e+01 ) );
+    expectedForces.push_back( Vec3(  -0.0000000e+00,  -0.0000000e+00,  -0.0000000e+00 ) );
+    expectedForces.push_back( Vec3(  -0.0000000e+00,  -0.0000000e+00,  -0.0000000e+00 ) );
+    expectedForces.push_back( Vec3(  -0.0000000e+00,  -0.0000000e+00,  -0.0000000e+00 ) );
+    expectedForces.push_back( Vec3(  -5.1039701e+01,   2.4651903e-03,   1.6080957e+00 ) );
+    expectedForces.push_back( Vec3(  -0.0000000e+00,  -0.0000000e+00,  -0.0000000e+00 ) );
+    expectedForces.push_back( Vec3(  -5.1606809e+02,   2.4925813e-02,   1.6259634e+01 ) );
+    expectedForces.push_back( Vec3(  -0.0000000e+00,  -0.0000000e+00,  -0.0000000e+00 ) );
+
+    // computed values: cutoff = 0.25, boxDimension = -1.0 (non-periodic)
+    std::vector<Vec3> forces;
+    double energy;
+    setupAndGetForcesEnergyVdwAmmonia( "CUBIC-MEAN", "HHG", 0.25, -1.0, forces, energy, log );
+
+    double tolerance = 1.0e-04;
+    compareForcesEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
+}
+// test PBC: same system and cutoff as the taper test, but inside a cubic periodic box
+void testVdwPBC( FILE* log ) {
+    std::string testName = "testVdwPBC";
+
+    // reference values
+    double expectedEnergy =  8.4385405e+00;
+    std::vector<Vec3> expectedForces;
+    expectedForces.reserve( 8 );
+    expectedForces.push_back( Vec3(   5.1453069e+02,   4.9751912e-01,  -1.2759570e+01 ) );
+    expectedForces.push_back( Vec3(  -2.5622586e+02,  -4.6524265e+01,   2.4281465e+01 ) );
+    expectedForces.push_back( Vec3(  -2.7538705e+02,   5.1831690e+01,   2.7367710e+01 ) );
+    expectedForces.push_back( Vec3(  -0.0000000e+00,  -0.0000000e+00,  -0.0000000e+00 ) );
+    expectedForces.push_back( Vec3(   3.0883034e+02,  -5.8876974e+00,  -5.8286122e+01 ) );
+    expectedForces.push_back( Vec3(   1.1319359e+02,  -3.2047069e-01,   1.6181231e+00 ) );
+    expectedForces.push_back( Vec3(  -5.1606809e+02,   2.4925813e-02,   1.6259634e+01 ) );
+    expectedForces.push_back( Vec3(   1.1112638e+02,   3.7829857e-01,   1.5187587e+00 ) );
+
+    // computed values: cutoff = 0.25, boxDimension = 0.6 (periodic)
+    std::vector<Vec3> forces;
+    double energy;
+    setupAndGetForcesEnergyVdwAmmonia( "CUBIC-MEAN", "HHG", 0.25, 0.6, forces, energy, log );
+
+    // the tolerance is looser than in the other tests because the tapering coefficients
+    // are set by interpolation; with tapering turned off the absolute difference is < 2.0e-05
+    double tolerance = 5.0e-04;
+    compareForcesEnergy( testName, expectedEnergy, energy, expectedForces, forces, tolerance, log );
+}
 int main( int numberOfArguments, char* argv[] ) {
@@ -178,7 +520,37 @@ int main( int numberOfArguments, char* argv[] ) {
        registerAmoebaCudaKernelFactories();
        FILE* log = NULL;
        testVdw( log );
+        // tests using two ammonia molecules
+        // test VDW w/ sigmaRule=CubicMean and epsilonRule=HHG
+        testVdwAmmoniaCubicMeanHhg( log );
+        // test VDW w/ sigmaRule=Arithmetic and epsilonRule=Arithmetic
+        testVdwAmmoniaArithmeticArithmetic( log );
+        // test VDW w/ sigmaRule=Geometric and epsilonRule=Geometric
+        testVdwAmmoniaGeometricGeometric( log );
+        // test VDW w/ sigmaRule=CubicMean and epsilonRule=Harmonic
+        testVdwAmmoniaCubicMeanHarmonic( log );
+        // test w/ cutoff=0.25 nm; single ixn between two particles (0 and 6); force nonzero on
+        // particle 4 due to reduction applied to NH
+        // the distance between 0 and 6 is ~ 0.235 so the ixn is in the tapered region
+        testVdwTaper( log );
+        // test PBC
+        testVdwPBC( log );
    } catch(const std::exception& e) {
        std::cout << "exception: " << e.what() << std::endl;
        std::cout << "FAIL - ERROR.  Test failed." << std::endl;