Continuing CUDA implementation of extrapolated polarization

c2ea6c28 · Peter Eastman · a4e2d9a6 · c2ea6c28 · c2ea6c28 · c2ea6c28
Commit c2ea6c28 authored Jan 07, 2016 by Peter Eastman
7 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -1019,10 +1019,10 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        int numOrders = force.getExtrapolationCoefficients().size();
        extrapolatedDipole = new CudaArray(cu, 3*numMultipoles*numOrders, elementSize, "extrapolatedDipole");
        extrapolatedDipolePolar = new CudaArray(cu, 3*numMultipoles*numOrders, elementSize, "extrapolatedDipolePolar");
-        inducedDipoleFieldGradient = new CudaArray(cu, 6*numMultipoles, elementSize, "inducedDipoleFieldGradient");
+        inducedDipoleFieldGradient = new CudaArray(cu, 6*paddedNumAtoms, sizeof(long long), "inducedDipoleFieldGradient");
-        inducedDipoleFieldGradientPolar = new CudaArray(cu, 6*numMultipoles, elementSize, "inducedDipoleFieldGradientPolar");
+        inducedDipoleFieldGradientPolar = new CudaArray(cu, 6*paddedNumAtoms, sizeof(long long), "inducedDipoleFieldGradientPolar");
-        extrapolatedDipoleFieldGradient = new CudaArray(cu, 6*numMultipoles*numOrders, elementSize, "extrapolatedDipoleFieldGradient");
+        extrapolatedDipoleFieldGradient = new CudaArray(cu, 6*numMultipoles*(numOrders-1), elementSize, "extrapolatedDipoleFieldGradient");
-        extrapolatedDipoleFieldGradientPolar = new CudaArray(cu, 6*numMultipoles*numOrders, elementSize, "extrapolatedDipoleFieldGradientPolar");
+        extrapolatedDipoleFieldGradientPolar = new CudaArray(cu, 6*numMultipoles*(numOrders-1), elementSize, "extrapolatedDipoleFieldGradientPolar");
    }
    cu.addAutoclearBuffer(*field);
    cu.addAutoclearBuffer(*fieldPolar);
@@ -1109,6 +1109,10 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    defines["ENERGY_SCALE_FACTOR"] = cu.doubleToString(138.9354558456/innerDielectric);
    if (polarizationType == AmoebaMultipoleForce::Direct)
        defines["DIRECT_POLARIZATION"] = "";
+    else if (polarizationType == AmoebaMultipoleForce::Mutual)
+        defines["MUTUAL_POLARIZATION"] = "";
+    else if (polarizationType == AmoebaMultipoleForce::Extrapolated)
+        defines["EXTRAPOLATED_POLARIZATION"] = "";
    if (useShuffle)
        defines["USE_SHUFFLE"] = "";
    if (hasQuadrupoles)
@@ -1129,7 +1133,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
            coefficients << ",";
        double sum = 0;
        for (int j = i; j < maxExtrapolationOrder; j++)
-            sum = force.getExtrapolationCoefficients()[j];
+            sum += force.getExtrapolationCoefficients()[j];
        coefficients << cu.doubleToString(sum);
    }
    defines["EXTRAPOLATION_COEFFICIENTS_SUM"] = coefficients.str();
@@ -1176,8 +1180,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
            extrapolatedDipoleGkPolar = new CudaArray(cu, 3*numMultipoles*numOrders, elementSize, "extrapolatedDipoleGkPolar");
            inducedDipoleFieldGradientGk = new CudaArray(cu, 6*numMultipoles, elementSize, "inducedDipoleFieldGradientGk");
            inducedDipoleFieldGradientGkPolar = new CudaArray(cu, 6*numMultipoles, elementSize, "inducedDipoleFieldGradientGkPolar");
-            extrapolatedDipoleFieldGradientGk = new CudaArray(cu, 6*numMultipoles*numOrders, elementSize, "extrapolatedDipoleFieldGradientGk");
+            extrapolatedDipoleFieldGradientGk = new CudaArray(cu, 6*numMultipoles*(numOrders-1), elementSize, "extrapolatedDipoleFieldGradientGk");
-            extrapolatedDipoleFieldGradientGkPolar = new CudaArray(cu, 6*numMultipoles*numOrders, elementSize, "extrapolatedDipoleFieldGradientGkPolar");
+            extrapolatedDipoleFieldGradientGkPolar = new CudaArray(cu, 6*numMultipoles*(numOrders-1), elementSize, "extrapolatedDipoleFieldGradientGkPolar");
        }
    }
    int maxThreads = cu.getNonbondedUtilities().getForceThreadBlockSize();
@@ -1203,6 +1207,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        initExtrapolatedKernel = cu.getKernel(module, "initExtrapolatedDipoles");
        iterateExtrapolatedKernel = cu.getKernel(module, "iterateExtrapolatedDipoles");
        computeExtrapolatedKernel = cu.getKernel(module, "computeExtrapolatedDipoles");
+        addExtrapolatedGradientKernel = cu.getKernel(module, "addExtrapolatedFieldGradientToForce");
    }
    stringstream electrostaticsSource;
    electrostaticsSource << CudaKernelSources::vectorOps;
@@ -1660,6 +1665,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        cu.executeKernel(pmeInducedForceKernel, pmeInducedForceArgs, cu.getNumAtoms());
    }
+    // If using extrapolated polarization, add in force contributions from µ(m) T µ(n).
+    if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
+        void* extrapolatedArgs[] = {&cu.getForce().getDevicePointer(), &extrapolatedDipole->getDevicePointer(),
+            &extrapolatedDipolePolar->getDevicePointer(), &extrapolatedDipoleFieldGradient->getDevicePointer(), &extrapolatedDipoleFieldGradientPolar->getDevicePointer()};
+        cu.executeKernel(addExtrapolatedGradientKernel, extrapolatedArgs, numMultipoles);
+    }
    // Map torques to force.
    void* mapTorqueArgs[] = {&cu.getForce().getDevicePointer(), &torque->getDevicePointer(),
@@ -1678,37 +1691,63 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
    int startTileIndex = nb.getStartTileIndex();
    int numTileIndices = nb.getNumTiles();
    int numForceThreadBlocks = nb.getNumForceThreadBlocks();
-    if (pmeGrid == NULL) {
+    unsigned int maxTiles = 0;
+    vector<void*> computeInducedFieldArgs;
+    computeInducedFieldArgs.push_back(&inducedField->getDevicePointer());
+    computeInducedFieldArgs.push_back(&inducedFieldPolar->getDevicePointer());
+    computeInducedFieldArgs.push_back(&cu.getPosq().getDevicePointer());
+    computeInducedFieldArgs.push_back(&nb.getExclusionTiles().getDevicePointer());
+    computeInducedFieldArgs.push_back(&inducedDipole->getDevicePointer());
+    computeInducedFieldArgs.push_back(&inducedDipolePolar->getDevicePointer());
+    computeInducedFieldArgs.push_back(&startTileIndex);
+    computeInducedFieldArgs.push_back(&numTileIndices);
+    if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
+        computeInducedFieldArgs.push_back(&inducedDipoleFieldGradient->getDevicePointer());
+        computeInducedFieldArgs.push_back(&inducedDipoleFieldGradientPolar->getDevicePointer());
+    }
+    if (pmeGrid != NULL) {
+        computeInducedFieldArgs.push_back(&nb.getInteractingTiles().getDevicePointer());
+        computeInducedFieldArgs.push_back(&nb.getInteractionCount().getDevicePointer());
+        computeInducedFieldArgs.push_back(cu.getPeriodicBoxSizePointer());
+        computeInducedFieldArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+        computeInducedFieldArgs.push_back(cu.getPeriodicBoxVecXPointer());
+        computeInducedFieldArgs.push_back(cu.getPeriodicBoxVecYPointer());
+        computeInducedFieldArgs.push_back(cu.getPeriodicBoxVecZPointer());
+        computeInducedFieldArgs.push_back(&maxTiles);
+        computeInducedFieldArgs.push_back(&nb.getBlockCenters().getDevicePointer());
+        computeInducedFieldArgs.push_back(&nb.getInteractingAtoms().getDevicePointer());
+    }
+    if (gkKernel != NULL) {
+        computeInducedFieldArgs.push_back(&gkKernel->getInducedField()->getDevicePointer());
+        computeInducedFieldArgs.push_back(&gkKernel->getInducedFieldPolar()->getDevicePointer());
+        computeInducedFieldArgs.push_back(&gkKernel->getInducedDipoles()->getDevicePointer());
+        computeInducedFieldArgs.push_back(&gkKernel->getInducedDipolesPolar()->getDevicePointer());
+        computeInducedFieldArgs.push_back(&gkKernel->getBornRadii()->getDevicePointer());
+        if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
+            computeInducedFieldArgs.push_back(&inducedDipoleFieldGradientGk->getDevicePointer());
+            computeInducedFieldArgs.push_back(&inducedDipoleFieldGradientGkPolar->getDevicePointer());
+        }
+    }
+    computeInducedFieldArgs.push_back(&dampingAndThole->getDevicePointer());
    cu.clearBuffer(*inducedField);
    cu.clearBuffer(*inducedFieldPolar);
-        if (gkKernel == NULL) {
+    if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
-            void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
+        cu.clearBuffer(*inducedDipoleFieldGradient);
-                &nb.getExclusionTiles().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
+        cu.clearBuffer(*inducedDipoleFieldGradientPolar);
-                &dampingAndThole->getDevicePointer()};
-            cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
    }
-        else {
+    if (gkKernel != NULL) {
        cu.clearBuffer(*gkKernel->getInducedField());
        cu.clearBuffer(*gkKernel->getInducedFieldPolar());
-            void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
+        if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
-                &nb.getExclusionTiles().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
+            cu.clearBuffer(*inducedDipoleFieldGradientGk);
-                &gkKernel->getInducedField()->getDevicePointer(), &gkKernel->getInducedFieldPolar()->getDevicePointer(),
+            cu.clearBuffer(*inducedDipoleFieldGradientGkPolar);
-                &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
-                &gkKernel->getBornRadii()->getDevicePointer(), &dampingAndThole->getDevicePointer()};
-            cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
        }
    }
+    if (pmeGrid == NULL)
+        cu.executeKernel(computeInducedFieldKernel, &computeInducedFieldArgs[0], numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
    else {
-        cu.clearBuffer(*inducedField);
+        maxTiles = nb.getInteractingTiles().getSize();
-        cu.clearBuffer(*inducedFieldPolar);
+        cu.executeKernel(computeInducedFieldKernel, &computeInducedFieldArgs[0], numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
-        unsigned int maxTiles = nb.getInteractingTiles().getSize();
-        void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
-            &nb.getExclusionTiles().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
-            &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(),
-            cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
-            &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
-            &dampingAndThole->getDevicePointer()};
-        cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
        cu.clearBuffer(*pmeGrid);
        void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
            &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
@@ -1859,17 +1898,6 @@ void CudaCalcAmoebaMultipoleForceKernel::computeExtrapolatedDipoles(void** recip
        cu.executeKernel(iterateExtrapolatedKernel, iterateArgs, extrapolatedDipole->getSize());
    }
-    cout << "CUDA"<< endl;
-    vector<float> d;
-    extrapolatedDipole->download(d);
-    for (int i = 0; i < maxExtrapolationOrder; i++) {
-        cout << "order "<<i<< endl;
-        for (int j = 0; j < numMultipoles; j++) {
-            int k = 3*(j+i*numMultipoles);
-            cout << d[k]<<" "<<d[k+1]<<" "<<d[k+2]<< endl;
-        }
-    }
    // Take a linear combination of the µ_(n) components to form the total dipole
    void* computeArgs[] = {&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &extrapolatedDipole->getDevicePointer(),

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -459,7 +459,7 @@ private:
    CUfunction pmeGridIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
    CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
    CUfunction recordDIISDipolesKernel, buildMatrixKernel;
-    CUfunction initExtrapolatedKernel, iterateExtrapolatedKernel, computeExtrapolatedKernel;
+    CUfunction initExtrapolatedKernel, iterateExtrapolatedKernel, computeExtrapolatedKernel, addExtrapolatedGradientKernel;
    CUfunction pmeTransformMultipolesKernel, pmeTransformPotentialKernel;
    CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel;
    static const int PmeOrder = 5;

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
@@ -3,9 +3,15 @@
 typedef struct {
    real3 pos;
    real3 field, fieldPolar, inducedDipole, inducedDipolePolar;
+#ifdef EXTRAPOLATED_POLARIZATION
+    real fieldGradient[6], fieldGradientPolar[6];  // per-atom field-gradient accumulators; computeOneInteraction fills them in the order xx, yy, zz, xy, xz, yz
+#endif
 #ifdef USE_GK
    real3 fieldS, fieldPolarS, inducedDipoleS, inducedDipolePolarS;
    real bornRadius;
+    #ifdef EXTRAPOLATED_POLARIZATION
+        real fieldGradientS[6], fieldGradientPolarS[6];  // gradient accumulators paired with fieldS/fieldPolarS; zeroed in zeroAtomData and written out by saveAtomData
+    #endif
 #endif
    float thole, damp;
 } AtomData;
@@ -47,6 +53,67 @@ inline __device__ void zeroAtomData(AtomData& data) {
    data.fieldS = make_real3(0);
    data.fieldPolarS = make_real3(0);
 #endif
+#ifdef EXTRAPOLATED_POLARIZATION
+    for (int i = 0; i < 6; i++) {
+        data.fieldGradient[i] = 0;
+        data.fieldGradientPolar[i] = 0;
+#ifdef USE_GK
+        data.fieldGradientS[i] = 0;
+        data.fieldGradientPolarS[i] = 0;
+#endif
+    }
+#endif
+}
+#ifdef EXTRAPOLATED_POLARIZATION  // SAVE_ATOM_DATA forwards to saveAtomData the output buffers that exist for the active EXTRAPOLATED_POLARIZATION / USE_GK combination
+    #ifdef USE_GK
+        #define SAVE_ATOM_DATA(index, data) saveAtomData(index, data, field, fieldPolar, fieldGradient, fieldGradientPolar, fieldS, fieldPolarS, fieldGradientS, fieldGradientPolarS);
+    #else
+        #define SAVE_ATOM_DATA(index, data) saveAtomData(index, data, field, fieldPolar, fieldGradient, fieldGradientPolar);
+    #endif
+#else  // without EXTRAPOLATED_POLARIZATION the gradient buffers are not passed
+    #ifdef USE_GK
+        #define SAVE_ATOM_DATA(index, data) saveAtomData(index, data, field, fieldPolar, fieldS, fieldPolarS);
+    #else
+        #define SAVE_ATOM_DATA(index, data) saveAtomData(index, data, field, fieldPolar);
+    #endif
+#endif  // buffer names are taken from the scope of the call site; each expansion already ends in ';', so callers write SAVE_ATOM_DATA(...) without one
+inline __device__ void saveAtomData(int index, AtomData& data, unsigned long long* __restrict__ field, unsigned long long* __restrict__ fieldPolar  // atomically adds the values accumulated in `data` into the output buffers at atom `index`
+#ifdef EXTRAPOLATED_POLARIZATION
+        , unsigned long long* __restrict__ fieldGradient, unsigned long long* __restrict__ fieldGradientPolar
+#endif
+#ifdef USE_GK
+        , unsigned long long* __restrict__ fieldS, unsigned long long* __restrict__ fieldPolarS
+    #ifdef EXTRAPOLATED_POLARIZATION
+        , unsigned long long* __restrict__ fieldGradientS, unsigned long long* __restrict__ fieldGradientPolarS
+    #endif
+#endif
+        ) {
+    atomicAdd(&field[index], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));  // value is scaled by 2^32 and converted to a 64-bit integer before the atomic add
+    atomicAdd(&field[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));  // y and z components sit PADDED_NUM_ATOMS and 2*PADDED_NUM_ATOMS entries after x
+    atomicAdd(&field[index+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
+    atomicAdd(&fieldPolar[index], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0x100000000)));
+    atomicAdd(&fieldPolar[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0x100000000)));
+    atomicAdd(&fieldPolar[index+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0x100000000)));
+#ifdef USE_GK  // fieldS/fieldPolarS use the same indexing as field/fieldPolar
+    atomicAdd(&fieldS[index], static_cast<unsigned long long>((long long) (data.fieldS.x*0x100000000)));
+    atomicAdd(&fieldS[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.y*0x100000000)));
+    atomicAdd(&fieldS[index+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.z*0x100000000)));
+    atomicAdd(&fieldPolarS[index], static_cast<unsigned long long>((long long) (data.fieldPolarS.x*0x100000000)));
+    atomicAdd(&fieldPolarS[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.y*0x100000000)));
+    atomicAdd(&fieldPolarS[index+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.z*0x100000000)));
+#endif
+#ifdef EXTRAPOLATED_POLARIZATION
+    for (int i = 0; i < 6; i++) {  // gradient buffers hold 6 consecutive entries per atom (6*index+i), unlike the PADDED_NUM_ATOMS-strided field buffers
+        atomicAdd(&fieldGradient[6*index+i], static_cast<unsigned long long>((long long) (data.fieldGradient[i]*0x100000000)));
+        atomicAdd(&fieldGradientPolar[6*index+i], static_cast<unsigned long long>((long long) (data.fieldGradientPolar[i]*0x100000000)));
+#ifdef USE_GK
+        atomicAdd(&fieldGradientS[6*index+i], static_cast<unsigned long long>((long long) (data.fieldGradientS[i]*0x100000000)));  // NOTE(review): host code allocates inducedDipoleFieldGradientGk/GkPolar as 6*numMultipoles entries of elementSize, not 6*paddedNumAtoms of sizeof(long long) - confirm the buffer is large enough for this 64-bit indexing
+        atomicAdd(&fieldGradientPolarS[6*index+i], static_cast<unsigned long long>((long long) (data.fieldGradientPolarS[i]*0x100000000)));
+#endif
+    }
+#endif
 }
 #ifdef USE_EWALD
@@ -182,6 +249,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
    real r2I = rI*rI;
    real rr3 = -rI*r2I;
    real rr5 = -3*rr3*r2I;
+    real rr7 = 5*rr5*r2I;
    real dampProd = atom1.damp*atom2.damp;
    real ratio = (dampProd != 0 ? r/dampProd : 1);
    float pGamma = (atom2.thole > atom1.thole ? atom1.thole: atom2.thole);
@@ -189,6 +257,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
    real dampExp = (dampProd != 0 ? EXP(-damp) : 0); 
    rr3 *= 1 - dampExp;
    rr5 *= 1 - (1+damp)*dampExp;
+    rr7 *= 1 - (1+damp+(0.6f*damp*damp))*dampExp;
    real dDotDelta = rr5*dot(deltaR, atom2.inducedDipole);
    atom1.field += rr3*atom2.inducedDipole + dDotDelta*deltaR;
    dDotDelta = rr5*dot(deltaR, atom2.inducedDipolePolar);
@@ -197,6 +266,45 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
    atom2.field += rr3*atom1.inducedDipole + dDotDelta*deltaR;
    dDotDelta = rr5*dot(deltaR, atom1.inducedDipolePolar);
    atom2.fieldPolar += rr3*atom1.inducedDipolePolar + dDotDelta*deltaR;
+#ifdef EXTRAPOLATED_POLARIZATION
+    // Compute and store the field gradients for later use.
+    real3 dipole = atom1.inducedDipole;
+    real muDotR = dipole.x*deltaR.x + dipole.y*deltaR.y + dipole.z*deltaR.z;
+    atom2.fieldGradient[0] -= muDotR*deltaR.x*deltaR.x*rr7 - (2*dipole.x*deltaR.x + muDotR)*rr5;
+    atom2.fieldGradient[1] -= muDotR*deltaR.y*deltaR.y*rr7 - (2*dipole.y*deltaR.y + muDotR)*rr5;
+    atom2.fieldGradient[2] -= muDotR*deltaR.z*deltaR.z*rr7 - (2*dipole.z*deltaR.z + muDotR)*rr5;
+    atom2.fieldGradient[3] -= muDotR*deltaR.x*deltaR.y*rr7 - (dipole.x*deltaR.y + dipole.y*deltaR.x)*rr5;
+    atom2.fieldGradient[4] -= muDotR*deltaR.x*deltaR.z*rr7 - (dipole.x*deltaR.z + dipole.z*deltaR.x)*rr5;
+    atom2.fieldGradient[5] -= muDotR*deltaR.y*deltaR.z*rr7 - (dipole.y*deltaR.z + dipole.z*deltaR.y)*rr5;
+    dipole = atom1.inducedDipolePolar;
+    muDotR = dipole.x*deltaR.x + dipole.y*deltaR.y + dipole.z*deltaR.z;
+    atom2.fieldGradientPolar[0] -= muDotR*deltaR.x*deltaR.x*rr7 - (2*dipole.x*deltaR.x + muDotR)*rr5;
+    atom2.fieldGradientPolar[1] -= muDotR*deltaR.y*deltaR.y*rr7 - (2*dipole.y*deltaR.y + muDotR)*rr5;
+    atom2.fieldGradientPolar[2] -= muDotR*deltaR.z*deltaR.z*rr7 - (2*dipole.z*deltaR.z + muDotR)*rr5;
+    atom2.fieldGradientPolar[3] -= muDotR*deltaR.x*deltaR.y*rr7 - (dipole.x*deltaR.y + dipole.y*deltaR.x)*rr5;
+    atom2.fieldGradientPolar[4] -= muDotR*deltaR.x*deltaR.z*rr7 - (dipole.x*deltaR.z + dipole.z*deltaR.x)*rr5;
+    atom2.fieldGradientPolar[5] -= muDotR*deltaR.y*deltaR.z*rr7 - (dipole.y*deltaR.z + dipole.z*deltaR.y)*rr5;
+    dipole = atom2.inducedDipole;
+    muDotR = dipole.x*deltaR.x + dipole.y*deltaR.y + dipole.z*deltaR.z;
+    atom1.fieldGradient[0] += muDotR*deltaR.x*deltaR.x*rr7 - (2*dipole.x*deltaR.x + muDotR)*rr5;
+    atom1.fieldGradient[1] += muDotR*deltaR.y*deltaR.y*rr7 - (2*dipole.y*deltaR.y + muDotR)*rr5;
+    atom1.fieldGradient[2] += muDotR*deltaR.z*deltaR.z*rr7 - (2*dipole.z*deltaR.z + muDotR)*rr5;
+    atom1.fieldGradient[3] += muDotR*deltaR.x*deltaR.y*rr7 - (dipole.x*deltaR.y + dipole.y*deltaR.x)*rr5;
+    atom1.fieldGradient[4] += muDotR*deltaR.x*deltaR.z*rr7 - (dipole.x*deltaR.z + dipole.z*deltaR.x)*rr5;
+    atom1.fieldGradient[5] += muDotR*deltaR.y*deltaR.z*rr7 - (dipole.y*deltaR.z + dipole.z*deltaR.y)*rr5;
+    dipole = atom2.inducedDipolePolar;
+    muDotR = dipole.x*deltaR.x + dipole.y*deltaR.y + dipole.z*deltaR.z;
+    atom1.fieldGradientPolar[0] += muDotR*deltaR.x*deltaR.x*rr7 - (2*dipole.x*deltaR.x + muDotR)*rr5;
+    atom1.fieldGradientPolar[1] += muDotR*deltaR.y*deltaR.y*rr7 - (2*dipole.y*deltaR.y + muDotR)*rr5;
+    atom1.fieldGradientPolar[2] += muDotR*deltaR.z*deltaR.z*rr7 - (2*dipole.z*deltaR.z + muDotR)*rr5;
+    atom1.fieldGradientPolar[3] += muDotR*deltaR.x*deltaR.y*rr7 - (dipole.x*deltaR.y + dipole.y*deltaR.x)*rr5;
+    atom1.fieldGradientPolar[4] += muDotR*deltaR.x*deltaR.z*rr7 - (dipole.x*deltaR.z + dipole.z*deltaR.x)*rr5;
+    atom1.fieldGradientPolar[5] += muDotR*deltaR.y*deltaR.z*rr7 - (dipole.y*deltaR.z + dipole.z*deltaR.y)*rr5;
+#endif
 }
 #endif
@@ -206,12 +314,18 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
 extern "C" __global__ void computeInducedField(
        unsigned long long* __restrict__ field, unsigned long long* __restrict__ fieldPolar, const real4* __restrict__ posq, const ushort2* __restrict__ exclusionTiles, 
        const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, unsigned int startTileIndex, unsigned int numTileIndices,
+#ifdef EXTRAPOLATED_POLARIZATION
+        unsigned long long* __restrict__ fieldGradient, unsigned long long* __restrict__ fieldGradientPolar,
+#endif
 #ifdef USE_CUTOFF
        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
 #elif defined USE_GK
        unsigned long long* __restrict__ fieldS, unsigned long long* __restrict__ fieldPolarS, const real* __restrict__ inducedDipoleS,
        const real* __restrict__ inducedDipolePolarS, const real* __restrict__ bornRadii,
+    #ifdef EXTRAPOLATED_POLARIZATION
+        unsigned long long* __restrict__ fieldGradientS, unsigned long long* __restrict__ fieldGradientPolarS,
+    #endif
 #endif
        const float2* __restrict__ dampingAndThole) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
@@ -284,36 +398,10 @@ extern "C" __global__ void computeInducedField(
        // Write results.
        unsigned int offset = x*TILE_SIZE + tgx;
-        atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
+        SAVE_ATOM_DATA(offset, data)
-        atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
-        atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
-        atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0x100000000)));
-        atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0x100000000)));
-        atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0x100000000)));
-#ifdef USE_GK
-        atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (data.fieldS.x*0x100000000)));
-        atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.y*0x100000000)));
-        atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.z*0x100000000)));
-        atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (data.fieldPolarS.x*0x100000000)));
-        atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.y*0x100000000)));
-        atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.z*0x100000000)));
-#endif
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
-            atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
+            SAVE_ATOM_DATA(offset, localData[threadIdx.x])
-            atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
-            atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
-            atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0x100000000)));
-            atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0x100000000)));
-            atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0x100000000)));
-#ifdef USE_GK
-            atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.x*0x100000000)));
-            atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.y*0x100000000)));
-            atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.z*0x100000000)));
-            atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.x*0x100000000)));
-            atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.y*0x100000000)));
-            atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.z*0x100000000)));
-#endif
        }
    }
@@ -412,39 +500,13 @@ extern "C" __global__ void computeInducedField(
            // Write results.
            unsigned int offset = x*TILE_SIZE + tgx;
-            atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
+            SAVE_ATOM_DATA(offset, data)
-            atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
-            atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
-            atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0x100000000)));
-            atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0x100000000)));
-            atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0x100000000)));
-#ifdef USE_GK
-            atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (data.fieldS.x*0x100000000)));
-            atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.y*0x100000000)));
-            atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.z*0x100000000)));
-            atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (data.fieldPolarS.x*0x100000000)));
-            atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.y*0x100000000)));
-            atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.z*0x100000000)));
-#endif
 #ifdef USE_CUTOFF
            offset = atomIndices[threadIdx.x];
 #else
            offset = y*TILE_SIZE + tgx;
 #endif
-            atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
+            SAVE_ATOM_DATA(offset, localData[threadIdx.x])
-            atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
-            atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
-            atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0x100000000)));
-            atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0x100000000)));
-            atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0x100000000)));
-#ifdef USE_GK
-            atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.x*0x100000000)));
-            atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.y*0x100000000)));
-            atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.z*0x100000000)));
-            atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.x*0x100000000)));
-            atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.y*0x100000000)));
-            atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.z*0x100000000)));
-#endif
        }
        pos++;
    }
@@ -609,7 +671,7 @@ extern "C" __global__ void updateInducedFieldByDIIS(real* __restrict__ inducedDi
 }
 extern "C" __global__ void initExtrapolatedDipoles(real* __restrict__ inducedDipole, real* __restrict__ inducedDipolePolar, real* __restrict__ extrapolatedDipole,
-        real* __restrict__ extrapolatedDipolePolar, real* __restrict__ inducedDipoleFieldGradient, real* __restrict__ inducedDipoleFieldGradientPolar
+        real* __restrict__ extrapolatedDipolePolar, long long* __restrict__ inducedDipoleFieldGradient, long long* __restrict__ inducedDipoleFieldGradientPolar
 #ifdef USE_GK
        , real* __restrict__ inducedDipoleGk, real* __restrict__ inducedDipoleGkPolar, real* __restrict__ extrapolatedDipoleGk, real* __restrict__ extrapolatedDipoleGkPolar,
        real* __restrict__ inducedDipoleFieldGradientGk, real* __restrict__ inducedDipoleFieldGradientGkPolar
@@ -634,7 +696,7 @@ extern "C" __global__ void initExtrapolatedDipoles(real* __restrict__ inducedDip
 }
 extern "C" __global__ void iterateExtrapolatedDipoles(int order, real* __restrict__ inducedDipole, real* __restrict__ inducedDipolePolar, real* __restrict__ extrapolatedDipole,
-        real* __restrict__ extrapolatedDipolePolar, real* __restrict__ inducedDipoleFieldGradient, real* __restrict__ inducedDipoleFieldGradientPolar,
+        real* __restrict__ extrapolatedDipolePolar, long long* __restrict__ inducedDipoleFieldGradient, long long* __restrict__ inducedDipoleFieldGradientPolar,
        long long* __restrict__ inducedDipoleField, long long* __restrict__ inducedDipoleFieldPolar, real* __restrict__ extrapolatedDipoleFieldGradient, real* __restrict__ extrapolatedDipoleFieldGradientPolar,
 #ifdef USE_GK
        real* __restrict__ inducedDipoleGk, real* __restrict__ inducedDipoleGkPolar, real* __restrict__ extrapolatedDipoleGk, real* __restrict__ extrapolatedDipoleGkPolar,
@@ -650,7 +712,6 @@ extern "C" __global__ void iterateExtrapolatedDipoles(int order, real* __restric
        float polar = polarizability[atom];
        real value = inducedDipoleField[fieldIndex]*fieldScale*polar;
        inducedDipole[index] = value;
-        printf("%d %d %g %g\n", order, index, inducedDipoleField[fieldIndex]*fieldScale, value);
        extrapolatedDipole[order*3*NUM_ATOMS+index] = value;
        value = inducedDipoleFieldPolar[fieldIndex]*fieldScale*polar;
        inducedDipolePolar[index] = value;
@@ -665,11 +726,12 @@ extern "C" __global__ void iterateExtrapolatedDipoles(int order, real* __restric
 #endif
    }
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < 6*NUM_ATOMS; index += blockDim.x*gridDim.x) {
-        extrapolatedDipoleFieldGradient[order*6*NUM_ATOMS+index] = inducedDipoleFieldGradient[index];
+        int index2 = (order-1)*6*NUM_ATOMS+index;
-        extrapolatedDipoleFieldGradientPolar[order*6*NUM_ATOMS+index] = inducedDipoleFieldGradientPolar[index];
+        extrapolatedDipoleFieldGradient[index2] = fieldScale*inducedDipoleFieldGradient[index];
+        extrapolatedDipoleFieldGradientPolar[index2] = fieldScale*inducedDipoleFieldGradientPolar[index];
 #ifdef USE_GK
-        extrapolatedDipoleFieldGradientGk[order*6*NUM_ATOMS+index] = inducedDipoleFieldGradientGk[index];
+        extrapolatedDipoleFieldGradientGk[index2] = fieldScale*inducedDipoleFieldGradientGk[index];
-        extrapolatedDipoleFieldGradientGkPolar[order*6*NUM_ATOMS+index] = inducedDipoleFieldGradientGkPolar[index];
+        extrapolatedDipoleFieldGradientGkPolar[index2] = fieldScale*inducedDipoleFieldGradientGkPolar[index];
 #endif
    }
 }
@@ -699,3 +761,37 @@ extern "C" __global__ void computeExtrapolatedDipoles(real* __restrict__ induced
 #endif
    }
 }
+/**
+ * Add to the per-atom force buffers the term that couples the stored extrapolated dipoles
+ * with the stored extrapolated field gradients.
+ *
+ * For every atom and every pair (l, m) with l+m < MAX_EXTRAPOLATION_ORDER-1, the order-l dipoles
+ * are contracted with the order-m gradients and weighted by 0.5*coeff[l+m+1]*ENERGY_SCALE_FACTOR.
+ * The "polar" gradients are paired with the plain dipoles and vice versa.
+ *
+ * The six gradient values of an atom are used as a symmetric 3x3 matrix: entries 0, 1, 2 are the
+ * diagonal and entries 3, 4, 5 are the (0,1), (0,2) and (1,2) off-diagonal elements.
+ *
+ * Each atom is handled by exactly one thread of the grid-stride loop, so the force buffers are
+ * updated with plain (non-atomic) additions.  Forces are stored as fixed point numbers scaled
+ * by 2^32.  The contributions of all (l, m) pairs are summed in registers and converted once,
+ * so each force component costs a single global read-modify-write per atom.
+ *
+ * NOTE(review): the USE_GK arguments are accepted but not read by this kernel.
+ */
+extern "C" __global__ void addExtrapolatedFieldGradientToForce(long long* __restrict__ forceBuffers, real* __restrict__ extrapolatedDipole,
+        real* __restrict__ extrapolatedDipolePolar, real* __restrict__ extrapolatedDipoleFieldGradient, real* __restrict__ extrapolatedDipoleFieldGradientPolar
+#ifdef USE_GK
+        , real* __restrict__ extrapolatedDipoleGk, real* __restrict__ extrapolatedDipoleGkPolar,
+        real* __restrict__ extrapolatedDipoleFieldGradientGk, real* __restrict__ extrapolatedDipoleFieldGradientGkPolar
+#endif
+        ) {
+    real coeff[] = {EXTRAPOLATION_COEFFICIENTS_SUM};
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
+        // Running force on this atom, summed over all (l, m) pairs.
+        real fx = 0, fy = 0, fz = 0;
+        for (int l = 0; l < MAX_EXTRAPOLATION_ORDER-1; l++) {
+            int index1 = 3*(l*NUM_ATOMS+atom);
+            real dipole[] = {extrapolatedDipole[index1], extrapolatedDipole[index1+1], extrapolatedDipole[index1+2]};
+            real dipolePolar[] = {extrapolatedDipolePolar[index1], extrapolatedDipolePolar[index1+1], extrapolatedDipolePolar[index1+2]};
+            for (int m = 0; m < MAX_EXTRAPOLATION_ORDER-1-l; m++) {
+                int index2 = 6*(m*NUM_ATOMS+atom);
+                real gradient[] = {extrapolatedDipoleFieldGradient[index2], extrapolatedDipoleFieldGradient[index2+1], extrapolatedDipoleFieldGradient[index2+2],
+                                   extrapolatedDipoleFieldGradient[index2+3], extrapolatedDipoleFieldGradient[index2+4], extrapolatedDipoleFieldGradient[index2+5]};
+                real gradientPolar[] = {extrapolatedDipoleFieldGradientPolar[index2], extrapolatedDipoleFieldGradientPolar[index2+1], extrapolatedDipoleFieldGradientPolar[index2+2],
+                                        extrapolatedDipoleFieldGradientPolar[index2+3], extrapolatedDipoleFieldGradientPolar[index2+4], extrapolatedDipoleFieldGradientPolar[index2+5]};
+                real scale = 0.5f*coeff[l+m+1]*ENERGY_SCALE_FACTOR;
+                fx += scale*(dipole[0]*gradientPolar[0] + dipole[1]*gradientPolar[3] + dipole[2]*gradientPolar[4]);
+                fy += scale*(dipole[0]*gradientPolar[3] + dipole[1]*gradientPolar[1] + dipole[2]*gradientPolar[5]);
+                fz += scale*(dipole[0]*gradientPolar[4] + dipole[1]*gradientPolar[5] + dipole[2]*gradientPolar[2]);
+                fx += scale*(dipolePolar[0]*gradient[0] + dipolePolar[1]*gradient[3] + dipolePolar[2]*gradient[4]);
+                fy += scale*(dipolePolar[0]*gradient[3] + dipolePolar[1]*gradient[1] + dipolePolar[2]*gradient[5]);
+                fz += scale*(dipolePolar[0]*gradient[4] + dipolePolar[1]*gradient[5] + dipolePolar[2]*gradient[2]);
+            }
+        }
+        // Convert to 2^32-scaled fixed point once and add to the x, y and z force planes.
+        forceBuffers[atom] += (long long) (fx*0x100000000);
+        forceBuffers[atom+PADDED_NUM_ATOMS] += (long long) (fy*0x100000000);
+        forceBuffers[atom+PADDED_NUM_ATOMS*2] += (long long) (fz*0x100000000);
+    }
+}
--- a/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaExtrapolatedPolarization.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaExtrapolatedPolarization.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMMAmoeba                             *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,  *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the CUDA implementation of the extrapolated polarization algorithms in AmoebaMultipoleForce.
+ */
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "OpenMMAmoeba.h"
+#include "openmm/System.h"
+#include "openmm/AmoebaMultipoleForce.h"
+#include "openmm/LangevinIntegrator.h"
+#include "openmm/Vec3.h"
+#include "openmm/Units.h"
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <stdlib.h>
+#include <stdio.h>
+#define ASSERT_EQUAL_TOL_MOD(expected, found, tol, testname) {double _scale_ = std::abs(expected) > 1.0 ? std::abs(expected) : 1.0; if (!(std::abs((expected)-(found))/_scale_ <= (tol))) {std::stringstream details; details << testname << " Expected "<<(expected)<<", found "<<(found); throwException(__FILE__, __LINE__, details.str());}};
+#define ASSERT_EQUAL_VEC_MOD(expected, found, tol,testname) {ASSERT_EQUAL_TOL_MOD((expected)[0], (found)[0], (tol),(testname)); ASSERT_EQUAL_TOL_MOD((expected)[1], (found)[1], (tol),(testname)); ASSERT_EQUAL_TOL_MOD((expected)[2], (found)[2], (tol),(testname));};
+using namespace OpenMM;
+using namespace std;
+extern "C" void registerAmoebaCudaKernelFactories();
+const double TOL = 1e-4;
+// Print a heading followed by one row per atom: the 1-based atom number and the three force
+// components, each multiplied by scale.
+static void printForceTable(const char* heading, const vector<Vec3>& forces, double scale) {
+    std::cout << heading << std::endl;
+    for (size_t i = 0; i < forces.size(); ++i) {
+        std::cout << i+1 << "\t" << std::setw(20) << std::setprecision(10) << forces[i][0]*scale <<
+                                    std::setw(20) << std::setprecision(10) << forces[i][1]*scale <<
+                                    std::setw(20) << std::setprecision(10) << forces[i][2]*scale << std::endl;
+    }
+}
+// Debugging aid: print the energy and forces twice, first unscaled (labelled "SI") and then
+// rescaled (labelled "AKMA") to allow comparison with TINKER.  The AKMA energy is multiplied by
+// KcalPerKJ and the AKMA forces by -KcalPerKJ/10, i.e. their sign is flipped as well.
+// Note that std::setprecision changes the precision of std::cout for later output too.
+static void printEnergyAndForces(double energy, vector<Vec3> &forces){
+    std::cout << "Energy (SI):" << std::setw(20) << std::setprecision(10) << energy << std::endl;
+    printForceTable("Forces (SI):", forces, 1.0);
+    std::cout << "Energy (AKMA):" << std::setw(20) << std::setprecision(10) << energy*OpenMM::KcalPerKJ << std::endl;
+    printForceTable("Forces (AKMA):", forces, -OpenMM::KcalPerKJ/10.0);
+}
+// Compare forces and energies component by component.
+// Every force component and the energy must satisfy ASSERT_EQUAL_TOL_MOD, which measures the
+// difference relative to max(|expected|, 1).  The first violation is reported through
+// throwException, with testName included in the message.  A mismatch between the number of
+// expected and computed forces is reported the same way instead of reading past the end of
+// expectedForces.
+static void compareForcesEnergy(std::string& testName, double expectedEnergy, double energy,
+                                const std::vector<Vec3>& expectedForces,
+                                const std::vector<Vec3>& forces, double tolerance) {
+    if (expectedForces.size() != forces.size()) {
+        std::stringstream details;
+        details << testName << " Expected " << expectedForces.size() << " forces, found " << forces.size();
+        throwException(__FILE__, __LINE__, details.str());
+    }
+    for (size_t ii = 0; ii < forces.size(); ii++) {
+        ASSERT_EQUAL_VEC_MOD(expectedForces[ii], forces[ii], tolerance, testName);
+    }
+    ASSERT_EQUAL_TOL_MOD(expectedEnergy, energy, tolerance, testName);
+}
+// Compare relative differences in force norms and energies.
+// For each particle the Euclidean norms of the expected and computed force are compared; a
+// particle fails only if the symmetric relative difference exceeds tolerance AND the absolute
+// difference exceeds 0.001.  The energy fails if its symmetric relative difference exceeds
+// tolerance.  Failures, including a mismatch in the number of forces, are reported through
+// throwException.
+static void compareForceNormsEnergy(std::string& testName, double expectedEnergy, double energy,
+                                    std::vector<Vec3>& expectedForces,
+                                    const std::vector<Vec3>& forces, double tolerance) {
+    if (expectedForces.size() != forces.size()) {
+        std::stringstream details;
+        details << testName << " Expected " << expectedForces.size() << " forces, found " << forces.size();
+        throwException(__FILE__, __LINE__, details.str());
+    }
+    for (size_t ii = 0; ii < forces.size(); ii++) {
+        const Vec3& expected = expectedForces[ii];
+        const Vec3& found = forces[ii];
+        double expectedNorm = sqrt(expected[0]*expected[0] + expected[1]*expected[1] + expected[2]*expected[2]);
+        double norm         = sqrt(found[0]*found[0] + found[1]*found[1] + found[2]*found[2]);
+        double absDiff      = fabs(norm - expectedNorm);
+        // The small constant keeps the ratio finite when both norms are zero.
+        double relDiff      = 2.0*absDiff/(fabs(norm) + fabs(expectedNorm) + 1.0e-08);
+        if (relDiff > tolerance && absDiff > 0.001) {
+            std::stringstream details;
+            details << testName << "Relative difference in norms " << relDiff << " larger than allowed tolerance at particle=" << ii;
+            details << ": norms=" << norm << " expected norm=" << expectedNorm;
+            throwException(__FILE__, __LINE__, details.str());
+        }
+    }
+    double energyAbsDiff = fabs(expectedEnergy - energy);
+    double energyRelDiff =  2.0*energyAbsDiff/(fabs(expectedEnergy) + fabs(energy) + 1.0e-08);
+    if (energyRelDiff > tolerance) {
+        std::stringstream details;
+        details << testName << "Relative difference in energies " << energyRelDiff << " larger than allowed tolerance.";
+        details << "Energies=" << energy << " expected energy=" << expectedEnergy;
+        throwException(__FILE__, __LINE__, details.str());
+    }
+}
+// Shift a list of atom-index offsets by atom, yielding absolute atom indices.
+static std::vector<int> offsetsToAtomIndices(const std::vector<int>& offsets, int atom) {
+    std::vector<int> indices;
+    indices.reserve(offsets.size());
+    for (size_t i = 0; i < offsets.size(); ++i)
+        indices.push_back(offsets[i]+atom);
+    return indices;
+}
+/**
+ * Build a six-atom water dimer (O, H1, H2, O, H1, H2).
+ *
+ * For every atom one multipole is added to amoebaMultipoleForce and one particle to system, and
+ * the covalent maps (polarization group, 1-2 and 1-3 neighbours) are set.  Finally the force is
+ * added to the system.
+ *
+ * @param system                 the System that receives the particles and the force
+ * @param amoebaMultipoleForce   the force to populate; it is passed to system.addForce()
+ * @param use_pol_groups         if false, no PolarizationCovalent11 map is set for any atom
+ * @return the atom positions: the tabulated coordinates multiplied by OpenMM::NmPerAngstrom
+ */
+vector<Vec3> setupWaterDimer(System& system,  AmoebaMultipoleForce* amoebaMultipoleForce, bool use_pol_groups) {
+    const int NATOMS = 6;
+    const char* atom_types[NATOMS] = {"O", "H1", "H2", "O", "H1", "H2"};
+    const double coords[NATOMS][3] = {
+        {  2.000000, 2.000000, 2.000000},
+        {  2.500000, 2.000000, 3.000000},
+        {  1.500000, 2.000000, 3.000000},
+        {  0.000000, 0.000000, 0.000000},
+        {  0.500000, 0.000000, 1.000000},
+        { -0.500000, 0.000000, 1.000000}
+    };
+    // All parameters below are stored per atom type, keyed by the names used in atom_types.
+
+    // Local axis definitions.
+    std::map < std::string, AmoebaMultipoleForce::MultipoleAxisTypes > axesmap;
+    axesmap["O"]  = AmoebaMultipoleForce::Bisector;
+    axesmap["H1"] = AmoebaMultipoleForce::ZThenX;
+    axesmap["H2"] = AmoebaMultipoleForce::ZThenX;
+
+    // Charges.
+    std::map < std::string, double > chargemap;
+    chargemap["O"]  = -0.51966;
+    chargemap["H1"] = 0.25983;
+    chargemap["H2"] = 0.25983;
+
+    // Axis-defining atoms, as offsets from the atom's own index: {z, x, y}.
+    // A y entry of 0 means "no y atom" and is passed on as -1.
+    int oanc[3] = {1, 2, 0};
+    int h1anc[3] = {-1, 1, 0};
+    int h2anc[3] = {-2, -1, 0};
+    std::map < std::string, std::vector<int> > anchormap;
+    anchormap["O"]  = std::vector<int>(&oanc[0], &oanc[3]);
+    anchormap["H1"] = std::vector<int>(&h1anc[0], &h1anc[3]);
+    anchormap["H2"] = std::vector<int>(&h2anc[0], &h2anc[3]);
+
+    // Dipoles (both hydrogens share one set of values).
+    double od[3] = {0.0, 0.0, 0.00755612136146};
+    double hd[3] = {-0.00204209484795, 0.0, -0.00307875299958};
+    std::map < std::string, std::vector<double> > dipolemap;
+    dipolemap["O"]  = std::vector<double>(&od[0], &od[3]);
+    dipolemap["H1"] = std::vector<double>(&hd[0], &hd[3]);
+    dipolemap["H2"] = dipolemap["H1"];
+
+    // Quadrupoles, as row-major 3x3 matrices (both hydrogens share one set of values).
+    double oq[9] = {0.000354030721139,  0.0, 0.0,
+                    0.0, -0.000390257077096, 0.0,
+                    0.0, 0.0,  3.62263559571e-05};
+    double hq[9] = {-3.42848248983e-05, 0.0, -1.89485963908e-06,
+                     0.0,          -0.000100240875193,      0.0,
+                    -1.89485963908e-06, 0.0,  0.000134525700091};
+    std::map < std::string, std::vector<double> > quadrupolemap;
+    quadrupolemap["O"]  = std::vector<double>(&oq[0], &oq[9]);
+    quadrupolemap["H1"] = std::vector<double>(&hq[0], &hq[9]);
+    quadrupolemap["H2"] = quadrupolemap["H1"];
+
+    // Polarizabilities.  An earlier set of values (0.3069876538 / 0.2813500172) used to be
+    // assigned here and immediately overwritten; only the effective values are kept.
+    std::map < std::string, double > polarmap;
+    polarmap["O"]  = 0.000837;
+    polarmap["H1"] = 0.000496;
+    polarmap["H2"] = 0.000496;
+
+    // Thole parameters.
+    std::map < std::string, double > tholemap;
+    tholemap["O"]  = 0.3900;
+    tholemap["H1"] = 0.3900;
+    tholemap["H2"] = 0.3900;
+
+    // Masses.
+    std::map < std::string, double > massmap;
+    massmap["O"]  = 15.999;
+    massmap["H1"] = 1.0080000;
+    massmap["H2"] = 1.0080000;
+
+    // Polarization groups, as offsets from the atom's own index (left empty when disabled).
+    int opg[3] = {0,1,2};
+    int h1pg[3] = {-1,0,1};
+    int h2pg[3] = {-2,-1,0};
+    std::map < std::string, std::vector<int> > polgrpmap;
+    if (use_pol_groups) {
+        polgrpmap["O"]  = std::vector<int>(&opg[0], &opg[3]);
+        polgrpmap["H1"] = std::vector<int>(&h1pg[0], &h1pg[3]);
+        polgrpmap["H2"] = std::vector<int>(&h2pg[0], &h2pg[3]);
+    }
+    else {
+        polgrpmap["O"]  = std::vector<int>();
+        polgrpmap["H1"] = std::vector<int>();
+        polgrpmap["H2"] = std::vector<int>();
+    }
+
+    // 1-2 bonded neighbours, as offsets from the atom's own index.
+    int cov12o[2] = {1,2};
+    int cov12h1[1] = {-1};
+    int cov12h2[1] = {-2};
+    std::map < std::string, std::vector<int> > cov12map;
+    cov12map["O"]  = std::vector<int>(&cov12o[0], &cov12o[2]);
+    cov12map["H1"] = std::vector<int>(&cov12h1[0], &cov12h1[1]);
+    cov12map["H2"] = std::vector<int>(&cov12h2[0], &cov12h2[1]);
+
+    // 1-3 neighbours, as offsets from the atom's own index (none for oxygen).
+    int cov13h1[1] = {1};
+    int cov13h2[1] = {-1};
+    std::map < std::string, std::vector<int> > cov13map;
+    cov13map["O"]  = std::vector<int>();
+    cov13map["H1"] = std::vector<int>(&cov13h1[0], &cov13h1[1]);
+    cov13map["H2"] = std::vector<int>(&cov13h2[0], &cov13h2[1]);
+
+    std::vector<Vec3> positions(NATOMS);
+    for(int atom = 0; atom < NATOMS; ++atom){
+        const std::string element = atom_types[atom];
+        const std::vector<int>& anchors = anchormap[element];
+        // The last two addMultipole arguments are the damping factor and the polarity; the
+        // damping factor is derived from the polarity as polarity^(1/6).
+        double polarity = polarmap[element];
+        double dampingFactor = pow(polarity, 1.0/6.0);
+        int atomz = atom + anchors[0];
+        int atomx = atom + anchors[1];
+        int atomy = anchors[2]==0 ? -1 : atom + anchors[2];
+        amoebaMultipoleForce->addMultipole(chargemap[element], dipolemap[element], quadrupolemap[element],
+                                           axesmap[element], atomz, atomx, atomy, tholemap[element], dampingFactor, polarity);
+        system.addParticle(massmap[element]);
+        positions[atom] = Vec3(coords[atom][0], coords[atom][1], coords[atom][2])*OpenMM::NmPerAngstrom;
+        // Polarization groups
+        std::vector<int> polGroup = offsetsToAtomIndices(polgrpmap[element], atom);
+        if(!polGroup.empty())
+           amoebaMultipoleForce->setCovalentMap(atom, AmoebaMultipoleForce::PolarizationCovalent11, polGroup);
+        // 1-2 covalent groups
+        std::vector<int> bonded12 = offsetsToAtomIndices(cov12map[element], atom);
+        if(!bonded12.empty())
+           amoebaMultipoleForce->setCovalentMap(atom, AmoebaMultipoleForce::Covalent12, bonded12);
+        // 1-3 covalent groups
+        std::vector<int> bonded13 = offsetsToAtomIndices(cov13map[element], atom);
+        if(!bonded13.empty())
+           amoebaMultipoleForce->setCovalentMap(atom, AmoebaMultipoleForce::Covalent13, bonded13);
+    }
+    system.addForce(amoebaMultipoleForce);
+    return positions;
+}
+// Check the analytic forces against a central finite difference of the potential energy.
+// The positions are displaced by -/+ step along the analytic forces, where step is chosen so
+// that the two displaced configurations are stepSize apart along the normalized force
+// direction.  The energy difference divided by stepSize must then match the norm of the force
+// vector (ASSERT_EQUAL_TOL, tolerance 1e-4).
+// On return the context holds the second displaced configuration, not the original positions.
+static void check_finite_differences(vector<Vec3> analytic_forces, Context &context, vector<Vec3> positions)
+{
+    // One force per position is required; anything else would index out of range below.
+    if (analytic_forces.size() != positions.size()) {
+        std::stringstream details;
+        details << "check_finite_differences: " << analytic_forces.size() << " forces but " << positions.size() << " positions";
+        throwException(__FILE__, __LINE__, details.str());
+    }
+    // Take a small step in the direction of the energy gradient and see whether the potential energy changes by the expected amount.
+    double norm = 0.0;
+    for (size_t i = 0; i < analytic_forces.size(); ++i)
+        norm += analytic_forces[i].dot(analytic_forces[i]);
+    norm = std::sqrt(norm);
+    const double stepSize = 1e-3;
+    double step = 0.5*stepSize/norm;
+    vector<Vec3> positions2(positions.size()), positions3(positions.size());
+    for (size_t i = 0; i < positions.size(); ++i) {
+        Vec3 p = positions[i];
+        Vec3 f = analytic_forces[i];
+        positions2[i] = Vec3(p[0]-f[0]*step, p[1]-f[1]*step, p[2]-f[2]*step);
+        positions3[i] = Vec3(p[0]+f[0]*step, p[1]+f[1]*step, p[2]+f[2]*step);
+    }
+    context.setPositions(positions2);
+    State state2 = context.getState(State::Energy);
+    context.setPositions(positions3);
+    State state3 = context.getState(State::Energy);
+    ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state3.getPotentialEnergy())/stepSize, 1e-4);
+}
+// Configure the multipole force for one water dimer test: extrapolated polarization with the
+// coefficient set shared by all tests in this file, and either PME in a triclinic box
+// (usePME == true) or no cutoff.
+static void configureWaterDimerForce(System& system, AmoebaMultipoleForce* amoebaMultipoleForce, bool usePME) {
+    if (usePME) {
+        system.setDefaultPeriodicBoxVectors(Vec3(2.0, 0.0, 0.0),
+                                            Vec3(0.2, 2.0, 0.0),
+                                            Vec3(0.1, 0.5, 2.0));
+        amoebaMultipoleForce->setNonbondedMethod(AmoebaMultipoleForce::PME);
+    }
+    else
+        amoebaMultipoleForce->setNonbondedMethod(AmoebaMultipoleForce::NoCutoff);
+    amoebaMultipoleForce->setPolarizationType(AmoebaMultipoleForce::Extrapolated);
+    std::vector<double> coefs;
+    coefs.push_back(0.0);  // The mu_0 coefficient
+    coefs.push_back(-0.3); // The mu_1 coefficient
+    coefs.push_back(0.0);  // The mu_2 coefficient
+    coefs.push_back(1.3);  // The mu_3 coefficient
+    amoebaMultipoleForce->setExtrapolationCoefficients(coefs);
+    if (usePME) {
+        amoebaMultipoleForce->setCutoffDistance(9.0*OpenMM::NmPerAngstrom);
+        amoebaMultipoleForce->setAEwald(4);
+        amoebaMultipoleForce->setEwaldErrorTolerance(1.0e-06);
+        std::vector<int> pmeGridDimension(3);
+        pmeGridDimension[0] = pmeGridDimension[1] = pmeGridDimension[2] = 64;
+        amoebaMultipoleForce->setPmeGridDimensions(pmeGridDimension);
+    }
+}
+// Build the water dimer, evaluate energy and forces on the CUDA platform, compare them with the
+// expected values (tolerance 1.0e-04) and cross-check the forces by finite differences.
+// expectedForceValues holds one {x, y, z} row per atom of the six-atom dimer.
+static void runWaterDimerTest(const std::string& name, bool usePolGroups, bool usePME,
+                              double expectedEnergy, const double expectedForceValues[][3]) {
+    const size_t numExpected = 6;
+    std::string testName = name;
+    System system;
+    // The force is handed to the system by setupWaterDimer (system.addForce).
+    AmoebaMultipoleForce* amoebaMultipoleForce = new AmoebaMultipoleForce();
+    vector<Vec3> coords = setupWaterDimer(system, amoebaMultipoleForce, usePolGroups);
+    configureWaterDimerForce(system, amoebaMultipoleForce, usePME);
+    LangevinIntegrator integrator(0.0, 0.1, 0.01);
+    Context context(system, integrator, Platform::getPlatformByName("CUDA"));
+    context.setPositions(coords);
+    OpenMM::State state = context.getState(State::Forces | State::Energy);
+    std::vector<Vec3> forces = state.getForces();
+    double energy = state.getPotentialEnergy();
+//    printEnergyAndForces(energy, forces);
+    std::vector<Vec3> expectedForces(forces.size());
+    for (size_t i = 0; i < numExpected && i < expectedForces.size(); ++i)
+        expectedForces[i] = Vec3(expectedForceValues[i][0], expectedForceValues[i][1], expectedForceValues[i][2]);
+    double tolerance          = 1.0e-04;
+    compareForcesEnergy(testName, expectedEnergy, energy, expectedForces, forces, tolerance);
+    check_finite_differences(forces, context, coords);
+}
+// PME in a triclinic box, with polarization groups.
+static void testWaterDimerTriclinicPME() {
+    const double expectedForces[6][3] = {
+        {  -131.1099603,   -187.2725558,    36.94657685},
+        {    38.6397841,    2.410997985,    8.008437937},
+        {   38.69034185,    117.5018257,    32.43097836},
+        {  -117.3212339,   -102.3366145,   -30.50621066},
+        {   124.8343077,    169.7729804,   -24.10742414},
+        {   46.26244074, -0.07194110719,   -22.77727325}
+    };
+    runWaterDimerTest("testWaterDimerTriclinicPME", true, true, -1.945797427, expectedForces);
+}
+// PME in a triclinic box, without polarization groups.
+static void testWaterDimerTriclinicPMENoPolGroups() {
+    const double expectedForces[6][3] = {
+        {  -69.85154559,  -104.2092334,   3.586495334},
+        {   19.50350452,   -14.5844519,   9.400418341},
+        {   16.75641493,   75.15006506,   19.14553199},
+        {  -67.24268213,  -47.39994175,  -18.81277222},
+        {   75.15808251,   110.6109313,   4.355432435},
+        {   25.67255306,  -19.56378113,  -17.68217953}
+    };
+    runWaterDimerTest("testWaterDimerTriclinicPMENoPolGroups", false, true, -1.840068409, expectedForces);
+}
+// No cutoff, with polarization groups.
+static void testWaterDimerNoCutoff() {
+    const double expectedForces[6][3] = {
+        { -130.7294487,   -186.3287444,    41.40628056},
+        {  38.90143386,    2.140957908,    5.564712102},
+        {  38.32881448,    117.0462626,    29.90093041},
+        { -117.1147396,   -101.6981494,   -25.55733439},
+        {  124.7421318,    169.1571359,   -26.38724373},
+        {  45.87180816,  -0.3174626947,   -24.92734495}
+    };
+    runWaterDimerTest("testWaterDimerNoCutoff", true, false, -1.399194432, expectedForces);
+}
+// No cutoff, without polarization groups.
+static void testWaterDimerNoCutoffNoPolGroups() {
+    const double expectedForces[6][3] = {
+        {   -69.623843,  -103.7006124,   6.162774255},
+        {  19.54326912,  -14.69441322,   8.014369439},
+        {  16.65441143,   74.88100242,   17.70364405},
+        { -67.10049929,  -47.08900953,  -16.01092086},
+        {  74.98800293,   110.2649458,   3.020145768},
+        {  25.53865881,  -19.66191302,  -18.89001266}
+    };
+    runWaterDimerTest("testWaterDimerNoCutoffNoPolGroups", false, false, -1.56926564, expectedForces);
+}
+// Test driver: registers the AMOEBA CUDA kernel factories and runs the four water dimer
+// energy/force tests.  Prints "Done" and returns 0 when all of them pass; if any throws a
+// std::exception, prints its message plus a failure banner and returns 1.
+int main(int numberOfArguments, char* argv[]) {
+    int exitCode = 0;
+    try {
+        registerAmoebaCudaKernelFactories();
+        // Water dimer energy / force tests under various conditions.
+        testWaterDimerTriclinicPME();             // PME, triclinic
+        testWaterDimerTriclinicPMENoPolGroups();  // PME, triclinic, no polarization groups
+        testWaterDimerNoCutoff();                 // No cutoff
+        testWaterDimerNoCutoffNoPolGroups();      // No cutoff, no polarization groups
+    }
+    catch(const std::exception& error) {
+        std::cout << "exception: " << error.what() << std::endl;
+        std::cout << "FAIL - ERROR.  Test failed." << std::endl;
+        exitCode = 1;
+    }
+    if (exitCode == 0)
+        std::cout << "Done" << std::endl;
+    return exitCode;
+}
--- a/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.cpp
+++ b/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.cpp
@@ -25,6 +25,7 @@
 #include "AmoebaReferenceMultipoleForce.h"
 #include "jama_svd.h"
 #include <algorithm>
 // In case we're using some primitive version of Visual Studio this will
 // make sure that erf() and erfc() are defined.
 #include "openmm/internal/MSVC_erfc.h"
@@ -1785,10 +1786,6 @@ RealOpenMM AmoebaReferenceMultipoleForce::calculateElectrostatic(const vector<Mu
                                                                 vector<RealVec>& torques,
                                                                 vector<RealVec>& forces)
 {
-    const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19};
-    const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16};
-    const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18};
    RealOpenMM energy = 0.0;
    vector<RealOpenMM> scaleFactors(LAST_SCALE_TYPE_INDEX);
    for (unsigned int kk = 0; kk < scaleFactors.size(); kk++) {
@@ -1796,6 +1793,7 @@ RealOpenMM AmoebaReferenceMultipoleForce::calculateElectrostatic(const vector<Mu
    }   
    // main loop over particle pairs
    for (unsigned int ii = 0; ii < particleData.size(); ii++) {
        for (unsigned int jj = ii+1; jj < particleData.size(); jj++) {
@@ -6072,7 +6070,6 @@ void AmoebaReferencePmeMultipoleForce::recordInducedDipoleField(vector<RealVec>&
        fieldPolar[i][1] -= _phip[10*i+1]*fracToCart[1][0] + _phip[10*i+2]*fracToCart[1][1] + _phip[10*i+3]*fracToCart[1][2];
        fieldPolar[i][2] -= _phip[10*i+1]*fracToCart[2][0] + _phip[10*i+2]*fracToCart[2][1] + _phip[10*i+3]*fracToCart[2][2];
    }
 }
 void AmoebaReferencePmeMultipoleForce::calculateReciprocalSpaceInducedDipoleField(vector<UpdateInducedDipoleFieldStruct>& updateInducedDipoleFields)
@@ -6238,7 +6235,6 @@ void AmoebaReferencePmeMultipoleForce::calculateDirectInducedDipolePairIxns(cons
    alsq2n                *= alsq2;
    RealOpenMM bn3         = (5.0*bn2+alsq2n*exp2a)/r2;
    // compute the error function scaled and unscaled terms
    RealOpenMM scale3      = 1.0;

--- a/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.h
+++ b/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.h
@@ -673,8 +673,6 @@ protected:
    std::vector<std::vector<RealOpenMM> > _ptDipoleFieldGradientP;
    std::vector<std::vector<RealOpenMM> > _ptDipoleFieldGradientD;
    int _mutualInducedDipoleConverged;
    int _mutualInducedDipoleIterations;
    int _maximumMutualInducedDipoleIterations;

--- a/plugins/amoeba/platforms/reference/tests/TestReferenceAmoebaPTPolarization.cpp
+++ b/plugins/amoeba/platforms/reference/tests/TestReferenceAmoebaPTPolarization.cpp
@@ -30,7 +30,7 @@
 * -------------------------------------------------------------------------- */
 /**
- * This tests the Reference implementation of the PT polarization algorithms in ReferenceAmoebaMultipoleForce.
+ * This tests the Reference implementation of the extrapolated polarization algorithms in AmoebaMultipoleForce.
 */
 #include "openmm/internal/AssertionUtilities.h"
@@ -307,9 +307,9 @@ static void check_finite_differences(vector<Vec3> analytic_forces, Context &cont
 }
-static void testWaterDimerExPTPolarizationTriclinicPME() {
+static void testWaterDimerTriclinicPME() {
-    std::string testName      = "testWaterDimerExPTPolarizationTriclinicPME";
+    std::string testName      = "testWaterDimerTriclinicPME";
    System system;
    AmoebaMultipoleForce* amoebaMultipoleForce = new AmoebaMultipoleForce();;
@@ -356,9 +356,9 @@ static void testWaterDimerExPTPolarizationTriclinicPME() {
 }
-static void testWaterDimerExPTPolarizationTriclinicPMENoPolGroups() {
+static void testWaterDimerTriclinicPMENoPolGroups() {
-    std::string testName      = "testWaterDimerExPTPolarizationTriclinicPMENoPolGroups";
+    std::string testName      = "testWaterDimerTriclinicPMENoPolGroups";
    System system;
    AmoebaMultipoleForce* amoebaMultipoleForce = new AmoebaMultipoleForce();;
@@ -406,9 +406,9 @@ static void testWaterDimerExPTPolarizationTriclinicPMENoPolGroups() {
 }
-static void testWaterDimerExPTPolarizationNoCutoff() {
+static void testWaterDimerNoCutoff() {
-    std::string testName      = "testWaterDimerExPTPolarizationNoCutoff";
+    std::string testName      = "testWaterDimerNoCutoff";
    System system;
    AmoebaMultipoleForce* amoebaMultipoleForce = new AmoebaMultipoleForce();;
@@ -447,9 +447,9 @@ static void testWaterDimerExPTPolarizationNoCutoff() {
 }
-static void testWaterDimerExPTPolarizationNoCutoffNoPolGroups() {
+static void testWaterDimerNoCutoffNoPolGroups() {
-    std::string testName      = "testWaterDimerExPTPolarizationNoCutoffNoPolGroups";
+    std::string testName      = "testWaterDimerNoCutoffNoPolGroups";
    System system;
    AmoebaMultipoleForce* amoebaMultipoleForce = new AmoebaMultipoleForce();;
@@ -499,16 +499,16 @@ int main(int numberOfArguments, char* argv[]) {
         */
        // PME, triclinic
-        testWaterDimerExPTPolarizationTriclinicPME();
+        testWaterDimerTriclinicPME();
        // PME, triclinic, no polarization groups
-        testWaterDimerExPTPolarizationTriclinicPMENoPolGroups();
+        testWaterDimerTriclinicPMENoPolGroups();
        // No cutoff
-        testWaterDimerExPTPolarizationNoCutoff();
+        testWaterDimerNoCutoff();
        // No cutoff, no polarization groups
-        testWaterDimerExPTPolarizationNoCutoffNoPolGroups();
+        testWaterDimerNoCutoffNoPolGroups();
    }
    catch(const std::exception& e) {