Merge remote-tracking branch 'upstream/master' into forcefield

39120086 · John Chodera (MSKCC) · 87b95319 · 2acba7ad · 39120086 · 39120086
Commit 39120086 authored Jan 18, 2016 by John Chodera (MSKCC)
20 changed files
--- a/README.md
+++ b/README.md
@@ -18,8 +18,6 @@ Need Help? Check out the [documentation](https://simtk.org/docman/?group_id=161)
 Badges
 ------
 * Travis CI `linux` integration tests: [![Build Status](https://travis-ci.org/pandegroup/openmm.png?branch=master)](https://travis-ci.org/pandegroup/openmm)
-* Jenkins `openmm-dev` conda `osx` build: [![Jenkins `openmm-dev` conda `osx` build](https://jenkins.choderalab.org/job/conda-openmm-dev-osx-2/badge/icon)](https://jenkins.choderalab.org/job/conda-openmm-dev-osx-2/) [[console log]](https://jenkins.choderalab.org/job/conda-openmm-dev-osx-2/lastBuild/consoleFull)
+* Anaconda Cloud `openmm` conda release: ![Binstar `openmm` conda release](https://binstar.org/omnia/openmm/badges/version.svg)
-* Jenkins `openmm-dev` conda `linux` build: [![Jenkins `openmm-dev` conda `linux` build](https://jenkins.choderalab.org/job/conda-openmm-dev-linux-vagrant-2/badge/icon)](https://jenkins.choderalab.org/job/conda-openmm-dev-linux-vagrant-2/) [[console log]](https://jenkins.choderalab.org/job/conda-openmm-dev-linux-vagrant-2/lastBuild/consoleFull)
+* Anaconda Cloud `openmm-dev` conda package: ![Binstar `openmm-dev` conda package](https://binstar.org/omnia/openmm-dev/badges/version.svg)
-* Binstar `openmm` conda release: ![Binstar `openmm` conda release](https://binstar.org/omnia/openmm/badges/version.svg)
-* Binstar `openmm-dev` conda package: ![Binstar `openmm-dev` conda package](https://binstar.org/omnia/openmm-dev/badges/version.svg)
--- a/docs-source/usersguide/application.rst
+++ b/docs-source/usersguide/application.rst
@@ -922,29 +922,33 @@ If :code:`vdwCutoff` is not specified, then the value of
 Specifying the Polarization Method
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-OpenMM allows the setting of several other parameters particular to the AMOEBA
+When using the AMOEBA force field, OpenMM allows the induced dipoles to be
-force field.  The :code:`mutualInducedTargetEpsilon` option allows you to
+calculated in any of three different ways.  The slowest but potentially most
-specify the accuracy to which the induced dipoles are calculated at each time
+accurate method is to iterate the calculation until the dipoles converge to a
-step; the default value is 0.01.  The :code:`polarization` setting
+specified tolerance.  To select this, specify :code:`polarization='mutual'`.
-determines whether the calculation of the induced dipoles is continued until the
+Use the :code:`mutualInducedTargetEpsilon` option to select the tolerance; for
-dipoles are self-consistent to within the tolerance specified by
+most situations, a value of 0.00001 works well.  Alternatively you can specify
-:code:`mutualInducedTargetEpsilon` or whether a quick estimate of the induced
+:code:`polarization='extrapolated'`.  This uses an analytic approximation
-dipoles is used instead.  The first option corresponds to the
+:cite:`Simmonett2015` to estimate what the fully converged dipoles will be without
-:code:`polarization='mutual'` setting and is the default; the quick estimate
+actually continuing the calculation to convergence.  In many cases this can be
-option is given by :code:`polarization='direct'` and in this case,
+significantly faster with only a small loss in accuracy.  Finally, you can
-:code:`mutualInducedTargetEpsilon` is ignored, if provided.  Simulations using
+specify :code:`polarization='direct'` to use the direct polarization
-:code:`polarization='direct'` will be significantly faster than those with
+approximation, in which induced dipoles depend only on the fixed multipoles, not
-:code:`polarization='mutual'`\ , but less accurate.  Examples using the two
+on other induced dipoles.  This is even faster, but it produces very different
-options are given below:
+forces from mutual polarization, so it should only be used with force fields
-::
+that have been specifically parameterized for use with this approximation.
-    system = forcefield.createSystem(nonbondedMethod=PME,
+Here are examples of using each method:
-        nonbondedCutoff=1*nanometer,ewaldErrorTolerance=0.00001,
+::
-        vdwCutoff=1.2*nanometer, mutualInducedTargetEpsilon=0.01)
+    system = forcefield.createSystem(nonbondedMethod=PME, nonbondedCutoff=1*nanometer,
-    system = forcefield.createSystem(nonbondedMethod=PME,
+        vdwCutoff=1.2*nanometer, polarization='mutual', mutualInducedTargetEpsilon=0.00001)
-        nonbondedCutoff=1*nanometer,ewaldErrorTolerance=0.00001,
-        vdwCutoff=1.2*nanometer, polarization ='direct')
+    system = forcefield.createSystem(nonbondedMethod=PME, nonbondedCutoff=1*nanometer,
+        vdwCutoff=1.2*nanometer, polarization='extrapolated')
+    system = forcefield.createSystem(nonbondedMethod=PME, nonbondedCutoff=1*nanometer,
+        vdwCutoff=1.2*nanometer, polarization='direct')
 Implicit Solvent and Solute Dielectrics

--- a/docs-source/usersguide/references.bib
+++ b/docs-source/usersguide/references.bib
@@ -389,6 +389,17 @@
   type = {Journal Article}
 }
+@article{Simmonett2015
+   author = {Simmonett, Andrew C. and Pickard, Frank C. and Shao, Yihan and Cheatham, Thomas E. and Brooks, Bernard R.},
+   title = {Efficient treatment of induced dipoles},
+   journal = {Journal of Chemical Physics},
+   year = {2015},
+   volume = {143},
+   number = {7},
+   pages = {074115},
+   type = {Journal Article}
+}
 @article{Sindhikara2009,
  author =   {Sindhikara, Daniel J. and Kim, Seonah and Voter,
                  Arthur F. and Roitberg, Adrian E.},

--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -36,22 +36,18 @@ def runOneTest(testName, options):
    if amoeba:
        constraints = None
        epsilon = float(options.epsilon)
-        if epsilon == 0:
-            polarization = 'direct'
-        else:
-            polarization = 'mutual'
        if explicit:
            ff = app.ForceField('amoeba2009.xml')
            pdb = app.PDBFile('5dfr_solv-cube_equil.pdb')
            cutoff = 0.7*unit.nanometers
            vdwCutoff = 0.9*unit.nanometers
-            system = ff.createSystem(pdb.topology, nonbondedMethod=app.PME, nonbondedCutoff=cutoff, vdwCutoff=vdwCutoff, constraints=constraints, ewaldErrorTolerance=0.00075, mutualInducedTargetEpsilon=epsilon, polarization=polarization)
+            system = ff.createSystem(pdb.topology, nonbondedMethod=app.PME, nonbondedCutoff=cutoff, vdwCutoff=vdwCutoff, constraints=constraints, ewaldErrorTolerance=0.00075, mutualInducedTargetEpsilon=epsilon, polarization=options.polarization)
        else:
            ff = app.ForceField('amoeba2009.xml', 'amoeba2009_gk.xml')
            pdb = app.PDBFile('5dfr_minimized.pdb')
            cutoff = 2.0*unit.nanometers
            vdwCutoff = 1.2*unit.nanometers
-            system = ff.createSystem(pdb.topology, nonbondedMethod=app.NoCutoff, constraints=constraints, mutualInducedTargetEpsilon=epsilon, polarization=polarization)
+            system = ff.createSystem(pdb.topology, nonbondedMethod=app.NoCutoff, constraints=constraints, mutualInducedTargetEpsilon=epsilon, polarization=options.polarization)
        for f in system.getForces():
            if isinstance(f, mm.AmoebaMultipoleForce) or isinstance(f, mm.AmoebaVdwForce) or isinstance(f, mm.AmoebaGeneralizedKirkwoodForce) or isinstance(f, mm.AmoebaWcaDispersionForce):
                f.setForceGroup(1)
@@ -127,7 +123,8 @@ parser.add_option('--platform', dest='platform', choices=platformNames, help='na
 parser.add_option('--test', dest='test', choices=('gbsa', 'rf', 'pme', 'amoebagk', 'amoebapme'), help='the test to perform: gbsa, rf, pme, amoebagk, or amoebapme [default: all]')
 parser.add_option('--pme-cutoff', default='0.9', dest='cutoff', type='float', help='direct space cutoff for PME in nm [default: 0.9]')
 parser.add_option('--seconds', default='60', dest='seconds', type='float', help='target simulation length in seconds [default: 60]')
-parser.add_option('--mutual-epsilon', default='1e-4', dest='epsilon', type='float', help='mutual induced epsilon for AMOEBA [default: 1e-4]')
+parser.add_option('--polarization', default='mutual', dest='polarization', choices=('direct', 'extrapolated', 'mutual'), help='the polarization method for AMOEBA: direct, extrapolated, or mutual [default: mutual]')
+parser.add_option('--mutual-epsilon', default='1e-5', dest='epsilon', type='float', help='mutual induced epsilon for AMOEBA [default: 1e-5]')
 parser.add_option('--heavy-hydrogens', action='store_true', default=False, dest='heavy', help='repartition mass to allow a larger time step')
 parser.add_option('--device', default=None, dest='device', help='device index for CUDA or OpenCL')
 parser.add_option('--precision', default='single', dest='precision', choices=('single', 'mixed', 'double'), help='precision mode for CUDA or OpenCL: single, mixed, or double [default: single]')

--- a/openmmapi/src/ContextImpl.cpp
+++ b/openmmapi/src/ContextImpl.cpp
@@ -449,4 +449,5 @@ void ContextImpl::loadCheckpoint(istream& stream) {
        parameters[name] = value;
    }
    updateStateDataKernel.getAs<UpdateStateDataKernel>().loadCheckpoint(*this, stream);
+    hasSetPositions = true;
 }
--- a/platforms/cpu/tests/TestCpuCheckpoints.cpp
+++ b/platforms/cpu/tests/TestCpuCheckpoints.cpp
@@ -92,6 +92,15 @@ void testCheckpoint() {
    integrator.step(10);
    State s4 = context.getState(State::Positions | State::Velocities | State::Parameters);
    compareStates(s2, s4);
+    // See if a checkpoint created from one Context can be loaded into a different one.
+    VerletIntegrator integrator2(0.001);
+    Context context2(system, integrator2, platform);
+    stream1.seekg(0, stream1.beg);
+    context2.loadCheckpoint(stream1);
+    State s5 = context2.getState(State::Positions | State::Velocities | State::Parameters | State::Energy);
+    compareStates(s1, s5);
 }
 void runPlatformTests() {

--- a/platforms/cuda/tests/TestCudaCheckpoints.cpp
+++ b/platforms/cuda/tests/TestCudaCheckpoints.cpp
@@ -120,6 +120,15 @@ void testCheckpoint() {
    integrator2.step(10);
    State s8 = context2.getState(State::Positions | State::Velocities | State::Parameters);
    compareStates(s6, s8);
+    // See if a checkpoint created from one Context can be loaded into a different one.
+    VerletIntegrator integrator3(0.001);
+    Context context3(system, integrator3, platform);
+    stream1.seekg(0, stream1.beg);
+    context3.loadCheckpoint(stream1);
+    State s9 = context3.getState(State::Positions | State::Velocities | State::Parameters | State::Energy);
+    compareStates(s1, s9);
 }
 void runPlatformTests() {

--- a/platforms/opencl/tests/TestOpenCLCheckpoints.cpp
+++ b/platforms/opencl/tests/TestOpenCLCheckpoints.cpp
@@ -120,6 +120,15 @@ void testCheckpoint() {
    integrator2.step(10);
    State s8 = context2.getState(State::Positions | State::Velocities | State::Parameters);
    compareStates(s6, s8);
+    // See if a checkpoint created from one Context can be loaded into a different one.
+    VerletIntegrator integrator3(0.001);
+    Context context3(system, integrator3, platform);
+    stream1.seekg(0, stream1.beg);
+    context3.loadCheckpoint(stream1);
+    State s9 = context3.getState(State::Positions | State::Velocities | State::Parameters | State::Energy);
+    compareStates(s1, s9);
 }
 void runPlatformTests() {

--- a/platforms/reference/tests/TestReferenceCheckpoints.cpp
+++ b/platforms/reference/tests/TestReferenceCheckpoints.cpp
@@ -92,6 +92,15 @@ void testCheckpoint() {
    integrator.step(10);
    State s4 = context.getState(State::Positions | State::Velocities | State::Parameters);
    compareStates(s2, s4);
+    // See if a checkpoint created from one Context can be loaded into a different one.
+    VerletIntegrator integrator2(0.001);
+    Context context2(system, integrator2, platform);
+    stream1.seekg(0, stream1.beg);
+    context2.loadCheckpoint(stream1);
+    State s5 = context2.getState(State::Positions | State::Velocities | State::Parameters | State::Energy);
+    compareStates(s1, s5);
 }
 void runPlatformTests() {

--- a/plugins/amoeba/openmmapi/include/openmm/AmoebaMultipoleForce.h
+++ b/plugins/amoeba/openmmapi/include/openmm/AmoebaMultipoleForce.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
 * Authors: Mark Friedrichs, Peter Eastman                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -72,14 +72,25 @@ public:
    enum PolarizationType {
        /**
-         * Mutual polarization
+         * Full mutually induced polarization.  The dipoles are iterated until the converge to the accuracy specified
+         * by getMutualInducedTargetEpsilon().
         */
        Mutual = 0,
        /**
-         * Direct polarization
+         * Direct polarization approximation.  The induced dipoles depend only on the fixed multipoles, not on other
+         * induced dipoles.
         */
-        Direct = 1
+        Direct = 1,
+        /**
+         * Extrapolated perturbation theory approximation.  The dipoles are iterated a few times, and then an analytic
+         * approximation is used to extrapolate to the fully converged values.  Call setExtrapolationCoefficients()
+         * to set the coefficients used for the extrapolation.  The default coefficients used in this release are
+         * [0, -0.3, 0, 1.3], but be aware that those may change in a future release.
+         */
+        Extrapolated = 2
    };
    enum MultipoleAxisTypes { ZThenX = 0, Bisector = 1, ZBisect = 2, ThreeFold = 3, ZOnly = 4, NoAxisType = 5, LastAxisTypeIndex = 6 };
@@ -298,6 +309,23 @@ public:
     */
    void setMutualInducedTargetEpsilon(double inputMutualInducedTargetEpsilon);
+    /**
+     * Set the coefficients for the mu_0, mu_1, mu_2, ..., mu_n terms in the extrapolation
+     * algorithm for induced dipoles.
+     *
+     * @param coefficients      a vector whose mth entry specifies the coefficient for mu_m.  The length of this
+     *                          vector determines how many iterations are performed.
+     *
+     */
+    void setExtrapolationCoefficients(const std::vector<double> &coefficients);
+    /**
+     * Get the coefficients for the mu_0, mu_1, mu_2, ..., mu_n terms in the extrapolation
+     * algorithm for induced dipoles.  In this release, the default values for the coefficients are
+     * [0, -0.3, 0, 1.3], but be aware that those may change in a future release.
+     */
+    const std::vector<double>& getExtrapolationCoefficients() const;
    /**
     * Get the error tolerance for Ewald summation.  This corresponds to the fractional error in the forces
     * which is acceptable.  This value is used to select the grid dimensions and separation (alpha)
@@ -384,6 +412,8 @@ private:
    int pmeBSplineOrder;
    std::vector<int> pmeGridDimension;
    int mutualInducedMaxIterations;
+    std::vector<double> extrapolationCoefficients;
    double mutualInducedTargetEpsilon;
    double scalingDistanceCutoff;
    double electricConstant;

--- a/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
 * Authors:                                                                   *
 * Contributors:                                                              *
 *                                                                            *
@@ -43,6 +43,10 @@ AmoebaMultipoleForce::AmoebaMultipoleForce() : nonbondedMethod(NoCutoff), polari
                                               mutualInducedTargetEpsilon(1.0e-02), scalingDistanceCutoff(100.0), electricConstant(138.9354558456), aewald(0.0) {
    pmeGridDimension.resize(3);
    pmeGridDimension[0] = pmeGridDimension[1] = pmeGridDimension[2];
+    extrapolationCoefficients.push_back(0.0);
+    extrapolationCoefficients.push_back(-0.3);
+    extrapolationCoefficients.push_back(0.0);
+    extrapolationCoefficients.push_back(1.3);
 }
 AmoebaMultipoleForce::NonbondedMethod AmoebaMultipoleForce::getNonbondedMethod() const {
@@ -61,6 +65,14 @@ void AmoebaMultipoleForce::setPolarizationType(AmoebaMultipoleForce::Polarizatio
    polarizationType = type;
 }
+void AmoebaMultipoleForce::setExtrapolationCoefficients(const std::vector<double> &coefficients) {
+    extrapolationCoefficients = coefficients;
+}
+const std::vector<double> & AmoebaMultipoleForce::getExtrapolationCoefficients() const {
+    return extrapolationCoefficients;
+}
 double AmoebaMultipoleForce::getCutoffDistance() const {
    return cutoffDistance;
 }

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -810,7 +810,10 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri
        multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL), sphericalDipoles(NULL), sphericalQuadrupoles(NULL),
        fracDipoles(NULL), fracQuadrupoles(NULL), field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), inducedDipole(NULL),
        diisCoefficients(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), prevDipoles(NULL), prevDipolesPolar(NULL), prevDipolesGk(NULL),
-        prevDipolesGkPolar(NULL), prevErrors(NULL), diisMatrix(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
+        prevDipolesGkPolar(NULL), prevErrors(NULL), diisMatrix(NULL), polarizability(NULL), extrapolatedDipole(NULL), extrapolatedDipolePolar(NULL),
+        extrapolatedDipoleGk(NULL), extrapolatedDipoleGkPolar(NULL), inducedDipoleFieldGradient(NULL), inducedDipoleFieldGradientPolar(NULL),
+        inducedDipoleFieldGradientGk(NULL), inducedDipoleFieldGradientGkPolar(NULL), extrapolatedDipoleFieldGradient(NULL), extrapolatedDipoleFieldGradientPolar(NULL),
+        extrapolatedDipoleFieldGradientGk(NULL), extrapolatedDipoleFieldGradientGkPolar(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
        pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL),
        pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeCphi(NULL), pmeAtomGridIndex(NULL), lastPositions(NULL), sort(NULL), gkKernel(NULL) {
 }
@@ -867,6 +870,30 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
        delete diisMatrix;
    if (diisCoefficients != NULL)
        delete diisCoefficients;
+    if (extrapolatedDipole != NULL)
+        delete extrapolatedDipole;
+    if (extrapolatedDipolePolar != NULL)
+        delete extrapolatedDipolePolar;
+    if (extrapolatedDipoleGk != NULL)
+        delete extrapolatedDipoleGk;
+    if (extrapolatedDipoleGkPolar != NULL)
+        delete extrapolatedDipoleGkPolar;
+    if (inducedDipoleFieldGradient != NULL)
+        delete inducedDipoleFieldGradient;
+    if (inducedDipoleFieldGradientPolar != NULL)
+        delete inducedDipoleFieldGradientPolar;
+    if (inducedDipoleFieldGradientGk != NULL)
+        delete inducedDipoleFieldGradientGk;
+    if (inducedDipoleFieldGradientGkPolar != NULL)
+        delete inducedDipoleFieldGradientGkPolar;
+    if (extrapolatedDipoleFieldGradient != NULL)
+        delete extrapolatedDipoleFieldGradient;
+    if (extrapolatedDipoleFieldGradientPolar != NULL)
+        delete extrapolatedDipoleFieldGradientPolar;
+    if (extrapolatedDipoleFieldGradientGk != NULL)
+        delete extrapolatedDipoleFieldGradientGk;
+    if (extrapolatedDipoleFieldGradientGkPolar != NULL)
+        delete extrapolatedDipoleFieldGradientGkPolar;
    if (polarizability != NULL)
        delete polarizability;
    if (covalentFlags != NULL)
@@ -967,6 +994,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    // Create workspace arrays.
+    polarizationType = force.getPolarizationType();
    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
    labFrameDipoles = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "labFrameDipoles");
    labFrameQuadrupoles = new CudaArray(cu, 5*paddedNumAtoms, elementSize, "labFrameQuadrupoles");
@@ -979,12 +1007,23 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    torque = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "torque");
    inducedDipole = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "inducedDipole");
    inducedDipolePolar = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "inducedDipolePolar");
-    inducedDipoleErrors = new CudaArray(cu, cu.getNumThreadBlocks(), sizeof(float2), "inducedDipoleErrors");
+    if (polarizationType == AmoebaMultipoleForce::Mutual) {
-    prevDipoles = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipoles");
+        inducedDipoleErrors = new CudaArray(cu, cu.getNumThreadBlocks(), sizeof(float2), "inducedDipoleErrors");
-    prevDipolesPolar = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipolesPolar");
+        prevDipoles = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipoles");
-    prevErrors = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevErrors");
+        prevDipolesPolar = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipolesPolar");
-    diisMatrix = new CudaArray(cu, MaxPrevDIISDipoles*MaxPrevDIISDipoles, elementSize, "diisMatrix");
+        prevErrors = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevErrors");
-    diisCoefficients = new CudaArray(cu, MaxPrevDIISDipoles+1, sizeof(float), "diisMatrix");
+        diisMatrix = new CudaArray(cu, MaxPrevDIISDipoles*MaxPrevDIISDipoles, elementSize, "diisMatrix");
+        diisCoefficients = new CudaArray(cu, MaxPrevDIISDipoles+1, sizeof(float), "diisMatrix");
+    }
+    else if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
+        int numOrders = force.getExtrapolationCoefficients().size();
+        extrapolatedDipole = new CudaArray(cu, 3*numMultipoles*numOrders, elementSize, "extrapolatedDipole");
+        extrapolatedDipolePolar = new CudaArray(cu, 3*numMultipoles*numOrders, elementSize, "extrapolatedDipolePolar");
+        inducedDipoleFieldGradient = new CudaArray(cu, 6*paddedNumAtoms, sizeof(long long), "inducedDipoleFieldGradient");
+        inducedDipoleFieldGradientPolar = new CudaArray(cu, 6*paddedNumAtoms, sizeof(long long), "inducedDipoleFieldGradientPolar");
+        extrapolatedDipoleFieldGradient = new CudaArray(cu, 6*numMultipoles*(numOrders-1), elementSize, "extrapolatedDipoleFieldGradient");
+        extrapolatedDipoleFieldGradientPolar = new CudaArray(cu, 6*numMultipoles*(numOrders-1), elementSize, "extrapolatedDipoleFieldGradientPolar");
+    }
    cu.addAutoclearBuffer(*field);
    cu.addAutoclearBuffer(*fieldPolar);
    cu.addAutoclearBuffer(*torque);
@@ -1036,14 +1075,16 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    // Record other options.
-    if (force.getPolarizationType() == AmoebaMultipoleForce::Mutual) {
+    if (polarizationType == AmoebaMultipoleForce::Mutual) {
        maxInducedIterations = force.getMutualInducedMaxIterations();
        inducedEpsilon = force.getMutualInducedTargetEpsilon();
-        inducedField = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "inducedField");
-        inducedFieldPolar = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "inducedFieldPolar");
    }
    else
        maxInducedIterations = 0;
+    if (polarizationType != AmoebaMultipoleForce::Direct) {
+        inducedField = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "inducedField");
+        inducedFieldPolar = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "inducedFieldPolar");
+    }
    usePME = (force.getNonbondedMethod() == AmoebaMultipoleForce::PME);
    // See whether there's an AmoebaGeneralizedKirkwoodForce in the System.
@@ -1058,6 +1099,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    bool useShuffle = (cu.getComputeCapability() >= 3.0 && !cu.getUseDoublePrecision());
    double fixedThreadMemory = 19*elementSize+2*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
    double inducedThreadMemory = 15*elementSize+2*sizeof(float);
+    if (polarizationType == AmoebaMultipoleForce::Extrapolated)
+        inducedThreadMemory += 12*elementSize;
    double electrostaticsThreadMemory = 0;
    if (!useShuffle)
        fixedThreadMemory += 3*elementSize;
@@ -1066,8 +1109,12 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
    defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
    defines["ENERGY_SCALE_FACTOR"] = cu.doubleToString(138.9354558456/innerDielectric);
-    if (force.getPolarizationType() == AmoebaMultipoleForce::Direct)
+    if (polarizationType == AmoebaMultipoleForce::Direct)
        defines["DIRECT_POLARIZATION"] = "";
+    else if (polarizationType == AmoebaMultipoleForce::Mutual)
+        defines["MUTUAL_POLARIZATION"] = "";
+    else if (polarizationType == AmoebaMultipoleForce::Extrapolated)
+        defines["EXTRAPOLATED_POLARIZATION"] = "";
    if (useShuffle)
        defines["USE_SHUFFLE"] = "";
    if (hasQuadrupoles)
@@ -1080,6 +1127,18 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
    defines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
    defines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
+    maxExtrapolationOrder = force.getExtrapolationCoefficients().size();
+    defines["MAX_EXTRAPOLATION_ORDER"] = cu.intToString(maxExtrapolationOrder);
+    stringstream coefficients;
+    for (int i = 0; i < maxExtrapolationOrder; i++) {
+        if (i > 0)
+            coefficients << ",";
+        double sum = 0;
+        for (int j = i; j < maxExtrapolationOrder; j++)
+            sum += force.getExtrapolationCoefficients()[j];
+        coefficients << cu.doubleToString(sum);
+    }
+    defines["EXTRAPOLATION_COEFFICIENTS_SUM"] = coefficients.str();
    alpha = force.getAEwald();
    if (usePME) {
        vector<int> pmeGridDimension;
@@ -1113,8 +1172,20 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        defines["GK_FQ"] = cu.doubleToString(3*(1-solventDielectric)/(2+3*solventDielectric));
        fixedThreadMemory += 4*elementSize;
        inducedThreadMemory += 13*elementSize;
-        prevDipolesGk = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipolesGk");
+        if (polarizationType == AmoebaMultipoleForce::Mutual) {
-        prevDipolesGkPolar = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipolesGkPolar");
+            prevDipolesGk = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipolesGk");
+            prevDipolesGkPolar = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipolesGkPolar");
+        }
+        else if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
+            inducedThreadMemory += 12*elementSize;
+            int numOrders = force.getExtrapolationCoefficients().size();
+            extrapolatedDipoleGk = new CudaArray(cu, 3*numMultipoles*numOrders, elementSize, "extrapolatedDipoleGk");
+            extrapolatedDipoleGkPolar = new CudaArray(cu, 3*numMultipoles*numOrders, elementSize, "extrapolatedDipoleGkPolar");
+            inducedDipoleFieldGradientGk = new CudaArray(cu, 6*numMultipoles, elementSize, "inducedDipoleFieldGradientGk");
+            inducedDipoleFieldGradientGkPolar = new CudaArray(cu, 6*numMultipoles, elementSize, "inducedDipoleFieldGradientGkPolar");
+            extrapolatedDipoleFieldGradientGk = new CudaArray(cu, 6*numMultipoles*(numOrders-1), elementSize, "extrapolatedDipoleFieldGradientGk");
+            extrapolatedDipoleFieldGradientGkPolar = new CudaArray(cu, 6*numMultipoles*(numOrders-1), elementSize, "extrapolatedDipoleFieldGradientGkPolar");
+        }
    }
    int maxThreads = cu.getNonbondedUtilities().getForceThreadBlockSize();
    fixedFieldThreads = min(maxThreads, cu.computeThreadBlockSize(fixedThreadMemory));
@@ -1127,15 +1198,18 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    defines["THREAD_BLOCK_SIZE"] = cu.intToString(fixedFieldThreads);
    module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleFixedField, defines);
    computeFixedFieldKernel = cu.getKernel(module, "computeFixedField");
-    if (maxInducedIterations > 0) {
+    if (polarizationType != AmoebaMultipoleForce::Direct) {
        defines["THREAD_BLOCK_SIZE"] = cu.intToString(inducedFieldThreads);
        defines["MAX_PREV_DIIS_DIPOLES"] = cu.intToString(MaxPrevDIISDipoles);
-        defines["USE_MUTUAL_POLARIZATION"] = "1";
        module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleInducedField, defines);
        computeInducedFieldKernel = cu.getKernel(module, "computeInducedField");
        updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldByDIIS");
        recordDIISDipolesKernel = cu.getKernel(module, "recordInducedDipolesForDIIS");
        buildMatrixKernel = cu.getKernel(module, "computeDIISMatrix");
+        initExtrapolatedKernel = cu.getKernel(module, "initExtrapolatedDipoles");
+        iterateExtrapolatedKernel = cu.getKernel(module, "iterateExtrapolatedDipoles");
+        computeExtrapolatedKernel = cu.getKernel(module, "computeExtrapolatedDipoles");
+        addExtrapolatedGradientKernel = cu.getKernel(module, "addExtrapolatedFieldGradientToForce");
    }
    stringstream electrostaticsSource;
    electrostaticsSource << CudaKernelSources::vectorOps;
@@ -1166,8 +1240,12 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
        pmeDefines["M_PI"] = cu.doubleToString(M_PI);
        pmeDefines["SQRT_PI"] = cu.doubleToString(sqrt(M_PI));
-        if (force.getPolarizationType() == AmoebaMultipoleForce::Direct)
+        if (polarizationType == AmoebaMultipoleForce::Direct)
            pmeDefines["DIRECT_POLARIZATION"] = "";
+        else if (polarizationType == AmoebaMultipoleForce::Mutual)
+            pmeDefines["MUTUAL_POLARIZATION"] = "";
+        else if (polarizationType == AmoebaMultipoleForce::Extrapolated)
+            pmeDefines["EXTRAPOLATED_POLARIZATION"] = "";
        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines);
        pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
        pmeTransformMultipolesKernel = cu.getKernel(module, "transformMultipolesToFractionalCoordinates");
@@ -1407,7 +1485,6 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
    int numTileIndices = nb.getNumTiles();
    int numForceThreadBlocks = nb.getNumForceThreadBlocks();
    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-    void* npt = NULL;
    if (pmeGrid == NULL) {
        // Compute induced dipoles.
@@ -1436,25 +1513,10 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        // Iterate until the dipoles converge.
+        if (polarizationType == AmoebaMultipoleForce::Extrapolated)
+            computeExtrapolatedDipoles(NULL);
        for (int i = 0; i < maxInducedIterations; i++) {
-            cu.clearBuffer(*inducedField);
+            computeInducedField(NULL);
-            cu.clearBuffer(*inducedFieldPolar);
-            if (gkKernel == NULL) {
-                void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
-                    &nb.getExclusionTiles().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
-                    &dampingAndThole->getDevicePointer()};
-                cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
-            }
-            else {
-                cu.clearBuffer(*gkKernel->getInducedField());
-                cu.clearBuffer(*gkKernel->getInducedFieldPolar());
-                void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
-                    &nb.getExclusionTiles().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
-                    &gkKernel->getInducedField()->getDevicePointer(), &gkKernel->getInducedFieldPolar()->getDevicePointer(),
-                    &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
-                    &gkKernel->getBornRadii()->getDevicePointer(), &dampingAndThole->getDevicePointer()};
-                cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
-            }
            bool converged = iterateDipolesByDIIS(i);
            if (converged)
                break;
@@ -1579,34 +1641,10 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        // Iterate until the dipoles converge.
-        vector<float2> errors;
+        if (polarizationType == AmoebaMultipoleForce::Extrapolated)
+            computeExtrapolatedDipoles(recipBoxVectorPointer);
        for (int i = 0; i < maxInducedIterations; i++) {
-            cu.clearBuffer(*inducedField);
+            computeInducedField(recipBoxVectorPointer);
-            cu.clearBuffer(*inducedFieldPolar);
-            void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
-                &nb.getExclusionTiles().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
-                &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(),
-                cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
-                &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
-                &dampingAndThole->getDevicePointer()};
-            cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
-            cu.clearBuffer(*pmeGrid);
-            cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
-            if (cu.getUseDoublePrecision())
-                cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize());
-            if (cu.getUseDoublePrecision())
-                cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
-            else
-                cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
-            cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms());
-            if (cu.getUseDoublePrecision())
-                cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
-            else
-                cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
-            cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
-            void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
-                &inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
-            cu.executeKernel(pmeRecordInducedFieldDipolesKernel, pmeRecordInducedFieldDipolesArgs, cu.getNumAtoms());
            bool converged = iterateDipolesByDIIS(i);
            if (converged)
                break;
@@ -1632,6 +1670,23 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            &pmePhip->getDevicePointer(), &pmePhidp->getDevicePointer(), &pmeCphi->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeInducedForceKernel, pmeInducedForceArgs, cu.getNumAtoms());
    }
+    // If using extrapolated polarization, add in force contributions from µ(m) T µ(n).
+    if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
+        if (gkKernel == NULL) {
+            void* extrapolatedArgs[] = {&cu.getForce().getDevicePointer(), &extrapolatedDipole->getDevicePointer(),
+                &extrapolatedDipolePolar->getDevicePointer(), &extrapolatedDipoleFieldGradient->getDevicePointer(), &extrapolatedDipoleFieldGradientPolar->getDevicePointer()};
+            cu.executeKernel(addExtrapolatedGradientKernel, extrapolatedArgs, numMultipoles);
+        }
+        else {
+            void* extrapolatedArgs[] = {&cu.getForce().getDevicePointer(), &extrapolatedDipole->getDevicePointer(),
+                &extrapolatedDipolePolar->getDevicePointer(), &extrapolatedDipoleFieldGradient->getDevicePointer(), &extrapolatedDipoleFieldGradientPolar->getDevicePointer(),
+                &extrapolatedDipoleGk->getDevicePointer(), &extrapolatedDipoleGkPolar->getDevicePointer(),
+                &extrapolatedDipoleFieldGradientGk->getDevicePointer(), &extrapolatedDipoleFieldGradientGkPolar->getDevicePointer()};
+            cu.executeKernel(addExtrapolatedGradientKernel, extrapolatedArgs, numMultipoles);
+        }
+    }
    // Map torques to force.
@@ -1646,6 +1701,109 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
    return 0.0;
 }
+void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVectorPointer) {
+    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
+    int startTileIndex = nb.getStartTileIndex();
+    int numTileIndices = nb.getNumTiles();
+    int numForceThreadBlocks = nb.getNumForceThreadBlocks();
+    unsigned int maxTiles = 0;
+    vector<void*> computeInducedFieldArgs;
+    computeInducedFieldArgs.push_back(&inducedField->getDevicePointer());
+    computeInducedFieldArgs.push_back(&inducedFieldPolar->getDevicePointer());
+    computeInducedFieldArgs.push_back(&cu.getPosq().getDevicePointer());
+    computeInducedFieldArgs.push_back(&nb.getExclusionTiles().getDevicePointer());
+    computeInducedFieldArgs.push_back(&inducedDipole->getDevicePointer());
+    computeInducedFieldArgs.push_back(&inducedDipolePolar->getDevicePointer());
+    computeInducedFieldArgs.push_back(&startTileIndex);
+    computeInducedFieldArgs.push_back(&numTileIndices);
+    if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
+        computeInducedFieldArgs.push_back(&inducedDipoleFieldGradient->getDevicePointer());
+        computeInducedFieldArgs.push_back(&inducedDipoleFieldGradientPolar->getDevicePointer());
+    }
+    if (pmeGrid != NULL) {
+        computeInducedFieldArgs.push_back(&nb.getInteractingTiles().getDevicePointer());
+        computeInducedFieldArgs.push_back(&nb.getInteractionCount().getDevicePointer());
+        computeInducedFieldArgs.push_back(cu.getPeriodicBoxSizePointer());
+        computeInducedFieldArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+        computeInducedFieldArgs.push_back(cu.getPeriodicBoxVecXPointer());
+        computeInducedFieldArgs.push_back(cu.getPeriodicBoxVecYPointer());
+        computeInducedFieldArgs.push_back(cu.getPeriodicBoxVecZPointer());
+        computeInducedFieldArgs.push_back(&maxTiles);
+        computeInducedFieldArgs.push_back(&nb.getBlockCenters().getDevicePointer());
+        computeInducedFieldArgs.push_back(&nb.getInteractingAtoms().getDevicePointer());
+    }
+    if (gkKernel != NULL) {
+        computeInducedFieldArgs.push_back(&gkKernel->getInducedField()->getDevicePointer());
+        computeInducedFieldArgs.push_back(&gkKernel->getInducedFieldPolar()->getDevicePointer());
+        computeInducedFieldArgs.push_back(&gkKernel->getInducedDipoles()->getDevicePointer());
+        computeInducedFieldArgs.push_back(&gkKernel->getInducedDipolesPolar()->getDevicePointer());
+        computeInducedFieldArgs.push_back(&gkKernel->getBornRadii()->getDevicePointer());
+        if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
+            computeInducedFieldArgs.push_back(&inducedDipoleFieldGradientGk->getDevicePointer());
+            computeInducedFieldArgs.push_back(&inducedDipoleFieldGradientGkPolar->getDevicePointer());
+        }
+    }
+    computeInducedFieldArgs.push_back(&dampingAndThole->getDevicePointer());
+    cu.clearBuffer(*inducedField);
+    cu.clearBuffer(*inducedFieldPolar);
+    if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
+        cu.clearBuffer(*inducedDipoleFieldGradient);
+        cu.clearBuffer(*inducedDipoleFieldGradientPolar);
+    }
+    if (gkKernel != NULL) {
+        cu.clearBuffer(*gkKernel->getInducedField());
+        cu.clearBuffer(*gkKernel->getInducedFieldPolar());
+        if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
+            cu.clearBuffer(*inducedDipoleFieldGradientGk);
+            cu.clearBuffer(*inducedDipoleFieldGradientGkPolar);
+        }
+    }
+    if (pmeGrid == NULL)
+        cu.executeKernel(computeInducedFieldKernel, &computeInducedFieldArgs[0], numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
+    else {
+        maxTiles = nb.getInteractingTiles().getSize();
+        cu.executeKernel(computeInducedFieldKernel, &computeInducedFieldArgs[0], numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
+        cu.clearBuffer(*pmeGrid);
+        void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
+            &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
+        if (cu.getUseDoublePrecision()) {
+            void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()};
+            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize());
+        }
+        if (cu.getUseDoublePrecision())
+            cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
+        else
+            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
+        void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(),
+            &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms());
+        if (cu.getUseDoublePrecision())
+            cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
+        else
+            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
+        void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
+            &pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(),
+            cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2],
+            &pmeAtomGridIndex->getDevicePointer()};
+        cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
+        if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
+            void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
+                &inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &inducedDipole->getDevicePointer(),
+                &inducedDipolePolar->getDevicePointer(), &inducedDipoleFieldGradient->getDevicePointer(), &inducedDipoleFieldGradientPolar->getDevicePointer(),
+                recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+            cu.executeKernel(pmeRecordInducedFieldDipolesKernel, pmeRecordInducedFieldDipolesArgs, cu.getNumAtoms());
+        }
+        else {
+            void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
+                &inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
+                recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+            cu.executeKernel(pmeRecordInducedFieldDipolesKernel, pmeRecordInducedFieldDipolesArgs, cu.getNumAtoms());
+        }
+    }
+}
 bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
    void* npt = NULL;
    bool trueValue = true, falseValue = false;
@@ -1744,6 +1902,62 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
    return false;
 }
+void CudaCalcAmoebaMultipoleForceKernel::computeExtrapolatedDipoles(void** recipBoxVectorPointer) {
+    // Start by storing the direct dipoles as PT0
+    if (gkKernel == NULL) {
+        void* initArgs[] = {&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &extrapolatedDipole->getDevicePointer(),
+            &extrapolatedDipolePolar->getDevicePointer(), &inducedDipoleFieldGradient->getDevicePointer(), &inducedDipoleFieldGradientPolar->getDevicePointer()};
+        cu.executeKernel(initExtrapolatedKernel, initArgs, extrapolatedDipole->getSize());
+    }
+    else {
+        void* initArgs[] = {&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &extrapolatedDipole->getDevicePointer(),
+            &extrapolatedDipolePolar->getDevicePointer(), &inducedDipoleFieldGradient->getDevicePointer(), &inducedDipoleFieldGradientPolar->getDevicePointer(),
+            &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(), &extrapolatedDipoleGk->getDevicePointer(),
+            &extrapolatedDipoleGkPolar->getDevicePointer(), &inducedDipoleFieldGradientGk->getDevicePointer(), &inducedDipoleFieldGradientGkPolar->getDevicePointer()};
+        cu.executeKernel(initExtrapolatedKernel, initArgs, extrapolatedDipole->getSize());
+    }
+    // Recursively apply alpha.Tau to the µ_(n) components to generate µ_(n+1), and store the result
+    for (int order = 1; order < maxExtrapolationOrder; ++order) {
+        computeInducedField(recipBoxVectorPointer);
+        if (gkKernel == NULL) {
+            void* iterateArgs[] = {&order, &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &extrapolatedDipole->getDevicePointer(),
+                &extrapolatedDipolePolar->getDevicePointer(), &inducedDipoleFieldGradient->getDevicePointer(), &inducedDipoleFieldGradientPolar->getDevicePointer(),
+                &inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &extrapolatedDipoleFieldGradient->getDevicePointer(), &extrapolatedDipoleFieldGradientPolar->getDevicePointer(),
+                &polarizability->getDevicePointer()};
+            cu.executeKernel(iterateExtrapolatedKernel, iterateArgs, extrapolatedDipole->getSize());
+        }
+        else {
+            void* iterateArgs[] = {&order, &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &extrapolatedDipole->getDevicePointer(),
+                &extrapolatedDipolePolar->getDevicePointer(), &inducedDipoleFieldGradient->getDevicePointer(), &inducedDipoleFieldGradientPolar->getDevicePointer(),
+                &inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &extrapolatedDipoleFieldGradient->getDevicePointer(), &extrapolatedDipoleFieldGradientPolar->getDevicePointer(),
+                &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(), &extrapolatedDipoleGk->getDevicePointer(),
+                &extrapolatedDipoleGkPolar->getDevicePointer(), &inducedDipoleFieldGradientGk->getDevicePointer(), &inducedDipoleFieldGradientGkPolar->getDevicePointer(),
+                &gkKernel->getInducedField()->getDevicePointer(), &gkKernel->getInducedFieldPolar()->getDevicePointer(),
+                &extrapolatedDipoleFieldGradientGk->getDevicePointer(), &extrapolatedDipoleFieldGradientGkPolar->getDevicePointer(),
+                &polarizability->getDevicePointer()};
+            cu.executeKernel(iterateExtrapolatedKernel, iterateArgs, extrapolatedDipole->getSize());
+        }
+    }
+    // Take a linear combination of the µ_(n) components to form the total dipole
+    if (gkKernel == NULL) {
+        void* computeArgs[] = {&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &extrapolatedDipole->getDevicePointer(),
+                &extrapolatedDipolePolar->getDevicePointer()};
+        cu.executeKernel(computeExtrapolatedKernel, computeArgs, extrapolatedDipole->getSize());
+    }
+    else {
+        void* computeArgs[] = {&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &extrapolatedDipole->getDevicePointer(),
+                &extrapolatedDipolePolar->getDevicePointer(), &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
+                &extrapolatedDipoleGk->getDevicePointer(), &extrapolatedDipoleGkPolar->getDevicePointer()};
+        cu.executeKernel(computeExtrapolatedKernel, computeArgs, extrapolatedDipole->getSize());
+    }
+    computeInducedField(recipBoxVectorPointer);
+}
 void CudaCalcAmoebaMultipoleForceKernel::ensureMultipolesValid(ContextImpl& context) {
    if (multipolesAreValid) {
        int numParticles = cu.getNumAtoms();
@@ -2088,7 +2302,8 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& syst
    bornForce = CudaArray::create<long long>(cu, paddedNumAtoms, "bornForce");
    inducedDipoleS = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "inducedDipoleS");
    inducedDipolePolarS = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "inducedDipolePolarS");
-    if (multipoles->getPolarizationType() == AmoebaMultipoleForce::Mutual) {
+    polarizationType = multipoles->getPolarizationType();
+    if (polarizationType != AmoebaMultipoleForce::Direct) {
        inducedField = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "gkInducedField");
        inducedFieldPolar = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "gkInducedFieldPolar");
    }
@@ -2141,8 +2356,12 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& syst
    defines["EPSILON_FACTOR"] = cu.doubleToString(138.9354558456);
    defines["M_PI"] = cu.doubleToString(M_PI);
    defines["ENERGY_SCALE_FACTOR"] = cu.doubleToString(138.9354558456/force.getSoluteDielectric());
-    if (multipoles->getPolarizationType() == AmoebaMultipoleForce::Direct)
+    if (polarizationType == AmoebaMultipoleForce::Direct)
        defines["DIRECT_POLARIZATION"] = "";
+    else if (polarizationType == AmoebaMultipoleForce::Mutual)
+        defines["MUTUAL_POLARIZATION"] = "";
+    else if (polarizationType == AmoebaMultipoleForce::Extrapolated)
+        defines["EXTRAPOLATED_POLARIZATION"] = "";
    includeSurfaceArea = force.getIncludeCavityTerm();
    if (includeSurfaceArea) {
        defines["SURFACE_AREA_FACTOR"] = cu.doubleToString(force.getSurfaceAreaFactor());

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -385,14 +385,17 @@ private:
        const char* getSortKey() const {return "value.y";}
    };
    void initializeScaleFactors();
+    void computeInducedField(void** recipBoxVectorPointer);
    bool iterateDipolesByDIIS(int iteration);
+    void computeExtrapolatedDipoles(void** recipBoxVectorPointer);
    void ensureMultipolesValid(ContextImpl& context);
    template <class T, class T4, class M4> void computeSystemMultipoleMoments(ContextImpl& context, std::vector<double>& outputMultipoleMoments);
-    int numMultipoles, maxInducedIterations;
+    int numMultipoles, maxInducedIterations, maxExtrapolationOrder;
    int fixedFieldThreads, inducedFieldThreads, electrostaticsThreads;
    int gridSizeX, gridSizeY, gridSizeZ;
    double alpha, inducedEpsilon;
    bool usePME, hasQuadrupoles, hasInitializedScaleFactors, hasInitializedFFT, multipolesAreValid;
+    AmoebaMultipoleForce::PolarizationType polarizationType;
    CudaContext& cu;
    const System& system;
    std::vector<int3> covalentFlagValues;
@@ -422,6 +425,18 @@ private:
    CudaArray* prevErrors;
    CudaArray* diisMatrix;
    CudaArray* diisCoefficients;
+    CudaArray* extrapolatedDipole;
+    CudaArray* extrapolatedDipolePolar;
+    CudaArray* extrapolatedDipoleGk;
+    CudaArray* extrapolatedDipoleGkPolar;
+    CudaArray* inducedDipoleFieldGradient;
+    CudaArray* inducedDipoleFieldGradientPolar;
+    CudaArray* inducedDipoleFieldGradientGk;
+    CudaArray* inducedDipoleFieldGradientGkPolar;
+    CudaArray* extrapolatedDipoleFieldGradient;
+    CudaArray* extrapolatedDipoleFieldGradientPolar;
+    CudaArray* extrapolatedDipoleFieldGradientGk;
+    CudaArray* extrapolatedDipoleFieldGradientGkPolar;
    CudaArray* polarizability;
    CudaArray* covalentFlags;
    CudaArray* polarizationGroupFlags;
@@ -444,6 +459,7 @@ private:
    CUfunction pmeGridIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
    CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
    CUfunction recordDIISDipolesKernel, buildMatrixKernel;
+    CUfunction initExtrapolatedKernel, iterateExtrapolatedKernel, computeExtrapolatedKernel, addExtrapolatedGradientKernel;
    CUfunction pmeTransformMultipolesKernel, pmeTransformPotentialKernel;
    CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel;
    static const int PmeOrder = 5;
@@ -512,6 +528,7 @@ private:
    const System& system;
    bool includeSurfaceArea, hasInitializedKernels;
    int computeBornSumThreads, gkForceThreads, chainRuleThreads, ediffThreads;
+    AmoebaMultipoleForce::PolarizationType polarizationType;
    std::map<std::string, std::string> defines;
    CudaArray* params;
    CudaArray* bornSum;

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaInPlaneForce.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaInPlaneForce.cu
@@ -39,15 +39,23 @@ real ym = zcp*xap - xcp*zap;
 real zm = xcp*yap - ycp*xap;
 real rm = max(SQRT(xm*xm + ym*ym + zm*zm), (real) 1e-6f);
-real dot = xap*xcp + yap*ycp + zap*zcp;
+real dotp = xap*xcp + yap*ycp + zap*zcp;
 real product = SQRT(rap2*rcp2);
-real cosine = (product > 0 ? (dot/product) : 0);
+real cosine = (product > 0 ? (dotp/product) : 0);
 cosine = max(min(cosine, (real) 1), (real) -1);
-real angle = ACOS(cosine);
+real angle;
+if (cosine > 0.99f || cosine < -0.99f) {
+    real3 cross_prod = cross(make_real3(xap, yap, zap), make_real3(xcp, ycp, zcp));
+    angle = ASIN(SQRT(dot(cross_prod, cross_prod)/(rap2*rcp2)))*RAD_TO_DEG;
+    if (cosine < 0.0f)
+        angle = 180-angle;
+}
+else
+    angle = ACOS(cosine)*RAD_TO_DEG;
 // if product == 0, set force/energy to 0
-real deltaIdeal = (product > 0 ? (angle*RAD_TO_DEG - angleParams.x) : 0);
+real deltaIdeal = (product > 0 ? (angle - angleParams.x) : 0);
 real deltaIdeal2 = deltaIdeal*deltaIdeal;
 real deltaIdeal3 = deltaIdeal*deltaIdeal2;
 real deltaIdeal4 = deltaIdeal2*deltaIdeal2;
@@ -111,4 +119,4 @@ real dedzid = -dedzia - dedzib - dedzic;
 real3 force1 = make_real3(-dedxia, -dedyia, -dedzia);
 real3 force2 = make_real3(-dedxib, -dedyib, -dedzib);
 real3 force3 = make_real3(-dedxic, -dedyic, -dedzic);
 real3 force4 = make_real3(-dedxid, -dedyid, -dedzid);
\ No newline at end of file
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaStretchBendForce.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaStretchBendForce.cu
@@ -17,18 +17,27 @@ real zp = xcb*yab - ycb*xab;
 real rp = SQRT(xp*xp + yp*yp + zp*zp);
-real dot = xab*xcb + yab*ycb + zab*zcb;
+real dotp = xab*xcb + yab*ycb + zab*zcb;
-real cosine = rab*rcb > 0 ? (dot / (rab*rcb)) : (real) 1;
+real cosine = rab*rcb > 0 ? (dotp / (rab*rcb)) : (real) 1;
 cosine = (cosine > 1 ? (real) 1 : cosine);
 cosine = (cosine < -1 ? -(real) 1 : cosine);
-real angle = ACOS(cosine);
+real angle;
+if (cosine > 0.99f || cosine < -0.99f) {
+    // Highly unlikely a stretch-bend angle will be near 0 or 180, but just in case...
+    real3 cross_prod = cross(make_real3(xab, yab, zab), make_real3(xcb, ycb, zcb));
+    angle = ASIN(SQRT(dot(cross_prod, cross_prod))/(rab*rcb))*RAD_TO_DEG;
+    if (cosine < 0.0f)
+        angle = 180-angle;
+}
+else
+    angle = ACOS(cosine)*RAD_TO_DEG;
 // find chain rule terms for the bond angle deviation
 float3 parameters = PARAMS[index];
 float2 force_constants = FORCE_CONSTANTS[index];
-real dt = RAD_TO_DEG*(angle - parameters.z);
+real dt = angle - RAD_TO_DEG*parameters.z;
 real terma = rab*rp != 0 ? (-RAD_TO_DEG/(rab*rab*rp)) : (real) 0;
 real termc = rcb*rp != 0 ? (RAD_TO_DEG/(rcb*rcb*rp)) : (real) 0;

--- a/plugins/amoeba/platforms/cuda/src/kernels/gkEDiffPairForce.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/gkEDiffPairForce.cu
@@ -365,7 +365,22 @@ __device__ void computeOneEDiffInteractionT3(AtomData4& atom1, volatile AtomData
    // correction to convert mutual to direct polarization force
-#ifdef DIRECT_POLARIZATION
+#ifdef MUTUAL_POLARIZATION
+    real findmp1 = uscale*(scip2*ddsc3_1 - ddsc5_1*(sci3*scip4+scip3*sci4));
+    real findmp2 = uscale*(scip2*ddsc3_2 - ddsc5_2*(sci3*scip4+scip3*sci4));
+    real findmp3 = uscale*(scip2*ddsc3_3 - ddsc5_3*(sci3*scip4+scip3*sci4));
+    ftm2i1 -= 0.5f*findmp1;
+    ftm2i2 -= 0.5f*findmp2;
+    ftm2i3 -= 0.5f*findmp3;
+    real sci3X = sci3 - sci3Y;
+    real sci4X = sci4 - sci4Y;
+    real scip3X = scip3 - scip3Y;
+    real scip4X = scip4 - scip4Y;
+    ftm2i1 += 0.5f*uscale*(-ddsc5_1*(sci3X*scip4X+scip3X*sci4X));
+    ftm2i2 += 0.5f*uscale*(-ddsc5_2*(sci3X*scip4X+scip3X*sci4X));
+    ftm2i3 += 0.5f*uscale*(-ddsc5_3*(sci3X*scip4X+scip3X*sci4X)); 
+#else
    real gfd = (scip2*scale3i - 5*rr2*(scip3*sci4+sci3*scip4)*scale5i);
    real fdir1 = gfd*xr + scale5i* (sci4*atom1.inducedDipolePolarS.x+scip4*atom1.inducedDipoleS.x + sci3*atom2.inducedDipolePolarS.x+scip3*atom2.inducedDipoleS.x);
    real fdir2 = gfd*yr + scale5i* (sci4*atom1.inducedDipolePolarS.y+scip4*atom1.inducedDipoleS.y + sci3*atom2.inducedDipolePolarS.y+scip3*atom2.inducedDipoleS.y);
@@ -385,21 +400,6 @@ __device__ void computeOneEDiffInteractionT3(AtomData4& atom1, volatile AtomData
    ftm2i1 += 0.5f*fdir1;
    ftm2i2 += 0.5f*fdir2;
    ftm2i3 += 0.5f*fdir3;
-#else
-    real findmp1 = uscale*(scip2*ddsc3_1 - ddsc5_1*(sci3*scip4+scip3*sci4));
-    real findmp2 = uscale*(scip2*ddsc3_2 - ddsc5_2*(sci3*scip4+scip3*sci4));
-    real findmp3 = uscale*(scip2*ddsc3_3 - ddsc5_3*(sci3*scip4+scip3*sci4));
-    ftm2i1 -= 0.5f*findmp1;
-    ftm2i2 -= 0.5f*findmp2;
-    ftm2i3 -= 0.5f*findmp3;
-    real sci3X = sci3 - sci3Y;
-    real sci4X = sci4 - sci4Y;
-    real scip3X = scip3 - scip3Y;
-    real scip4X = scip4 - scip4Y;
-    ftm2i1 += 0.5f*uscale*(-ddsc5_1*(sci3X*scip4X+scip3X*sci4X));
-    ftm2i2 += 0.5f*uscale*(-ddsc5_2*(sci3X*scip4X+scip3X*sci4X));
-    ftm2i3 += 0.5f*uscale*(-ddsc5_3*(sci3X*scip4X+scip3X*sci4X)); 
 #endif
 #endif

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
@@ -327,7 +327,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
    real iEIY = qiUinpI.x*Vijp[1] + qiUindI.x*Vijd[1] - qiUinpI.y*Vijp[0] - qiUindI.y*Vijd[0];
    real iEJY = qiUinpJ.x*Vjip[1] + qiUindJ.x*Vjid[1] - qiUinpJ.y*Vjip[0] - qiUindJ.y*Vjid[0];
-#ifdef USE_MUTUAL_POLARIZATION
+#ifdef MUTUAL_POLARIZATION
    // Uind-Uind terms (m=0)
    real eCoef = -4*rInvVec[3]*thole_d0;
    real dCoef = 6*rInvVec[4]*dthole_d0;

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
@@ -3,9 +3,15 @@
 typedef struct {
    real3 pos;
    real3 field, fieldPolar, inducedDipole, inducedDipolePolar;
+#ifdef EXTRAPOLATED_POLARIZATION
+    real fieldGradient[6], fieldGradientPolar[6];
+#endif
 #ifdef USE_GK
    real3 fieldS, fieldPolarS, inducedDipoleS, inducedDipolePolarS;
    real bornRadius;
+    #ifdef EXTRAPOLATED_POLARIZATION
+        real fieldGradientS[6], fieldGradientPolarS[6];
+    #endif
 #endif
    float thole, damp;
 } AtomData;
@@ -47,17 +53,79 @@ inline __device__ void zeroAtomData(AtomData& data) {
    data.fieldS = make_real3(0);
    data.fieldPolarS = make_real3(0);
 #endif
+#ifdef EXTRAPOLATED_POLARIZATION
+    for (int i = 0; i < 6; i++) {
+        data.fieldGradient[i] = 0;
+        data.fieldGradientPolar[i] = 0;
+#ifdef USE_GK
+        data.fieldGradientS[i] = 0;
+        data.fieldGradientPolarS[i] = 0;
+#endif
+    }
+#endif
+}
+#ifdef EXTRAPOLATED_POLARIZATION
+    #ifdef USE_GK
+        #define SAVE_ATOM_DATA(index, data) saveAtomData(index, data, field, fieldPolar, fieldGradient, fieldGradientPolar, fieldS, fieldPolarS, fieldGradientS, fieldGradientPolarS);
+    #else
+        #define SAVE_ATOM_DATA(index, data) saveAtomData(index, data, field, fieldPolar, fieldGradient, fieldGradientPolar);
+    #endif
+#else
+    #ifdef USE_GK
+        #define SAVE_ATOM_DATA(index, data) saveAtomData(index, data, field, fieldPolar, fieldS, fieldPolarS);
+    #else
+        #define SAVE_ATOM_DATA(index, data) saveAtomData(index, data, field, fieldPolar);
+    #endif
+#endif
+inline __device__ void saveAtomData(int index, AtomData& data, unsigned long long* __restrict__ field, unsigned long long* __restrict__ fieldPolar
+#ifdef EXTRAPOLATED_POLARIZATION
+        , unsigned long long* __restrict__ fieldGradient, unsigned long long* __restrict__ fieldGradientPolar
+#endif
+#ifdef USE_GK
+        , unsigned long long* __restrict__ fieldS, unsigned long long* __restrict__ fieldPolarS
+    #ifdef EXTRAPOLATED_POLARIZATION
+        , unsigned long long* __restrict__ fieldGradientS, unsigned long long* __restrict__ fieldGradientPolarS
+    #endif
+#endif
+        ) {
+    atomicAdd(&field[index], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
+    atomicAdd(&field[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
+    atomicAdd(&field[index+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
+    atomicAdd(&fieldPolar[index], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0x100000000)));
+    atomicAdd(&fieldPolar[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0x100000000)));
+    atomicAdd(&fieldPolar[index+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0x100000000)));
+#ifdef USE_GK
+    atomicAdd(&fieldS[index], static_cast<unsigned long long>((long long) (data.fieldS.x*0x100000000)));
+    atomicAdd(&fieldS[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.y*0x100000000)));
+    atomicAdd(&fieldS[index+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.z*0x100000000)));
+    atomicAdd(&fieldPolarS[index], static_cast<unsigned long long>((long long) (data.fieldPolarS.x*0x100000000)));
+    atomicAdd(&fieldPolarS[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.y*0x100000000)));
+    atomicAdd(&fieldPolarS[index+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.z*0x100000000)));
+#endif
+#ifdef EXTRAPOLATED_POLARIZATION
+    for (int i = 0; i < 6; i++) {
+        atomicAdd(&fieldGradient[6*index+i], static_cast<unsigned long long>((long long) (data.fieldGradient[i]*0x100000000)));
+        atomicAdd(&fieldGradientPolar[6*index+i], static_cast<unsigned long long>((long long) (data.fieldGradientPolar[i]*0x100000000)));
+#ifdef USE_GK
+        atomicAdd(&fieldGradientS[6*index+i], static_cast<unsigned long long>((long long) (data.fieldGradientS[i]*0x100000000)));
+        atomicAdd(&fieldGradientPolarS[6*index+i], static_cast<unsigned long long>((long long) (data.fieldGradientPolarS[i]*0x100000000)));
+#endif
+    }
+#endif
 }
 #ifdef USE_EWALD
 __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 deltaR, bool isSelfInteraction) {
    if (isSelfInteraction)
        return;
-    real scale1, scale2;
+    real scale1, scale2, scale3;
    real r2 = dot(deltaR, deltaR);
    if (r2 < CUTOFF_SQUARED) {
        real rI = RSQRT(r2);
        real r = RECIP(rI);
+        real rI2 = rI*rI;
        // calculate the error function damping terms
@@ -77,36 +145,40 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
        real alsq2 = 2*EWALD_ALPHA*EWALD_ALPHA;
        real alsq2n = RECIP(SQRT_PI*EWALD_ALPHA);
        alsq2n *= alsq2;
-        real bn1 = (bn0+alsq2n*exp2a)*rI*rI;
+        real bn1 = (bn0+alsq2n*exp2a)*rI2;
        alsq2n *= alsq2;
-        real bn2 = (3*bn1+alsq2n*exp2a)*rI*rI;
+        real bn2 = (3*bn1+alsq2n*exp2a)*rI2;
+        alsq2n *= alsq2;
+        real bn3 = (5*bn2+alsq2n*exp2a)*rI2;
        // compute the error function scaled and unscaled terms
-        real scale3 = 1;
-        real scale5 = 1;
        real damp = atom1.damp*atom2.damp;
        real ratio = (r/damp);
        ratio = ratio*ratio*ratio;
        float pgamma = atom1.thole < atom2.thole ? atom1.thole : atom2.thole;
        damp = damp == 0 ? 0 : -pgamma*ratio;
        real expdamp = EXP(damp);
-        scale3 = 1 - expdamp;
+        real dsc3 = 1 - expdamp;
-        scale5 = 1 - expdamp*(1-damp);
+        real dsc5 = 1 - expdamp*(1-damp);
-        real dsc3 = scale3;
+        real dsc7 = 1 - (1-damp+(0.6f*damp*damp))*expdamp;
-        real dsc5 = scale5;
        real r3 = (r*r2);
        real r5 = (r3*r2);
+        real r7 = (r5*r2);
        real rr3 = (1-dsc3)/r3;
        real rr5 = 3*(1-dsc5)/r5;
+        real rr7 = 15*(1-dsc7)/r7;
        scale1 = rr3 - bn1;
        scale2 = bn2 - rr5;
+        scale3 = bn3 - rr7;
    }
    else {
        scale1 = 0;
        scale2 = 0;
+        scale3 = 0;
    }
    real dDotDelta = scale2*dot(deltaR, atom2.inducedDipole);
    atom1.field += scale1*atom2.inducedDipole + dDotDelta*deltaR;
@@ -116,6 +188,45 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
    atom2.field += scale1*atom1.inducedDipole + dDotDelta*deltaR;
    dDotDelta = scale2*dot(deltaR, atom1.inducedDipolePolar);
    atom2.fieldPolar += scale1*atom1.inducedDipolePolar + dDotDelta*deltaR;
+#ifdef EXTRAPOLATED_POLARIZATION
+    // Compute and store the field gradients for later use.
+    real3 dipole = atom1.inducedDipole;
+    real muDotR = dipole.x*deltaR.x + dipole.y*deltaR.y + dipole.z*deltaR.z;
+    atom2.fieldGradient[0] -= muDotR*deltaR.x*deltaR.x*scale3 - (2*dipole.x*deltaR.x + muDotR)*scale2;
+    atom2.fieldGradient[1] -= muDotR*deltaR.y*deltaR.y*scale3 - (2*dipole.y*deltaR.y + muDotR)*scale2;
+    atom2.fieldGradient[2] -= muDotR*deltaR.z*deltaR.z*scale3 - (2*dipole.z*deltaR.z + muDotR)*scale2;
+    atom2.fieldGradient[3] -= muDotR*deltaR.x*deltaR.y*scale3 - (dipole.x*deltaR.y + dipole.y*deltaR.x)*scale2;
+    atom2.fieldGradient[4] -= muDotR*deltaR.x*deltaR.z*scale3 - (dipole.x*deltaR.z + dipole.z*deltaR.x)*scale2;
+    atom2.fieldGradient[5] -= muDotR*deltaR.y*deltaR.z*scale3 - (dipole.y*deltaR.z + dipole.z*deltaR.y)*scale2;
+    dipole = atom1.inducedDipolePolar;
+    muDotR = dipole.x*deltaR.x + dipole.y*deltaR.y + dipole.z*deltaR.z;
+    atom2.fieldGradientPolar[0] -= muDotR*deltaR.x*deltaR.x*scale3 - (2*dipole.x*deltaR.x + muDotR)*scale2;
+    atom2.fieldGradientPolar[1] -= muDotR*deltaR.y*deltaR.y*scale3 - (2*dipole.y*deltaR.y + muDotR)*scale2;
+    atom2.fieldGradientPolar[2] -= muDotR*deltaR.z*deltaR.z*scale3 - (2*dipole.z*deltaR.z + muDotR)*scale2;
+    atom2.fieldGradientPolar[3] -= muDotR*deltaR.x*deltaR.y*scale3 - (dipole.x*deltaR.y + dipole.y*deltaR.x)*scale2;
+    atom2.fieldGradientPolar[4] -= muDotR*deltaR.x*deltaR.z*scale3 - (dipole.x*deltaR.z + dipole.z*deltaR.x)*scale2;
+    atom2.fieldGradientPolar[5] -= muDotR*deltaR.y*deltaR.z*scale3 - (dipole.y*deltaR.z + dipole.z*deltaR.y)*scale2;
+    dipole = atom2.inducedDipole;
+    muDotR = dipole.x*deltaR.x + dipole.y*deltaR.y + dipole.z*deltaR.z;
+    atom1.fieldGradient[0] += muDotR*deltaR.x*deltaR.x*scale3 - (2*dipole.x*deltaR.x + muDotR)*scale2;
+    atom1.fieldGradient[1] += muDotR*deltaR.y*deltaR.y*scale3 - (2*dipole.y*deltaR.y + muDotR)*scale2;
+    atom1.fieldGradient[2] += muDotR*deltaR.z*deltaR.z*scale3 - (2*dipole.z*deltaR.z + muDotR)*scale2;
+    atom1.fieldGradient[3] += muDotR*deltaR.x*deltaR.y*scale3 - (dipole.x*deltaR.y + dipole.y*deltaR.x)*scale2;
+    atom1.fieldGradient[4] += muDotR*deltaR.x*deltaR.z*scale3 - (dipole.x*deltaR.z + dipole.z*deltaR.x)*scale2;
+    atom1.fieldGradient[5] += muDotR*deltaR.y*deltaR.z*scale3 - (dipole.y*deltaR.z + dipole.z*deltaR.y)*scale2;
+    dipole = atom2.inducedDipolePolar;
+    muDotR = dipole.x*deltaR.x + dipole.y*deltaR.y + dipole.z*deltaR.z;
+    atom1.fieldGradientPolar[0] += muDotR*deltaR.x*deltaR.x*scale3 - (2*dipole.x*deltaR.x + muDotR)*scale2;
+    atom1.fieldGradientPolar[1] += muDotR*deltaR.y*deltaR.y*scale3 - (2*dipole.y*deltaR.y + muDotR)*scale2;
+    atom1.fieldGradientPolar[2] += muDotR*deltaR.z*deltaR.z*scale3 - (2*dipole.z*deltaR.z + muDotR)*scale2;
+    atom1.fieldGradientPolar[3] += muDotR*deltaR.x*deltaR.y*scale3 - (dipole.x*deltaR.y + dipole.y*deltaR.x)*scale2;
+    atom1.fieldGradientPolar[4] += muDotR*deltaR.x*deltaR.z*scale3 - (dipole.x*deltaR.z + dipole.z*deltaR.x)*scale2;
+    atom1.fieldGradientPolar[5] += muDotR*deltaR.y*deltaR.z*scale3 - (dipole.y*deltaR.z + dipole.z*deltaR.y)*scale2;
+#endif
 }
 #elif defined USE_GK
 __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 deltaR, bool isSelfInteraction) {
@@ -182,6 +293,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
    real r2I = rI*rI;
    real rr3 = -rI*r2I;
    real rr5 = -3*rr3*r2I;
+    real rr7 = 5*rr5*r2I;
    real dampProd = atom1.damp*atom2.damp;
    real ratio = (dampProd != 0 ? r/dampProd : 1);
    float pGamma = (atom2.thole > atom1.thole ? atom1.thole: atom2.thole);
@@ -189,6 +301,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
    real dampExp = (dampProd != 0 ? EXP(-damp) : 0); 
    rr3 *= 1 - dampExp;
    rr5 *= 1 - (1+damp)*dampExp;
+    rr7 *= 1 - (1+damp+(0.6f*damp*damp))*dampExp;
    real dDotDelta = rr5*dot(deltaR, atom2.inducedDipole);
    atom1.field += rr3*atom2.inducedDipole + dDotDelta*deltaR;
    dDotDelta = rr5*dot(deltaR, atom2.inducedDipolePolar);
@@ -197,6 +310,45 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
    atom2.field += rr3*atom1.inducedDipole + dDotDelta*deltaR;
    dDotDelta = rr5*dot(deltaR, atom1.inducedDipolePolar);
    atom2.fieldPolar += rr3*atom1.inducedDipolePolar + dDotDelta*deltaR;
+#ifdef EXTRAPOLATED_POLARIZATION
+    // Compute and store the field gradients for later use.
+    real3 dipole = atom1.inducedDipole;
+    real muDotR = dipole.x*deltaR.x + dipole.y*deltaR.y + dipole.z*deltaR.z;
+    atom2.fieldGradient[0] -= muDotR*deltaR.x*deltaR.x*rr7 - (2*dipole.x*deltaR.x + muDotR)*rr5;
+    atom2.fieldGradient[1] -= muDotR*deltaR.y*deltaR.y*rr7 - (2*dipole.y*deltaR.y + muDotR)*rr5;
+    atom2.fieldGradient[2] -= muDotR*deltaR.z*deltaR.z*rr7 - (2*dipole.z*deltaR.z + muDotR)*rr5;
+    atom2.fieldGradient[3] -= muDotR*deltaR.x*deltaR.y*rr7 - (dipole.x*deltaR.y + dipole.y*deltaR.x)*rr5;
+    atom2.fieldGradient[4] -= muDotR*deltaR.x*deltaR.z*rr7 - (dipole.x*deltaR.z + dipole.z*deltaR.x)*rr5;
+    atom2.fieldGradient[5] -= muDotR*deltaR.y*deltaR.z*rr7 - (dipole.y*deltaR.z + dipole.z*deltaR.y)*rr5;
+    dipole = atom1.inducedDipolePolar;
+    muDotR = dipole.x*deltaR.x + dipole.y*deltaR.y + dipole.z*deltaR.z;
+    atom2.fieldGradientPolar[0] -= muDotR*deltaR.x*deltaR.x*rr7 - (2*dipole.x*deltaR.x + muDotR)*rr5;
+    atom2.fieldGradientPolar[1] -= muDotR*deltaR.y*deltaR.y*rr7 - (2*dipole.y*deltaR.y + muDotR)*rr5;
+    atom2.fieldGradientPolar[2] -= muDotR*deltaR.z*deltaR.z*rr7 - (2*dipole.z*deltaR.z + muDotR)*rr5;
+    atom2.fieldGradientPolar[3] -= muDotR*deltaR.x*deltaR.y*rr7 - (dipole.x*deltaR.y + dipole.y*deltaR.x)*rr5;
+    atom2.fieldGradientPolar[4] -= muDotR*deltaR.x*deltaR.z*rr7 - (dipole.x*deltaR.z + dipole.z*deltaR.x)*rr5;
+    atom2.fieldGradientPolar[5] -= muDotR*deltaR.y*deltaR.z*rr7 - (dipole.y*deltaR.z + dipole.z*deltaR.y)*rr5;
+    dipole = atom2.inducedDipole;
+    muDotR = dipole.x*deltaR.x + dipole.y*deltaR.y + dipole.z*deltaR.z;
+    atom1.fieldGradient[0] += muDotR*deltaR.x*deltaR.x*rr7 - (2*dipole.x*deltaR.x + muDotR)*rr5;
+    atom1.fieldGradient[1] += muDotR*deltaR.y*deltaR.y*rr7 - (2*dipole.y*deltaR.y + muDotR)*rr5;
+    atom1.fieldGradient[2] += muDotR*deltaR.z*deltaR.z*rr7 - (2*dipole.z*deltaR.z + muDotR)*rr5;
+    atom1.fieldGradient[3] += muDotR*deltaR.x*deltaR.y*rr7 - (dipole.x*deltaR.y + dipole.y*deltaR.x)*rr5;
+    atom1.fieldGradient[4] += muDotR*deltaR.x*deltaR.z*rr7 - (dipole.x*deltaR.z + dipole.z*deltaR.x)*rr5;
+    atom1.fieldGradient[5] += muDotR*deltaR.y*deltaR.z*rr7 - (dipole.y*deltaR.z + dipole.z*deltaR.y)*rr5;
+    dipole = atom2.inducedDipolePolar;
+    muDotR = dipole.x*deltaR.x + dipole.y*deltaR.y + dipole.z*deltaR.z;
+    atom1.fieldGradientPolar[0] += muDotR*deltaR.x*deltaR.x*rr7 - (2*dipole.x*deltaR.x + muDotR)*rr5;
+    atom1.fieldGradientPolar[1] += muDotR*deltaR.y*deltaR.y*rr7 - (2*dipole.y*deltaR.y + muDotR)*rr5;
+    atom1.fieldGradientPolar[2] += muDotR*deltaR.z*deltaR.z*rr7 - (2*dipole.z*deltaR.z + muDotR)*rr5;
+    atom1.fieldGradientPolar[3] += muDotR*deltaR.x*deltaR.y*rr7 - (dipole.x*deltaR.y + dipole.y*deltaR.x)*rr5;
+    atom1.fieldGradientPolar[4] += muDotR*deltaR.x*deltaR.z*rr7 - (dipole.x*deltaR.z + dipole.z*deltaR.x)*rr5;
+    atom1.fieldGradientPolar[5] += muDotR*deltaR.y*deltaR.z*rr7 - (dipole.y*deltaR.z + dipole.z*deltaR.y)*rr5;
+#endif
 }
 #endif
@@ -206,12 +358,18 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
 extern "C" __global__ void computeInducedField(
        unsigned long long* __restrict__ field, unsigned long long* __restrict__ fieldPolar, const real4* __restrict__ posq, const ushort2* __restrict__ exclusionTiles, 
        const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, unsigned int startTileIndex, unsigned int numTileIndices,
+#ifdef EXTRAPOLATED_POLARIZATION
+        unsigned long long* __restrict__ fieldGradient, unsigned long long* __restrict__ fieldGradientPolar,
+#endif
 #ifdef USE_CUTOFF
        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
 #elif defined USE_GK
        unsigned long long* __restrict__ fieldS, unsigned long long* __restrict__ fieldPolarS, const real* __restrict__ inducedDipoleS,
        const real* __restrict__ inducedDipolePolarS, const real* __restrict__ bornRadii,
+    #ifdef EXTRAPOLATED_POLARIZATION
+        unsigned long long* __restrict__ fieldGradientS, unsigned long long* __restrict__ fieldGradientPolarS,
+    #endif
 #endif
        const float2* __restrict__ dampingAndThole) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
@@ -284,36 +442,10 @@ extern "C" __global__ void computeInducedField(
        // Write results.
        unsigned int offset = x*TILE_SIZE + tgx;
-        atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
+        SAVE_ATOM_DATA(offset, data)
-        atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
-        atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
-        atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0x100000000)));
-        atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0x100000000)));
-        atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0x100000000)));
-#ifdef USE_GK
-        atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (data.fieldS.x*0x100000000)));
-        atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.y*0x100000000)));
-        atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.z*0x100000000)));
-        atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (data.fieldPolarS.x*0x100000000)));
-        atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.y*0x100000000)));
-        atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.z*0x100000000)));
-#endif
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
-            atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
+            SAVE_ATOM_DATA(offset, localData[threadIdx.x])
-            atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
-            atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
-            atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0x100000000)));
-            atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0x100000000)));
-            atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0x100000000)));
-#ifdef USE_GK
-            atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.x*0x100000000)));
-            atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.y*0x100000000)));
-            atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.z*0x100000000)));
-            atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.x*0x100000000)));
-            atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.y*0x100000000)));
-            atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.z*0x100000000)));
-#endif
        }
    }
@@ -412,39 +544,13 @@ extern "C" __global__ void computeInducedField(
            // Write results.
            unsigned int offset = x*TILE_SIZE + tgx;
-            atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
+            SAVE_ATOM_DATA(offset, data)
-            atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
-            atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
-            atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0x100000000)));
-            atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0x100000000)));
-            atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0x100000000)));
-#ifdef USE_GK
-            atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (data.fieldS.x*0x100000000)));
-            atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.y*0x100000000)));
-            atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.z*0x100000000)));
-            atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (data.fieldPolarS.x*0x100000000)));
-            atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.y*0x100000000)));
-            atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.z*0x100000000)));
-#endif
 #ifdef USE_CUTOFF
            offset = atomIndices[threadIdx.x];
 #else
            offset = y*TILE_SIZE + tgx;
 #endif
-            atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
+            SAVE_ATOM_DATA(offset, localData[threadIdx.x])
-            atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
-            atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
-            atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0x100000000)));
-            atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0x100000000)));
-            atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0x100000000)));
-#ifdef USE_GK
-            atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.x*0x100000000)));
-            atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.y*0x100000000)));
-            atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.z*0x100000000)));
-            atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.x*0x100000000)));
-            atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.y*0x100000000)));
-            atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.z*0x100000000)));
-#endif
        }
        pos++;
    }
@@ -533,8 +639,8 @@ extern "C" __global__ void recordInducedDipolesForDIIS(const long long* __restri
            real oldDipole = inducedDipole[dipoleIndex];
            real oldDipolePolar = inducedDipolePolar[dipoleIndex];
            long long fixedS = (fixedFieldS == NULL ? (long long) 0 : fixedFieldS[fieldIndex]);
-            real newDipole = scale*((fixedField[fieldIndex]+fixedS+inducedField[fieldIndex])*fieldScale+ewaldScale*oldDipole);
+            real newDipole = scale*((fixedField[fieldIndex]+fixedS+inducedField[fieldIndex])*fieldScale);
-            real newDipolePolar = scale*((fixedFieldPolar[fieldIndex]+fixedS+inducedFieldPolar[fieldIndex])*fieldScale+ewaldScale*oldDipolePolar);
+            real newDipolePolar = scale*((fixedFieldPolar[fieldIndex]+fixedS+inducedFieldPolar[fieldIndex])*fieldScale);
            int storePrevIndex = dipoleIndex+min(iteration, MAX_PREV_DIIS_DIPOLES-1)*NUM_ATOMS*3;
            prevDipoles[storePrevIndex] = newDipole;
            prevDipolesPolar[storePrevIndex] = newDipolePolar;
@@ -607,3 +713,146 @@ extern "C" __global__ void updateInducedFieldByDIIS(real* __restrict__ inducedDi
        inducedDipolePolar[index] = sumPolar;
    }
 }
+extern "C" __global__ void initExtrapolatedDipoles(real* __restrict__ inducedDipole, real* __restrict__ inducedDipolePolar, real* __restrict__ extrapolatedDipole,
+        real* __restrict__ extrapolatedDipolePolar, long long* __restrict__ inducedDipoleFieldGradient, long long* __restrict__ inducedDipoleFieldGradientPolar
+#ifdef USE_GK
+        , real* __restrict__ inducedDipoleGk, real* __restrict__ inducedDipoleGkPolar, real* __restrict__ extrapolatedDipoleGk, real* __restrict__ extrapolatedDipoleGkPolar,
+        real* __restrict__ inducedDipoleFieldGradientGk, real* __restrict__ inducedDipoleFieldGradientGkPolar
+#endif
+        ) {
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < 3*NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        extrapolatedDipole[index] = inducedDipole[index];
+        extrapolatedDipolePolar[index] = inducedDipolePolar[index];
+#ifdef USE_GK
+        extrapolatedDipoleGk[index] = inducedDipoleGk[index];
+        extrapolatedDipoleGkPolar[index] = inducedDipoleGkPolar[index];
+#endif
+    }
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < 6*NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        inducedDipoleFieldGradient[index] = 0;
+        inducedDipoleFieldGradientPolar[index] = 0;
+#ifdef USE_GK
+        inducedDipoleFieldGradientGk[index] = 0;
+        inducedDipoleFieldGradientGkPolar[index] = 0;
+#endif
+    }
+}
+extern "C" __global__ void iterateExtrapolatedDipoles(int order, real* __restrict__ inducedDipole, real* __restrict__ inducedDipolePolar, real* __restrict__ extrapolatedDipole,
+        real* __restrict__ extrapolatedDipolePolar, long long* __restrict__ inducedDipoleFieldGradient, long long* __restrict__ inducedDipoleFieldGradientPolar,
+        long long* __restrict__ inducedDipoleField, long long* __restrict__ inducedDipoleFieldPolar, real* __restrict__ extrapolatedDipoleFieldGradient, real* __restrict__ extrapolatedDipoleFieldGradientPolar,
+#ifdef USE_GK
+        real* __restrict__ inducedDipoleGk, real* __restrict__ inducedDipoleGkPolar, real* __restrict__ extrapolatedDipoleGk, real* __restrict__ extrapolatedDipoleGkPolar,
+        real* __restrict__ inducedDipoleFieldGradientGk, real* __restrict__ inducedDipoleFieldGradientGkPolar, long long* __restrict__ inducedDipoleFieldGk,
+        long long* __restrict__ inducedDipoleFieldGkPolar, real* __restrict__ extrapolatedDipoleFieldGradientGk, real* __restrict__ extrapolatedDipoleFieldGradientGkPolar,
+#endif
+        const float* __restrict__ polarizability) {
+    const real fieldScale = 1/(real) 0x100000000;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < 3*NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        int atom = index/3;
+        int component = index-3*atom;
+        int fieldIndex = atom+component*PADDED_NUM_ATOMS;
+        float polar = polarizability[atom];
+        real value = inducedDipoleField[fieldIndex]*fieldScale*polar;
+        inducedDipole[index] = value;
+        extrapolatedDipole[order*3*NUM_ATOMS+index] = value;
+        value = inducedDipoleFieldPolar[fieldIndex]*fieldScale*polar;
+        inducedDipolePolar[index] = value;
+        extrapolatedDipolePolar[order*3*NUM_ATOMS+index] = value;
+#ifdef USE_GK
+        value = inducedDipoleFieldGk[fieldIndex]*fieldScale*polar;
+        inducedDipoleGk[index] = value;
+        extrapolatedDipoleGk[order*3*NUM_ATOMS+index] = value;
+        value = inducedDipoleFieldGkPolar[fieldIndex]*fieldScale*polar;
+        inducedDipoleGkPolar[index] = value;
+        extrapolatedDipoleGkPolar[order*3*NUM_ATOMS+index] = value;
+#endif
+    }
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < 6*NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        int index2 = (order-1)*6*NUM_ATOMS+index;
+        extrapolatedDipoleFieldGradient[index2] = fieldScale*inducedDipoleFieldGradient[index];
+        extrapolatedDipoleFieldGradientPolar[index2] = fieldScale*inducedDipoleFieldGradientPolar[index];
+#ifdef USE_GK
+        extrapolatedDipoleFieldGradientGk[index2] = fieldScale*inducedDipoleFieldGradientGk[index];
+        extrapolatedDipoleFieldGradientGkPolar[index2] = fieldScale*inducedDipoleFieldGradientGkPolar[index];
+#endif
+    }
+}
+extern "C" __global__ void computeExtrapolatedDipoles(real* __restrict__ inducedDipole, real* __restrict__ inducedDipolePolar, real* __restrict__ extrapolatedDipole,
+        real* __restrict__ extrapolatedDipolePolar
+#ifdef USE_GK
+        , real* __restrict__ inducedDipoleGk, real* __restrict__ inducedDipoleGkPolar, real* __restrict__ extrapolatedDipoleGk, real* __restrict__ extrapolatedDipoleGkPolar
+#endif
+        ) {
+    real coeff[] = {EXTRAPOLATION_COEFFICIENTS_SUM};
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < 3*NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        real sum = 0, sumPolar = 0, sumGk = 0, sumGkPolar = 0;
+        for (int order = 0; order < MAX_EXTRAPOLATION_ORDER; order++) {
+            sum += extrapolatedDipole[order*3*NUM_ATOMS+index]*coeff[order];
+            sumPolar += extrapolatedDipolePolar[order*3*NUM_ATOMS+index]*coeff[order];
+#ifdef USE_GK
+            sumGk += extrapolatedDipoleGk[order*3*NUM_ATOMS+index]*coeff[order];
+            sumGkPolar += extrapolatedDipoleGkPolar[order*3*NUM_ATOMS+index]*coeff[order];
+#endif
+        }
+        inducedDipole[index] = sum;
+        inducedDipolePolar[index] = sumPolar;
+#ifdef USE_GK
+        inducedDipoleGk[index] = sumGk;
+        inducedDipoleGkPolar[index] = sumGkPolar;
+#endif
+    }
+}
+extern "C" __global__ void addExtrapolatedFieldGradientToForce(long long* __restrict__ forceBuffers, real* __restrict__ extrapolatedDipole,
+        real* __restrict__ extrapolatedDipolePolar, real* __restrict__ extrapolatedDipoleFieldGradient, real* __restrict__ extrapolatedDipoleFieldGradientPolar
+#ifdef USE_GK
+        , real* __restrict__ extrapolatedDipoleGk, real* __restrict__ extrapolatedDipoleGkPolar,
+        real* __restrict__ extrapolatedDipoleFieldGradientGk, real* __restrict__ extrapolatedDipoleFieldGradientGkPolar
+#endif
+        ) {
+    real coeff[] = {EXTRAPOLATION_COEFFICIENTS_SUM};
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
+        real fx = 0, fy = 0, fz = 0;
+        for (int l = 0; l < MAX_EXTRAPOLATION_ORDER-1; l++) {
+            int index1 = 3*(l*NUM_ATOMS+atom);
+            real dipole[] = {extrapolatedDipole[index1], extrapolatedDipole[index1+1], extrapolatedDipole[index1+2]};
+            real dipolePolar[] = {extrapolatedDipolePolar[index1], extrapolatedDipolePolar[index1+1], extrapolatedDipolePolar[index1+2]};
+#ifdef USE_GK
+            real dipoleGk[] = {extrapolatedDipoleGk[index1], extrapolatedDipoleGk[index1+1], extrapolatedDipoleGk[index1+2]};
+            real dipoleGkPolar[] = {extrapolatedDipoleGkPolar[index1], extrapolatedDipoleGkPolar[index1+1], extrapolatedDipoleGkPolar[index1+2]};
+#endif
+            for (int m = 0; m < MAX_EXTRAPOLATION_ORDER-1-l; m++) {
+                int index2 = 6*(m*NUM_ATOMS+atom);
+                real scale = 0.5f*coeff[l+m+1]*ENERGY_SCALE_FACTOR;
+                real gradient[] = {extrapolatedDipoleFieldGradient[index2], extrapolatedDipoleFieldGradient[index2+1], extrapolatedDipoleFieldGradient[index2+2],
+                                   extrapolatedDipoleFieldGradient[index2+3], extrapolatedDipoleFieldGradient[index2+4], extrapolatedDipoleFieldGradient[index2+5]};
+                real gradientPolar[] = {extrapolatedDipoleFieldGradientPolar[index2], extrapolatedDipoleFieldGradientPolar[index2+1], extrapolatedDipoleFieldGradientPolar[index2+2],
+                                        extrapolatedDipoleFieldGradientPolar[index2+3], extrapolatedDipoleFieldGradientPolar[index2+4], extrapolatedDipoleFieldGradientPolar[index2+5]};
+                fx += scale*(dipole[0]*gradientPolar[0] + dipole[1]*gradientPolar[3] + dipole[2]*gradientPolar[4]);
+                fy += scale*(dipole[0]*gradientPolar[3] + dipole[1]*gradientPolar[1] + dipole[2]*gradientPolar[5]);
+                fz += scale*(dipole[0]*gradientPolar[4] + dipole[1]*gradientPolar[5] + dipole[2]*gradientPolar[2]);
+                fx += scale*(dipolePolar[0]*gradient[0] + dipolePolar[1]*gradient[3] + dipolePolar[2]*gradient[4]);
+                fy += scale*(dipolePolar[0]*gradient[3] + dipolePolar[1]*gradient[1] + dipolePolar[2]*gradient[5]);
+                fz += scale*(dipolePolar[0]*gradient[4] + dipolePolar[1]*gradient[5] + dipolePolar[2]*gradient[2]);
+#ifdef USE_GK
+                real gradientGk[] = {extrapolatedDipoleFieldGradient[index2], extrapolatedDipoleFieldGradient[index2+1], extrapolatedDipoleFieldGradient[index2+2],
+                                   extrapolatedDipoleFieldGradient[index2+3], extrapolatedDipoleFieldGradient[index2+4], extrapolatedDipoleFieldGradient[index2+5]};
+                real gradientGkPolar[] = {extrapolatedDipoleFieldGradientPolar[index2], extrapolatedDipoleFieldGradientPolar[index2+1], extrapolatedDipoleFieldGradientPolar[index2+2],
+                                        extrapolatedDipoleFieldGradientPolar[index2+3], extrapolatedDipoleFieldGradientPolar[index2+4], extrapolatedDipoleFieldGradientPolar[index2+5]};
+                fx += scale*(dipoleGk[0]*gradientGkPolar[0] + dipoleGk[1]*gradientGkPolar[3] + dipoleGk[2]*gradientGkPolar[4]);
+                fy += scale*(dipoleGk[0]*gradientGkPolar[3] + dipoleGk[1]*gradientGkPolar[1] + dipoleGk[2]*gradientGkPolar[5]);
+                fz += scale*(dipoleGk[0]*gradientGkPolar[4] + dipoleGk[1]*gradientGkPolar[5] + dipoleGk[2]*gradientGkPolar[2]);
+                fx += scale*(dipoleGkPolar[0]*gradientGk[0] + dipoleGkPolar[1]*gradientGk[3] + dipoleGkPolar[2]*gradientGk[4]);
+                fy += scale*(dipoleGkPolar[0]*gradientGk[3] + dipoleGkPolar[1]*gradientGk[1] + dipoleGkPolar[2]*gradientGk[5]);
+                fz += scale*(dipoleGkPolar[0]*gradientGk[4] + dipoleGkPolar[1]*gradientGk[5] + dipoleGkPolar[2]*gradientGk[2]);
+#endif
+            }
+        }
+        forceBuffers[atom] += (long long) (fx*0x100000000);
+        forceBuffers[atom+PADDED_NUM_ATOMS] += (long long) (fy*0x100000000);
+        forceBuffers[atom+PADDED_NUM_ATOMS*2] += (long long) (fz*0x100000000);
+    }
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
@@ -643,8 +643,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
    // Process the atoms in spatially sorted order.  This improves cache performance when loading
    // the grid values.
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
-        int m = pmeAtomGridIndex[i].x;
+        int m = pmeAtomGridIndex[atom].x;
        real4 pos = posq[m];
        pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
        pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
@@ -1051,7 +1051,7 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
            f.x += (inducedDipole[k]+inducedDipolePolar[k])*phi[i+NUM_ATOMS*j1];
            f.y += (inducedDipole[k]+inducedDipolePolar[k])*phi[i+NUM_ATOMS*j2];
            f.z += (inducedDipole[k]+inducedDipolePolar[k])*phi[i+NUM_ATOMS*j3];
-#ifndef DIRECT_POLARIZATION
+#ifdef MUTUAL_POLARIZATION
            f.x += (inducedDipole[k]*phip[i+NUM_ATOMS*j1] + inducedDipolePolar[k]*phid[i+NUM_ATOMS*j1]);
            f.y += (inducedDipole[k]*phip[i+NUM_ATOMS*j2] + inducedDipolePolar[k]*phid[i+NUM_ATOMS*j2]);
            f.z += (inducedDipole[k]*phip[i+NUM_ATOMS*j3] + inducedDipolePolar[k]*phid[i+NUM_ATOMS*j3]);
@@ -1073,8 +1073,12 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.25f*EPSILON_FACTOR*energy;
 }
-extern "C" __global__ void recordInducedFieldDipoles(const real* __restrict__ phid, real* const __restrict__ phip,
+extern "C" __global__ void recordInducedFieldDipoles(const real* __restrict__ phid, real* const __restrict__ phip, long long* __restrict__ inducedField,
-        long long* __restrict__ inducedField, long long* __restrict__ inducedFieldPolar, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
+        long long* __restrict__ inducedFieldPolar, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar,
+#ifdef EXTRAPOLATED_POLARIZATION
+        unsigned long long* __restrict__ fieldGradient, unsigned long long* __restrict__ fieldGradientPolar,
+#endif
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    __shared__ real fracToCart[3][3];
    if (threadIdx.x == 0) {
        fracToCart[0][0] = GRID_SIZE_X*recipBoxVecX.x;
@@ -1088,12 +1092,62 @@ extern "C" __global__ void recordInducedFieldDipoles(const real* __restrict__ ph
        fracToCart[2][2] = GRID_SIZE_Z*recipBoxVecZ.z;
    }
    __syncthreads();
+    real selfDipoleScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI;
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
-        inducedField[i] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[0][0] + phid[i+NUM_ATOMS*2]*fracToCart[0][1] + phid[i+NUM_ATOMS*3]*fracToCart[0][2]));
+        inducedField[i] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[0][0] + phid[i+NUM_ATOMS*2]*fracToCart[0][1] + phid[i+NUM_ATOMS*3]*fracToCart[0][2] - selfDipoleScale*inducedDipole[3*i]));
-        inducedField[i+PADDED_NUM_ATOMS] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[1][0] + phid[i+NUM_ATOMS*2]*fracToCart[1][1] + phid[i+NUM_ATOMS*3]*fracToCart[1][2]));
+        inducedField[i+PADDED_NUM_ATOMS] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[1][0] + phid[i+NUM_ATOMS*2]*fracToCart[1][1] + phid[i+NUM_ATOMS*3]*fracToCart[1][2] - selfDipoleScale*inducedDipole[3*i+1]));
-        inducedField[i+PADDED_NUM_ATOMS*2] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[2][0] + phid[i+NUM_ATOMS*2]*fracToCart[2][1] + phid[i+NUM_ATOMS*3]*fracToCart[2][2]));
+        inducedField[i+PADDED_NUM_ATOMS*2] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[2][0] + phid[i+NUM_ATOMS*2]*fracToCart[2][1] + phid[i+NUM_ATOMS*3]*fracToCart[2][2] - selfDipoleScale*inducedDipole[3*i+2]));
-        inducedFieldPolar[i] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[0][0] + phip[i+NUM_ATOMS*2]*fracToCart[0][1] + phip[i+NUM_ATOMS*3]*fracToCart[0][2]));
+        inducedFieldPolar[i] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[0][0] + phip[i+NUM_ATOMS*2]*fracToCart[0][1] + phip[i+NUM_ATOMS*3]*fracToCart[0][2] - selfDipoleScale*inducedDipolePolar[3*i]));
-        inducedFieldPolar[i+PADDED_NUM_ATOMS] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[1][0] + phip[i+NUM_ATOMS*2]*fracToCart[1][1] + phip[i+NUM_ATOMS*3]*fracToCart[1][2]));
+        inducedFieldPolar[i+PADDED_NUM_ATOMS] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[1][0] + phip[i+NUM_ATOMS*2]*fracToCart[1][1] + phip[i+NUM_ATOMS*3]*fracToCart[1][2] - selfDipoleScale*inducedDipolePolar[3*i+1]));
-        inducedFieldPolar[i+PADDED_NUM_ATOMS*2] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[2][0] + phip[i+NUM_ATOMS*2]*fracToCart[2][1] + phip[i+NUM_ATOMS*3]*fracToCart[2][2]));
+        inducedFieldPolar[i+PADDED_NUM_ATOMS*2] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[2][0] + phip[i+NUM_ATOMS*2]*fracToCart[2][1] + phip[i+NUM_ATOMS*3]*fracToCart[2][2] - selfDipoleScale*inducedDipolePolar[3*i+2]));
+#ifdef EXTRAPOLATED_POLARIZATION
+        // Compute and store the field gradients for later use.
+        real EmatD[3][3] = {
+            {phid[i+NUM_ATOMS*4], phid[i+NUM_ATOMS*7], phid[i+NUM_ATOMS*8]},
+            {phid[i+NUM_ATOMS*7], phid[i+NUM_ATOMS*5], phid[i+NUM_ATOMS*9]},
+            {phid[i+NUM_ATOMS*8], phid[i+NUM_ATOMS*9], phid[i+NUM_ATOMS*6]}
+        };
+        real Exx = 0, Eyy = 0, Ezz = 0, Exy = 0, Exz = 0, Eyz = 0;
+        for (int k = 0; k < 3; ++k) {
+            for (int l = 0; l < 3; ++l) {
+                Exx += fracToCart[0][k] * EmatD[k][l] * fracToCart[0][l];
+                Eyy += fracToCart[1][k] * EmatD[k][l] * fracToCart[1][l];
+                Ezz += fracToCart[2][k] * EmatD[k][l] * fracToCart[2][l];
+                Exy += fracToCart[0][k] * EmatD[k][l] * fracToCart[1][l];
+                Exz += fracToCart[0][k] * EmatD[k][l] * fracToCart[2][l];
+                Eyz += fracToCart[1][k] * EmatD[k][l] * fracToCart[2][l];
+            }
+        }
+        atomicAdd(&fieldGradient[6*i+0], static_cast<unsigned long long>((long long) (-Exx*0x100000000)));
+        atomicAdd(&fieldGradient[6*i+1], static_cast<unsigned long long>((long long) (-Eyy*0x100000000)));
+        atomicAdd(&fieldGradient[6*i+2], static_cast<unsigned long long>((long long) (-Ezz*0x100000000)));
+        atomicAdd(&fieldGradient[6*i+3], static_cast<unsigned long long>((long long) (-Exy*0x100000000)));
+        atomicAdd(&fieldGradient[6*i+4], static_cast<unsigned long long>((long long) (-Exz*0x100000000)));
+        atomicAdd(&fieldGradient[6*i+5], static_cast<unsigned long long>((long long) (-Eyz*0x100000000)));
+        real EmatP[3][3] = {
+            {phip[i+NUM_ATOMS*4], phip[i+NUM_ATOMS*7], phip[i+NUM_ATOMS*8]},
+            {phip[i+NUM_ATOMS*7], phip[i+NUM_ATOMS*5], phip[i+NUM_ATOMS*9]},
+            {phip[i+NUM_ATOMS*8], phip[i+NUM_ATOMS*9], phip[i+NUM_ATOMS*6]}
+        };
+        Exx = 0; Eyy = 0; Ezz = 0; Exy = 0; Exz = 0; Eyz = 0;
+        for (int k = 0; k < 3; ++k) {
+            for (int l = 0; l < 3; ++l) {
+                Exx += fracToCart[0][k] * EmatP[k][l] * fracToCart[0][l];
+                Eyy += fracToCart[1][k] * EmatP[k][l] * fracToCart[1][l];
+                Ezz += fracToCart[2][k] * EmatP[k][l] * fracToCart[2][l];
+                Exy += fracToCart[0][k] * EmatP[k][l] * fracToCart[1][l];
+                Exz += fracToCart[0][k] * EmatP[k][l] * fracToCart[2][l];
+                Eyz += fracToCart[1][k] * EmatP[k][l] * fracToCart[2][l];
+            }
+        }
+        atomicAdd(&fieldGradientPolar[6*i+0], static_cast<unsigned long long>((long long) (-Exx*0x100000000)));
+        atomicAdd(&fieldGradientPolar[6*i+1], static_cast<unsigned long long>((long long) (-Eyy*0x100000000)));
+        atomicAdd(&fieldGradientPolar[6*i+2], static_cast<unsigned long long>((long long) (-Ezz*0x100000000)));
+        atomicAdd(&fieldGradientPolar[6*i+3], static_cast<unsigned long long>((long long) (-Exy*0x100000000)));
+        atomicAdd(&fieldGradientPolar[6*i+4], static_cast<unsigned long long>((long long) (-Exz*0x100000000)));
+        atomicAdd(&fieldGradientPolar[6*i+5], static_cast<unsigned long long>((long long) (-Eyz*0x100000000)));
+#endif
    }
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
@@ -365,7 +365,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
    real iEIY = qiUinpI.x*Vijp[1] + qiUindI.x*Vijd[1] - qiUinpI.y*Vijp[0] - qiUindI.y*Vijd[0];
    real iEJY = qiUinpJ.x*Vjip[1] + qiUindJ.x*Vjid[1] - qiUinpJ.y*Vjip[0] - qiUindJ.y*Vjid[0];
-#ifdef USE_MUTUAL_POLARIZATION
+#ifdef MUTUAL_POLARIZATION
    // Uind-Uind terms (m=0)
    real eCoef = -fourThirds*rInvVec[3]*(3*(thole_d0 + bVec[3]) + alphaRVec[3]*X);
    real dCoef = rInvVec[4]*(6*(dthole_d0 + bVec[3]) + 4*alphaRVec[5]*X);