Merge remote-tracking branch 'upstream/master' into forcefield

39120086 · John Chodera (MSKCC) · 87b95319 · 2acba7ad · 39120086 · 39120086
Commit 39120086 authored Jan 18, 2016 by John Chodera (MSKCC)
20 changed files
--- a/README.md
+++ b/README.md
@@ -18,8 +18,6 @@ Need Help? Check out the [documentation](https://simtk.org/docman/?group_id=161)
 Badges
 ------
 * Travis CI `linux` integration tests: [![Build Status](https://travis-ci.org/pandegroup/openmm.png?branch=master)](https://travis-ci.org/pandegroup/openmm)
-* Jenkins `openmm-dev` conda `osx` build: [![Jenkins `openmm-dev` conda `osx` build](https://jenkins.choderalab.org/job/conda-openmm-dev-osx-2/badge/icon)](https://jenkins.choderalab.org/job/conda-openmm-dev-osx-2/) [[console log]](https://jenkins.choderalab.org/job/conda-openmm-dev-osx-2/lastBuild/consoleFull)
+* Anaconda Cloud `openmm` conda release: ![Binstar `openmm` conda release](https://binstar.org/omnia/openmm/badges/version.svg)
-* Jenkins `openmm-dev` conda `linux` build: [![Jenkins `openmm-dev` conda `linux` build](https://jenkins.choderalab.org/job/conda-openmm-dev-linux-vagrant-2/badge/icon)](https://jenkins.choderalab.org/job/conda-openmm-dev-linux-vagrant-2/) [[console log]](https://jenkins.choderalab.org/job/conda-openmm-dev-linux-vagrant-2/lastBuild/consoleFull)
+* Anaconda Cloud `openmm-dev` conda package: ![Binstar `openmm-dev` conda package](https://binstar.org/omnia/openmm-dev/badges/version.svg)
-* Binstar `openmm` conda release: ![Binstar `openmm` conda release](https://binstar.org/omnia/openmm/badges/version.svg)
-* Binstar `openmm-dev` conda package: ![Binstar `openmm-dev` conda package](https://binstar.org/omnia/openmm-dev/badges/version.svg)
--- a/docs-source/usersguide/application.rst
+++ b/docs-source/usersguide/application.rst
@@ -922,29 +922,33 @@ If :code:`vdwCutoff` is not specified, then the value of
 Specifying the Polarization Method
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-OpenMM allows the setting of several other parameters particular to the AMOEBA
+When using the AMOEBA force field, OpenMM allows the induced dipoles to be
-force field.  The :code:`mutualInducedTargetEpsilon` option allows you to
+calculated in any of three different ways.  The slowest but potentially most
-specify the accuracy to which the induced dipoles are calculated at each time
+accurate method is to iterate the calculation until the dipoles converge to a
-step; the default value is 0.01.  The :code:`polarization` setting
+specified tolerance.  To select this, specify :code:`polarization='mutual'`.
-determines whether the calculation of the induced dipoles is continued until the
+Use the :code:`mutualInducedTargetEpsilon` option to select the tolerance; for
-dipoles are self-consistent to within the tolerance specified by
+most situations, a value of 0.00001 works well.  Alternatively, you can specify
-:code:`mutualInducedTargetEpsilon` or whether a quick estimate of the induced
+:code:`polarization='extrapolated'`.  This uses an analytic approximation
-dipoles is used instead.  The first option corresponds to the
+:cite:`Simmonett2015` to estimate what the fully converged dipoles will be without
-:code:`polarization='mutual'` setting and is the default; the quick estimate
+actually continuing the calculation to convergence.  In many cases this can be
-option is given by :code:`polarization='direct'` and in this case,
+significantly faster with only a small loss in accuracy.  Finally, you can
-:code:`mutualInducedTargetEpsilon` is ignored, if provided.  Simulations using
+specify :code:`polarization='direct'` to use the direct polarization
-:code:`polarization='direct'` will be significantly faster than those with
+approximation, in which induced dipoles depend only on the fixed multipoles, not
-:code:`polarization='mutual'`\ , but less accurate.  Examples using the two
+on other induced dipoles.  This is even faster, but it produces very different
-options are given below:
+forces from mutual polarization, so it should only be used with force fields
-::
+that have been specifically parameterized for use with this approximation.
-    system = forcefield.createSystem(nonbondedMethod=PME,
+Here are examples of using each method:
-        nonbondedCutoff=1*nanometer,ewaldErrorTolerance=0.00001,
+::
-        vdwCutoff=1.2*nanometer, mutualInducedTargetEpsilon=0.01)
+    system = forcefield.createSystem(nonbondedMethod=PME, nonbondedCutoff=1*nanometer,
-    system = forcefield.createSystem(nonbondedMethod=PME,
+        vdwCutoff=1.2*nanometer, polarization='mutual', mutualInducedTargetEpsilon=0.00001)
-        nonbondedCutoff=1*nanometer,ewaldErrorTolerance=0.00001,
-        vdwCutoff=1.2*nanometer, polarization ='direct')
+    system = forcefield.createSystem(nonbondedMethod=PME, nonbondedCutoff=1*nanometer,
+        vdwCutoff=1.2*nanometer, polarization='extrapolated')
+    system = forcefield.createSystem(nonbondedMethod=PME, nonbondedCutoff=1*nanometer,
+        vdwCutoff=1.2*nanometer, polarization='direct')
 Implicit Solvent and Solute Dielectrics

--- a/docs-source/usersguide/references.bib
+++ b/docs-source/usersguide/references.bib
@@ -389,6 +389,17 @@
   type = {Journal Article}
 }
+@article{Simmonett2015,
+   author = {Simmonett, Andrew C. and Pickard, Frank C. and Shao, Yihan and Cheatham, Thomas E. and Brooks, Bernard R.},
+   title = {Efficient treatment of induced dipoles},
+   journal = {Journal of Chemical Physics},
+   year = {2015},
+   volume = {143},
+   number = {7},
+   pages = {074115},
+   type = {Journal Article}
+}
 @article{Sindhikara2009,
  author =   {Sindhikara, Daniel J. and Kim, Seonah and Voter,
                  Arthur F. and Roitberg, Adrian E.},

--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -36,22 +36,18 @@ def runOneTest(testName, options):
    if amoeba:
        constraints = None
        epsilon = float(options.epsilon)
-        if epsilon == 0:
-            polarization = 'direct'
-        else:
-            polarization = 'mutual'
        if explicit:
            ff = app.ForceField('amoeba2009.xml')
            pdb = app.PDBFile('5dfr_solv-cube_equil.pdb')
            cutoff = 0.7*unit.nanometers
            vdwCutoff = 0.9*unit.nanometers
-            system = ff.createSystem(pdb.topology, nonbondedMethod=app.PME, nonbondedCutoff=cutoff, vdwCutoff=vdwCutoff, constraints=constraints, ewaldErrorTolerance=0.00075, mutualInducedTargetEpsilon=epsilon, polarization=polarization)
+            system = ff.createSystem(pdb.topology, nonbondedMethod=app.PME, nonbondedCutoff=cutoff, vdwCutoff=vdwCutoff, constraints=constraints, ewaldErrorTolerance=0.00075, mutualInducedTargetEpsilon=epsilon, polarization=options.polarization)
        else:
            ff = app.ForceField('amoeba2009.xml', 'amoeba2009_gk.xml')
            pdb = app.PDBFile('5dfr_minimized.pdb')
            cutoff = 2.0*unit.nanometers
            vdwCutoff = 1.2*unit.nanometers
-            system = ff.createSystem(pdb.topology, nonbondedMethod=app.NoCutoff, constraints=constraints, mutualInducedTargetEpsilon=epsilon, polarization=polarization)
+            system = ff.createSystem(pdb.topology, nonbondedMethod=app.NoCutoff, constraints=constraints, mutualInducedTargetEpsilon=epsilon, polarization=options.polarization)
        for f in system.getForces():
            if isinstance(f, mm.AmoebaMultipoleForce) or isinstance(f, mm.AmoebaVdwForce) or isinstance(f, mm.AmoebaGeneralizedKirkwoodForce) or isinstance(f, mm.AmoebaWcaDispersionForce):
                f.setForceGroup(1)
@@ -127,7 +123,8 @@ parser.add_option('--platform', dest='platform', choices=platformNames, help='na
 parser.add_option('--test', dest='test', choices=('gbsa', 'rf', 'pme', 'amoebagk', 'amoebapme'), help='the test to perform: gbsa, rf, pme, amoebagk, or amoebapme [default: all]')
 parser.add_option('--pme-cutoff', default='0.9', dest='cutoff', type='float', help='direct space cutoff for PME in nm [default: 0.9]')
 parser.add_option('--seconds', default='60', dest='seconds', type='float', help='target simulation length in seconds [default: 60]')
-parser.add_option('--mutual-epsilon', default='1e-4', dest='epsilon', type='float', help='mutual induced epsilon for AMOEBA [default: 1e-4]')
+parser.add_option('--polarization', default='mutual', dest='polarization', choices=('direct', 'extrapolated', 'mutual'), help='the polarization method for AMOEBA: direct, extrapolated, or mutual [default: mutual]')
+parser.add_option('--mutual-epsilon', default='1e-5', dest='epsilon', type='float', help='mutual induced epsilon for AMOEBA [default: 1e-5]')
 parser.add_option('--heavy-hydrogens', action='store_true', default=False, dest='heavy', help='repartition mass to allow a larger time step')
 parser.add_option('--device', default=None, dest='device', help='device index for CUDA or OpenCL')
 parser.add_option('--precision', default='single', dest='precision', choices=('single', 'mixed', 'double'), help='precision mode for CUDA or OpenCL: single, mixed, or double [default: single]')

--- a/openmmapi/src/ContextImpl.cpp
+++ b/openmmapi/src/ContextImpl.cpp
@@ -449,4 +449,5 @@ void ContextImpl::loadCheckpoint(istream& stream) {
        parameters[name] = value;
    }
    updateStateDataKernel.getAs<UpdateStateDataKernel>().loadCheckpoint(*this, stream);
+    hasSetPositions = true;
 }
--- a/platforms/cpu/tests/TestCpuCheckpoints.cpp
+++ b/platforms/cpu/tests/TestCpuCheckpoints.cpp
@@ -92,6 +92,15 @@ void testCheckpoint() {
    integrator.step(10);
    State s4 = context.getState(State::Positions | State::Velocities | State::Parameters);
    compareStates(s2, s4);
+    // See if a checkpoint created from one Context can be loaded into a different one.
+    VerletIntegrator integrator2(0.001);
+    Context context2(system, integrator2, platform);
+    stream1.seekg(0, stream1.beg);
+    context2.loadCheckpoint(stream1);
+    State s5 = context2.getState(State::Positions | State::Velocities | State::Parameters | State::Energy);
+    compareStates(s1, s5);
 }
 void runPlatformTests() {

--- a/platforms/cuda/tests/TestCudaCheckpoints.cpp
+++ b/platforms/cuda/tests/TestCudaCheckpoints.cpp
@@ -120,6 +120,15 @@ void testCheckpoint() {
    integrator2.step(10);
    State s8 = context2.getState(State::Positions | State::Velocities | State::Parameters);
    compareStates(s6, s8);
+    // See if a checkpoint created from one Context can be loaded into a different one.
+    VerletIntegrator integrator3(0.001);
+    Context context3(system, integrator3, platform);
+    stream1.seekg(0, stream1.beg);
+    context3.loadCheckpoint(stream1);
+    State s9 = context3.getState(State::Positions | State::Velocities | State::Parameters | State::Energy);
+    compareStates(s1, s9);
 }
 void runPlatformTests() {

--- a/platforms/opencl/tests/TestOpenCLCheckpoints.cpp
+++ b/platforms/opencl/tests/TestOpenCLCheckpoints.cpp
@@ -120,6 +120,15 @@ void testCheckpoint() {
    integrator2.step(10);
    State s8 = context2.getState(State::Positions | State::Velocities | State::Parameters);
    compareStates(s6, s8);
+    // See if a checkpoint created from one Context can be loaded into a different one.
+    VerletIntegrator integrator3(0.001);
+    Context context3(system, integrator3, platform);
+    stream1.seekg(0, stream1.beg);
+    context3.loadCheckpoint(stream1);
+    State s9 = context3.getState(State::Positions | State::Velocities | State::Parameters | State::Energy);
+    compareStates(s1, s9);
 }
 void runPlatformTests() {

--- a/platforms/reference/tests/TestReferenceCheckpoints.cpp
+++ b/platforms/reference/tests/TestReferenceCheckpoints.cpp
@@ -92,6 +92,15 @@ void testCheckpoint() {
    integrator.step(10);
    State s4 = context.getState(State::Positions | State::Velocities | State::Parameters);
    compareStates(s2, s4);
+    // See if a checkpoint created from one Context can be loaded into a different one.
+    VerletIntegrator integrator2(0.001);
+    Context context2(system, integrator2, platform);
+    stream1.seekg(0, stream1.beg);
+    context2.loadCheckpoint(stream1);
+    State s5 = context2.getState(State::Positions | State::Velocities | State::Parameters | State::Energy);
+    compareStates(s1, s5);
 }
 void runPlatformTests() {

--- a/plugins/amoeba/openmmapi/include/openmm/AmoebaMultipoleForce.h
+++ b/plugins/amoeba/openmmapi/include/openmm/AmoebaMultipoleForce.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
 * Authors: Mark Friedrichs, Peter Eastman                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -72,14 +72,25 @@ public:
    enum PolarizationType {
        /**
-         * Mutual polarization
+         * Full mutually induced polarization.  The dipoles are iterated until they converge to the accuracy specified
+         * by getMutualInducedTargetEpsilon().
         */
        Mutual = 0,
        /**
-         * Direct polarization
+         * Direct polarization approximation.  The induced dipoles depend only on the fixed multipoles, not on other
+         * induced dipoles.
         */
-        Direct = 1
+        Direct = 1,
+        /**
+         * Extrapolated perturbation theory approximation.  The dipoles are iterated a few times, and then an analytic
+         * approximation is used to extrapolate to the fully converged values.  Call setExtrapolationCoefficients()
+         * to set the coefficients used for the extrapolation.  The default coefficients used in this release are
+         * [0, -0.3, 0, 1.3], but be aware that those may change in a future release.
+         */
+        Extrapolated = 2
    };
    enum MultipoleAxisTypes { ZThenX = 0, Bisector = 1, ZBisect = 2, ThreeFold = 3, ZOnly = 4, NoAxisType = 5, LastAxisTypeIndex = 6 };
@@ -298,6 +309,23 @@ public:
     */
    void setMutualInducedTargetEpsilon(double inputMutualInducedTargetEpsilon);
+    /**
+     * Set the coefficients for the mu_0, mu_1, mu_2, ..., mu_n terms in the extrapolation
+     * algorithm for induced dipoles.
+     *
+     * @param coefficients      a vector whose mth entry specifies the coefficient for mu_m.  The length of this
+     *                          vector determines how many iterations are performed.
+     *
+     */
+    void setExtrapolationCoefficients(const std::vector<double> &coefficients);
+    /**
+     * Get the coefficients for the mu_0, mu_1, mu_2, ..., mu_n terms in the extrapolation
+     * algorithm for induced dipoles.  In this release, the default values for the coefficients are
+     * [0, -0.3, 0, 1.3], but be aware that those may change in a future release.
+     */
+    const std::vector<double>& getExtrapolationCoefficients() const;
    /**
     * Get the error tolerance for Ewald summation.  This corresponds to the fractional error in the forces
     * which is acceptable.  This value is used to select the grid dimensions and separation (alpha)
@@ -384,6 +412,8 @@ private:
    int pmeBSplineOrder;
    std::vector<int> pmeGridDimension;
    int mutualInducedMaxIterations;
+    std::vector<double> extrapolationCoefficients;
    double mutualInducedTargetEpsilon;
    double scalingDistanceCutoff;
    double electricConstant;

--- a/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
 * Authors:                                                                   *
 * Contributors:                                                              *
 *                                                                            *
@@ -43,6 +43,10 @@ AmoebaMultipoleForce::AmoebaMultipoleForce() : nonbondedMethod(NoCutoff), polari
                                               mutualInducedTargetEpsilon(1.0e-02), scalingDistanceCutoff(100.0), electricConstant(138.9354558456), aewald(0.0) {
    pmeGridDimension.resize(3);
    pmeGridDimension[0] = pmeGridDimension[1] = pmeGridDimension[2];
+    extrapolationCoefficients.push_back(0.0);
+    extrapolationCoefficients.push_back(-0.3);
+    extrapolationCoefficients.push_back(0.0);
+    extrapolationCoefficients.push_back(1.3);
 }
 AmoebaMultipoleForce::NonbondedMethod AmoebaMultipoleForce::getNonbondedMethod() const {
@@ -61,6 +65,14 @@ void AmoebaMultipoleForce::setPolarizationType(AmoebaMultipoleForce::Polarizatio
    polarizationType = type;
 }
+void AmoebaMultipoleForce::setExtrapolationCoefficients(const std::vector<double> &coefficients) {
+    extrapolationCoefficients = coefficients;
+}
+const std::vector<double> & AmoebaMultipoleForce::getExtrapolationCoefficients() const {
+    return extrapolationCoefficients;
+}
 double AmoebaMultipoleForce::getCutoffDistance() const {
    return cutoffDistance;
 }

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -385,14 +385,17 @@ private:
        const char* getSortKey() const {return "value.y";}
    };
    void initializeScaleFactors();
+    void computeInducedField(void** recipBoxVectorPointer);
    bool iterateDipolesByDIIS(int iteration);
+    void computeExtrapolatedDipoles(void** recipBoxVectorPointer);
    void ensureMultipolesValid(ContextImpl& context);
    template <class T, class T4, class M4> void computeSystemMultipoleMoments(ContextImpl& context, std::vector<double>& outputMultipoleMoments);
-    int numMultipoles, maxInducedIterations;
+    int numMultipoles, maxInducedIterations, maxExtrapolationOrder;
    int fixedFieldThreads, inducedFieldThreads, electrostaticsThreads;
    int gridSizeX, gridSizeY, gridSizeZ;
    double alpha, inducedEpsilon;
    bool usePME, hasQuadrupoles, hasInitializedScaleFactors, hasInitializedFFT, multipolesAreValid;
+    AmoebaMultipoleForce::PolarizationType polarizationType;
    CudaContext& cu;
    const System& system;
    std::vector<int3> covalentFlagValues;
@@ -422,6 +425,18 @@ private:
    CudaArray* prevErrors;
    CudaArray* diisMatrix;
    CudaArray* diisCoefficients;
+    CudaArray* extrapolatedDipole;
+    CudaArray* extrapolatedDipolePolar;
+    CudaArray* extrapolatedDipoleGk;
+    CudaArray* extrapolatedDipoleGkPolar;
+    CudaArray* inducedDipoleFieldGradient;
+    CudaArray* inducedDipoleFieldGradientPolar;
+    CudaArray* inducedDipoleFieldGradientGk;
+    CudaArray* inducedDipoleFieldGradientGkPolar;
+    CudaArray* extrapolatedDipoleFieldGradient;
+    CudaArray* extrapolatedDipoleFieldGradientPolar;
+    CudaArray* extrapolatedDipoleFieldGradientGk;
+    CudaArray* extrapolatedDipoleFieldGradientGkPolar;
    CudaArray* polarizability;
    CudaArray* covalentFlags;
    CudaArray* polarizationGroupFlags;
@@ -444,6 +459,7 @@ private:
    CUfunction pmeGridIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
    CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
    CUfunction recordDIISDipolesKernel, buildMatrixKernel;
+    CUfunction initExtrapolatedKernel, iterateExtrapolatedKernel, computeExtrapolatedKernel, addExtrapolatedGradientKernel;
    CUfunction pmeTransformMultipolesKernel, pmeTransformPotentialKernel;
    CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel;
    static const int PmeOrder = 5;
@@ -512,6 +528,7 @@ private:
    const System& system;
    bool includeSurfaceArea, hasInitializedKernels;
    int computeBornSumThreads, gkForceThreads, chainRuleThreads, ediffThreads;
+    AmoebaMultipoleForce::PolarizationType polarizationType;
    std::map<std::string, std::string> defines;
    CudaArray* params;
    CudaArray* bornSum;

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaInPlaneForce.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaInPlaneForce.cu
@@ -39,15 +39,23 @@ real ym = zcp*xap - xcp*zap;
 real zm = xcp*yap - ycp*xap;
 real rm = max(SQRT(xm*xm + ym*ym + zm*zm), (real) 1e-6f);
-real dot = xap*xcp + yap*ycp + zap*zcp;
+real dotp = xap*xcp + yap*ycp + zap*zcp;
 real product = SQRT(rap2*rcp2);
-real cosine = (product > 0 ? (dot/product) : 0);
+real cosine = (product > 0 ? (dotp/product) : 0);
 cosine = max(min(cosine, (real) 1), (real) -1);
-real angle = ACOS(cosine);
+real angle;
+if (cosine > 0.99f || cosine < -0.99f) {
+    real3 cross_prod = cross(make_real3(xap, yap, zap), make_real3(xcp, ycp, zcp));
+    angle = ASIN(SQRT(dot(cross_prod, cross_prod)/(rap2*rcp2)))*RAD_TO_DEG;
+    if (cosine < 0.0f)
+        angle = 180-angle;
+}
+else
+    angle = ACOS(cosine)*RAD_TO_DEG;
 // if product == 0, set force/energy to 0
-real deltaIdeal = (product > 0 ? (angle*RAD_TO_DEG - angleParams.x) : 0);
+real deltaIdeal = (product > 0 ? (angle - angleParams.x) : 0);
 real deltaIdeal2 = deltaIdeal*deltaIdeal;
 real deltaIdeal3 = deltaIdeal*deltaIdeal2;
 real deltaIdeal4 = deltaIdeal2*deltaIdeal2;
@@ -111,4 +119,4 @@ real dedzid = -dedzia - dedzib - dedzic;
 real3 force1 = make_real3(-dedxia, -dedyia, -dedzia);
 real3 force2 = make_real3(-dedxib, -dedyib, -dedzib);
 real3 force3 = make_real3(-dedxic, -dedyic, -dedzic);
 real3 force4 = make_real3(-dedxid, -dedyid, -dedzid);
\ No newline at end of file
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaStretchBendForce.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaStretchBendForce.cu
@@ -17,18 +17,27 @@ real zp = xcb*yab - ycb*xab;
 real rp = SQRT(xp*xp + yp*yp + zp*zp);
-real dot = xab*xcb + yab*ycb + zab*zcb;
+real dotp = xab*xcb + yab*ycb + zab*zcb;
-real cosine = rab*rcb > 0 ? (dot / (rab*rcb)) : (real) 1;
+real cosine = rab*rcb > 0 ? (dotp / (rab*rcb)) : (real) 1;
 cosine = (cosine > 1 ? (real) 1 : cosine);
 cosine = (cosine < -1 ? -(real) 1 : cosine);
-real angle = ACOS(cosine);
+real angle;
+if (cosine > 0.99f || cosine < -0.99f) {
+    // Highly unlikely a stretch-bend angle will be near 0 or 180, but just in case...
+    real3 cross_prod = cross(make_real3(xab, yab, zab), make_real3(xcb, ycb, zcb));
+    angle = ASIN(SQRT(dot(cross_prod, cross_prod))/(rab*rcb))*RAD_TO_DEG;
+    if (cosine < 0.0f)
+        angle = 180-angle;
+}
+else
+    angle = ACOS(cosine)*RAD_TO_DEG;
 // find chain rule terms for the bond angle deviation
 float3 parameters = PARAMS[index];
 float2 force_constants = FORCE_CONSTANTS[index];
-real dt = RAD_TO_DEG*(angle - parameters.z);
+real dt = angle - RAD_TO_DEG*parameters.z;
 real terma = rab*rp != 0 ? (-RAD_TO_DEG/(rab*rab*rp)) : (real) 0;
 real termc = rcb*rp != 0 ? (RAD_TO_DEG/(rcb*rcb*rp)) : (real) 0;

--- a/plugins/amoeba/platforms/cuda/src/kernels/gkEDiffPairForce.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/gkEDiffPairForce.cu
@@ -365,7 +365,22 @@ __device__ void computeOneEDiffInteractionT3(AtomData4& atom1, volatile AtomData
    // correction to convert mutual to direct polarization force
-#ifdef DIRECT_POLARIZATION
+#ifdef MUTUAL_POLARIZATION
+    real findmp1 = uscale*(scip2*ddsc3_1 - ddsc5_1*(sci3*scip4+scip3*sci4));
+    real findmp2 = uscale*(scip2*ddsc3_2 - ddsc5_2*(sci3*scip4+scip3*sci4));
+    real findmp3 = uscale*(scip2*ddsc3_3 - ddsc5_3*(sci3*scip4+scip3*sci4));
+    ftm2i1 -= 0.5f*findmp1;
+    ftm2i2 -= 0.5f*findmp2;
+    ftm2i3 -= 0.5f*findmp3;
+    real sci3X = sci3 - sci3Y;
+    real sci4X = sci4 - sci4Y;
+    real scip3X = scip3 - scip3Y;
+    real scip4X = scip4 - scip4Y;
+    ftm2i1 += 0.5f*uscale*(-ddsc5_1*(sci3X*scip4X+scip3X*sci4X));
+    ftm2i2 += 0.5f*uscale*(-ddsc5_2*(sci3X*scip4X+scip3X*sci4X));
+    ftm2i3 += 0.5f*uscale*(-ddsc5_3*(sci3X*scip4X+scip3X*sci4X)); 
+#else
    real gfd = (scip2*scale3i - 5*rr2*(scip3*sci4+sci3*scip4)*scale5i);
    real fdir1 = gfd*xr + scale5i* (sci4*atom1.inducedDipolePolarS.x+scip4*atom1.inducedDipoleS.x + sci3*atom2.inducedDipolePolarS.x+scip3*atom2.inducedDipoleS.x);
    real fdir2 = gfd*yr + scale5i* (sci4*atom1.inducedDipolePolarS.y+scip4*atom1.inducedDipoleS.y + sci3*atom2.inducedDipolePolarS.y+scip3*atom2.inducedDipoleS.y);
@@ -385,21 +400,6 @@ __device__ void computeOneEDiffInteractionT3(AtomData4& atom1, volatile AtomData
    ftm2i1 += 0.5f*fdir1;
    ftm2i2 += 0.5f*fdir2;
    ftm2i3 += 0.5f*fdir3;
-#else
-    real findmp1 = uscale*(scip2*ddsc3_1 - ddsc5_1*(sci3*scip4+scip3*sci4));
-    real findmp2 = uscale*(scip2*ddsc3_2 - ddsc5_2*(sci3*scip4+scip3*sci4));
-    real findmp3 = uscale*(scip2*ddsc3_3 - ddsc5_3*(sci3*scip4+scip3*sci4));
-    ftm2i1 -= 0.5f*findmp1;
-    ftm2i2 -= 0.5f*findmp2;
-    ftm2i3 -= 0.5f*findmp3;
-    real sci3X = sci3 - sci3Y;
-    real sci4X = sci4 - sci4Y;
-    real scip3X = scip3 - scip3Y;
-    real scip4X = scip4 - scip4Y;
-    ftm2i1 += 0.5f*uscale*(-ddsc5_1*(sci3X*scip4X+scip3X*sci4X));
-    ftm2i2 += 0.5f*uscale*(-ddsc5_2*(sci3X*scip4X+scip3X*sci4X));
-    ftm2i3 += 0.5f*uscale*(-ddsc5_3*(sci3X*scip4X+scip3X*sci4X)); 
 #endif
 #endif

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
@@ -327,7 +327,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
    real iEIY = qiUinpI.x*Vijp[1] + qiUindI.x*Vijd[1] - qiUinpI.y*Vijp[0] - qiUindI.y*Vijd[0];
    real iEJY = qiUinpJ.x*Vjip[1] + qiUindJ.x*Vjid[1] - qiUinpJ.y*Vjip[0] - qiUindJ.y*Vjid[0];
-#ifdef USE_MUTUAL_POLARIZATION
+#ifdef MUTUAL_POLARIZATION
    // Uind-Uind terms (m=0)
    real eCoef = -4*rInvVec[3]*thole_d0;
    real dCoef = 6*rInvVec[4]*dthole_d0;

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
@@ -643,8 +643,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
    // Process the atoms in spatially sorted order.  This improves cache performance when loading
    // the grid values.
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
-        int m = pmeAtomGridIndex[i].x;
+        int m = pmeAtomGridIndex[atom].x;
        real4 pos = posq[m];
        pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
        pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
@@ -1051,7 +1051,7 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
            f.x += (inducedDipole[k]+inducedDipolePolar[k])*phi[i+NUM_ATOMS*j1];
            f.y += (inducedDipole[k]+inducedDipolePolar[k])*phi[i+NUM_ATOMS*j2];
            f.z += (inducedDipole[k]+inducedDipolePolar[k])*phi[i+NUM_ATOMS*j3];
-#ifndef DIRECT_POLARIZATION
+#ifdef MUTUAL_POLARIZATION
            f.x += (inducedDipole[k]*phip[i+NUM_ATOMS*j1] + inducedDipolePolar[k]*phid[i+NUM_ATOMS*j1]);
            f.y += (inducedDipole[k]*phip[i+NUM_ATOMS*j2] + inducedDipolePolar[k]*phid[i+NUM_ATOMS*j2]);
            f.z += (inducedDipole[k]*phip[i+NUM_ATOMS*j3] + inducedDipolePolar[k]*phid[i+NUM_ATOMS*j3]);
@@ -1073,8 +1073,12 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.25f*EPSILON_FACTOR*energy;
 }
-extern "C" __global__ void recordInducedFieldDipoles(const real* __restrict__ phid, real* const __restrict__ phip,
+extern "C" __global__ void recordInducedFieldDipoles(const real* __restrict__ phid, real* const __restrict__ phip, long long* __restrict__ inducedField,
-        long long* __restrict__ inducedField, long long* __restrict__ inducedFieldPolar, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
+        long long* __restrict__ inducedFieldPolar, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar,
+#ifdef EXTRAPOLATED_POLARIZATION
+        unsigned long long* __restrict__ fieldGradient, unsigned long long* __restrict__ fieldGradientPolar,
+#endif
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    __shared__ real fracToCart[3][3];
    if (threadIdx.x == 0) {
        fracToCart[0][0] = GRID_SIZE_X*recipBoxVecX.x;
@@ -1088,12 +1092,62 @@ extern "C" __global__ void recordInducedFieldDipoles(const real* __restrict__ ph
        fracToCart[2][2] = GRID_SIZE_Z*recipBoxVecZ.z;
    }
    __syncthreads();
+    real selfDipoleScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI;
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
-        inducedField[i] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[0][0] + phid[i+NUM_ATOMS*2]*fracToCart[0][1] + phid[i+NUM_ATOMS*3]*fracToCart[0][2]));
+        inducedField[i] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[0][0] + phid[i+NUM_ATOMS*2]*fracToCart[0][1] + phid[i+NUM_ATOMS*3]*fracToCart[0][2] - selfDipoleScale*inducedDipole[3*i]));
-        inducedField[i+PADDED_NUM_ATOMS] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[1][0] + phid[i+NUM_ATOMS*2]*fracToCart[1][1] + phid[i+NUM_ATOMS*3]*fracToCart[1][2]));
+        inducedField[i+PADDED_NUM_ATOMS] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[1][0] + phid[i+NUM_ATOMS*2]*fracToCart[1][1] + phid[i+NUM_ATOMS*3]*fracToCart[1][2] - selfDipoleScale*inducedDipole[3*i+1]));
-        inducedField[i+PADDED_NUM_ATOMS*2] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[2][0] + phid[i+NUM_ATOMS*2]*fracToCart[2][1] + phid[i+NUM_ATOMS*3]*fracToCart[2][2]));
+        inducedField[i+PADDED_NUM_ATOMS*2] -= (long long) (0x100000000*(phid[i+NUM_ATOMS]*fracToCart[2][0] + phid[i+NUM_ATOMS*2]*fracToCart[2][1] + phid[i+NUM_ATOMS*3]*fracToCart[2][2] - selfDipoleScale*inducedDipole[3*i+2]));
-        inducedFieldPolar[i] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[0][0] + phip[i+NUM_ATOMS*2]*fracToCart[0][1] + phip[i+NUM_ATOMS*3]*fracToCart[0][2]));
+        inducedFieldPolar[i] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[0][0] + phip[i+NUM_ATOMS*2]*fracToCart[0][1] + phip[i+NUM_ATOMS*3]*fracToCart[0][2] - selfDipoleScale*inducedDipolePolar[3*i]));
-        inducedFieldPolar[i+PADDED_NUM_ATOMS] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[1][0] + phip[i+NUM_ATOMS*2]*fracToCart[1][1] + phip[i+NUM_ATOMS*3]*fracToCart[1][2]));
+        inducedFieldPolar[i+PADDED_NUM_ATOMS] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[1][0] + phip[i+NUM_ATOMS*2]*fracToCart[1][1] + phip[i+NUM_ATOMS*3]*fracToCart[1][2] - selfDipoleScale*inducedDipolePolar[3*i+1]));
-        inducedFieldPolar[i+PADDED_NUM_ATOMS*2] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[2][0] + phip[i+NUM_ATOMS*2]*fracToCart[2][1] + phip[i+NUM_ATOMS*3]*fracToCart[2][2]));
+        inducedFieldPolar[i+PADDED_NUM_ATOMS*2] -= (long long) (0x100000000*(phip[i+NUM_ATOMS]*fracToCart[2][0] + phip[i+NUM_ATOMS*2]*fracToCart[2][1] + phip[i+NUM_ATOMS*3]*fracToCart[2][2] - selfDipoleScale*inducedDipolePolar[3*i+2]));
+#ifdef EXTRAPOLATED_POLARIZATION
+        // Compute and store the field gradients for later use.
+        real EmatD[3][3] = {
+            {phid[i+NUM_ATOMS*4], phid[i+NUM_ATOMS*7], phid[i+NUM_ATOMS*8]},
+            {phid[i+NUM_ATOMS*7], phid[i+NUM_ATOMS*5], phid[i+NUM_ATOMS*9]},
+            {phid[i+NUM_ATOMS*8], phid[i+NUM_ATOMS*9], phid[i+NUM_ATOMS*6]}
+        };
+        real Exx = 0, Eyy = 0, Ezz = 0, Exy = 0, Exz = 0, Eyz = 0;
+        for (int k = 0; k < 3; ++k) {
+            for (int l = 0; l < 3; ++l) {
+                Exx += fracToCart[0][k] * EmatD[k][l] * fracToCart[0][l];
+                Eyy += fracToCart[1][k] * EmatD[k][l] * fracToCart[1][l];
+                Ezz += fracToCart[2][k] * EmatD[k][l] * fracToCart[2][l];
+                Exy += fracToCart[0][k] * EmatD[k][l] * fracToCart[1][l];
+                Exz += fracToCart[0][k] * EmatD[k][l] * fracToCart[2][l];
+                Eyz += fracToCart[1][k] * EmatD[k][l] * fracToCart[2][l];
+            }
+        }
+        atomicAdd(&fieldGradient[6*i+0], static_cast<unsigned long long>((long long) (-Exx*0x100000000)));
+        atomicAdd(&fieldGradient[6*i+1], static_cast<unsigned long long>((long long) (-Eyy*0x100000000)));
+        atomicAdd(&fieldGradient[6*i+2], static_cast<unsigned long long>((long long) (-Ezz*0x100000000)));
+        atomicAdd(&fieldGradient[6*i+3], static_cast<unsigned long long>((long long) (-Exy*0x100000000)));
+        atomicAdd(&fieldGradient[6*i+4], static_cast<unsigned long long>((long long) (-Exz*0x100000000)));
+        atomicAdd(&fieldGradient[6*i+5], static_cast<unsigned long long>((long long) (-Eyz*0x100000000)));
+        real EmatP[3][3] = {
+            {phip[i+NUM_ATOMS*4], phip[i+NUM_ATOMS*7], phip[i+NUM_ATOMS*8]},
+            {phip[i+NUM_ATOMS*7], phip[i+NUM_ATOMS*5], phip[i+NUM_ATOMS*9]},
+            {phip[i+NUM_ATOMS*8], phip[i+NUM_ATOMS*9], phip[i+NUM_ATOMS*6]}
+        };
+        Exx = 0; Eyy = 0; Ezz = 0; Exy = 0; Exz = 0; Eyz = 0;
+        for (int k = 0; k < 3; ++k) {
+            for (int l = 0; l < 3; ++l) {
+                Exx += fracToCart[0][k] * EmatP[k][l] * fracToCart[0][l];
+                Eyy += fracToCart[1][k] * EmatP[k][l] * fracToCart[1][l];
+                Ezz += fracToCart[2][k] * EmatP[k][l] * fracToCart[2][l];
+                Exy += fracToCart[0][k] * EmatP[k][l] * fracToCart[1][l];
+                Exz += fracToCart[0][k] * EmatP[k][l] * fracToCart[2][l];
+                Eyz += fracToCart[1][k] * EmatP[k][l] * fracToCart[2][l];
+            }
+        }
+        atomicAdd(&fieldGradientPolar[6*i+0], static_cast<unsigned long long>((long long) (-Exx*0x100000000)));
+        atomicAdd(&fieldGradientPolar[6*i+1], static_cast<unsigned long long>((long long) (-Eyy*0x100000000)));
+        atomicAdd(&fieldGradientPolar[6*i+2], static_cast<unsigned long long>((long long) (-Ezz*0x100000000)));
+        atomicAdd(&fieldGradientPolar[6*i+3], static_cast<unsigned long long>((long long) (-Exy*0x100000000)));
+        atomicAdd(&fieldGradientPolar[6*i+4], static_cast<unsigned long long>((long long) (-Exz*0x100000000)));
+        atomicAdd(&fieldGradientPolar[6*i+5], static_cast<unsigned long long>((long long) (-Eyz*0x100000000)));
+#endif
    }
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
@@ -365,7 +365,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
    real iEIY = qiUinpI.x*Vijp[1] + qiUindI.x*Vijd[1] - qiUinpI.y*Vijp[0] - qiUindI.y*Vijd[0];
    real iEJY = qiUinpJ.x*Vjip[1] + qiUindJ.x*Vjid[1] - qiUinpJ.y*Vjip[0] - qiUindJ.y*Vjid[0];
-#ifdef USE_MUTUAL_POLARIZATION
+#ifdef MUTUAL_POLARIZATION
    // Uind-Uind terms (m=0)
    real eCoef = -fourThirds*rInvVec[3]*(3*(thole_d0 + bVec[3]) + alphaRVec[3]*X);
    real dCoef = rInvVec[4]*(6*(dthole_d0 + bVec[3]) + 4*alphaRVec[5]*X);