Merge remote-tracking branch 'upstream/master' into vagrant

c7aa1d00 · kyleabeauchamp · aea4e454 · f7127d33 · c7aa1d00 · c7aa1d00
Commit c7aa1d00 authored Jul 21, 2014 by kyleabeauchamp
20 changed files
--- a/.travis.yml
+++ b/.travis.yml
-language: c
+language: cpp
+compiler:
+  - clang
-install:
+before_install:
-  - source tools/ci/install.sh
+  - sudo apt-get update -qq
-  - export PYTHONUNBUFFERED=true
+  - sudo apt-get install -qq libpcre3 libpcre3-dev gromacs
+  - sudo apt-get install -qq swig doxygen llvm-3.3
+  - sudo apt-get install -qq python-numpy python-scipy python-pip
+  - sudo pip install nose
+  - export ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer-3.3
 script:
-  - export CC="clang++"
+  - cmake -DCMAKE_INSTALL_PREFIX=$HOME/OpenMM .
-  - source deactivate
+  - make -j2
-  - conda install --yes conda-build
+  - make -j2 install
-  - # Build the conda package, testing build before packaging.
+  - sudo make PythonInstall
-  - conda build tools/conda-recipe
+  - # run all of the tests
-  - # Install the conda package locally.
+  - ctest -j2 -V
-  - source activate $python
+  - # collect the failed tests from LastTestsFailed.log into FailedTests.log, in the list format that "ctest -I" reads
-  - conda install $HOME/miniconda/conda-bld/linux-64/openmm-dev-*
+  - python -c 'fn = "Testing/Temporary/LastTestsFailed.log"; import os; os.path.exists(fn) or exit(0); l = [line.split(":")[0] for line in open(fn)]; triplets = zip(l, l, [","]*len(l)); print "".join(",".join(t) for t in triplets)' > FailedTests.log
-  - conda list -e
+  - # rerun all of the failed tests
-  - # Run the Python tests.
+  - if [ -s FailedTests.log ]; then ctest -V -I FailedTests.log; fi;
-  - pushd .
+  - # run the python tests too
-  - cd wrappers/python/tests
+  - cd python/tests
  - nosetests -vv --processes=-1 --process-timeout=200
-  - popd
-env:
-  global:
-    # encrypted BINSTAR_TOKEN for push of dev package to binstar
-    - secure: Qz3pEYXXFnNQ/WK+15ad4cdbLJvzgCIZRwKD9fLiS3CDO2ldAQWxzaz8RQOwqbFtZUWu7lQpr+GukNJz5p0w18QEto+BxLYG9aW5mjoc+F2vCjyWFjkwnJ/Z/3uBKTcr5x9Y7HKaPGivaJ4BNACifjt7cCpeVJzV6u2+bBgSoHc=
-  matrix:
-    - python=2.7  CONDA_PY=27
-    #- python=3.3  CONDA_PY=33
-after_success:
-  - echo "after_success"
-  - source tools/ci/after_success.sh
--- a/docs/usersguide/application.rst
+++ b/docs/usersguide/application.rst
@@ -651,19 +651,20 @@ For the main force field, OpenMM provides the following options:
 .. tabularcolumns:: |l|L|
-=================  ================================================================================
+=====================  ================================================================================
-File               Force Field                                                                     
+File                   Force Field                                                                     
-=================  ================================================================================
+=====================  ================================================================================
-amber96.xml        AMBER96\ :cite:`Kollman1997`
+amber96.xml            AMBER96\ :cite:`Kollman1997`
-amber99sb.xml      AMBER99\ :cite:`Wang2000` with modified backbone torsions\ :cite:`Hornak2006`
+amber99sb.xml          AMBER99\ :cite:`Wang2000` with modified backbone torsions\ :cite:`Hornak2006`
-amber99sbildn.xml  AMBER99SB plus improved side chain torsions\ :cite:`Lindorff-Larsen2010`
+amber99sbildn.xml      AMBER99SB plus improved side chain torsions\ :cite:`Lindorff-Larsen2010`
-amber99sbnmr.xml   AMBER99SB with modifications to fit NMR data\ :cite:`Li2010`
+amber99sbnmr.xml       AMBER99SB with modifications to fit NMR data\ :cite:`Li2010`
-amber03.xml        AMBER03\ :cite:`Duan2003`
+amber03.xml            AMBER03\ :cite:`Duan2003`
-amber10.xml        AMBER10
+amber10.xml            AMBER10
-amoeba2009.xml     AMOEBA 2009\ :cite:`Ren2002`.  This force field is deprecated.  It is 
+amoeba2009.xml         AMOEBA 2009\ :cite:`Ren2002`.  This force field is deprecated.  It is 
-                   recommended to use AMOEBA 2013 instead.
+                       recommended to use AMOEBA 2013 instead.
-amoeba2013.xml     AMOEBA 2013\ :cite:`Shi2013`
+amoeba2013.xml         AMOEBA 2013\ :cite:`Shi2013`
-=================  ================================================================================
+charmm_polar_2013.xml  CHARMM 2013 polarizable force field\ :cite:`Lopes2013`
+=====================  ================================================================================
 The AMBER files do not include parameters for water molecules.  This allows you
@@ -686,10 +687,10 @@ swm4ndp.xml  SWM4-NDP water model\ :cite:`Lamoureux2006`
 ===========  ============================================
-For the AMOEBA force field, only one explicit water model is currently available
+For the polarizable force fields (AMOEBA and CHARMM), only one explicit water model
-and the water parameters are included in the file :code:`amoeba2009.xml`\ .
+is currently available and the water parameters are included in the same file as
-Also the AMOEBA force field file only includes the parameters for amino acids
+the macromolecule parameters.  Also, the polarizable force fields only include
-and ions; nucleic acids will be included in a future release.
+parameters for amino acids and ions, not for nucleic acids.
 If you want to include an implicit solvation model, you can also specify one of
 the following files:

--- a/docs/usersguide/references.bib
+++ b/docs/usersguide/references.bib
@@ -215,6 +215,17 @@
   type = {Journal Article}
 }
+@article{Lopes2013,
+    author = {Lopes, Pedro E. M. and Huang, Jing and Shim, Jihyun and Luo, Yun and Li, Hui and Roux, Benoît and MacKerell, Alexander D.},
+    title = {Polarizable Force Field for Peptides and Proteins Based on the Classical Drude Oscillator},
+    journal = {Journal of Chemical Theory and Computation},
+    volume = {9},
+    number = {12},
+    pages = {5430-5449},
+    year = {2013},
+    type = {Journal Article}
+}
 @article{Mahoney2000,
   author = {Mahoney, Michael W. and Jorgensen, William L.},
   title = {A five-site model for liquid water and the reproduction of the density anomaly by rigid, nonpolarizable potential functions},

--- a/openmmapi/include/openmm/CustomNonbondedForce.h
+++ b/openmmapi/include/openmm/CustomNonbondedForce.h
@@ -157,6 +157,7 @@ public:
     *                  of r, the distance between them, as well as any global and per-particle parameters
     */
    explicit CustomNonbondedForce(const std::string& energy);
+    /**
+     * Create a CustomNonbondedForce as a copy of an existing one.  All settings, parameters,
+     * particles, exclusions and interaction groups are copied, and each tabulated function is
+     * duplicated with its Copy() method so the two forces do not share function objects.
+     *
+     * @param rhs    the CustomNonbondedForce to copy
+     */
+    CustomNonbondedForce(const CustomNonbondedForce& rhs); // copy constructor
    ~CustomNonbondedForce();
    /**
     * Get the number of particles for which force field parameters have been defined.
@@ -466,6 +467,7 @@ public:
 protected:
    ForceImpl* createImpl() const;
 private:
+    // REMEMBER TO UPDATE THE COPY CONSTRUCTOR IF YOU ADD ANY NEW FIELDS !!
    class ParticleInfo;
    class PerParticleParameterInfo;
    class GlobalParameterInfo;

--- a/openmmapi/include/openmm/TabulatedFunction.h
+++ b/openmmapi/include/openmm/TabulatedFunction.h
@@ -59,6 +59,7 @@ class OPENMM_EXPORT TabulatedFunction {
 public:
    virtual ~TabulatedFunction() {
    }
+    /**
+     * Create a copy of this tabulated function.  This is pure virtual: every concrete
+     * subclass must provide its own implementation.
+     *
+     * NOTE(review): the returned object appears to be owned by the caller, who must delete it - confirm.
+     *
+     * @return a pointer to the new copy
+     */
+    virtual TabulatedFunction* Copy() const = 0;
 };
 /**
@@ -96,6 +97,10 @@ public:
     * @param max            the value of x corresponding to the last element of values
     */
    void setFunctionParameters(const std::vector<double>& values, double min, double max);
+    /**
+     * Create a deep copy of the tabulated function.
+     */
+    Continuous1DFunction* Copy() const;
 private:
    std::vector<double> values;
    double min, max;
@@ -151,6 +156,10 @@ public:
     * @param ymax       the value of y corresponding to the last element of values
     */
    void setFunctionParameters(int xsize, int ysize, const std::vector<double>& values, double xmin, double xmax, double ymin, double ymax);
+    /**
+     * Create a deep copy of the tabulated function
+     */
+    Continuous2DFunction* Copy() const;
 private:
    std::vector<double> values;
    int xsize, ysize;
@@ -222,6 +231,10 @@ public:
     * @param zmax       the value of z corresponding to the last element of values
     */
    void setFunctionParameters(int xsize, int ysize, int zsize, const std::vector<double>& values, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax);
+    /**
+     * Create a deep copy of the tabulated function
+     */
+    Continuous3DFunction* Copy() const;
 private:
    std::vector<double> values;
    int xsize, ysize, zsize;
@@ -253,6 +266,10 @@ public:
     * @param values         the tabulated values of the function f(x)
     */
    void setFunctionParameters(const std::vector<double>& values);
+    /**
+     * Create a deep copy of the tabulated function
+     */
+    Discrete1DFunction* Copy() const;
 private:
    std::vector<double> values;
 };
@@ -291,6 +308,10 @@ public:
     *                  values[i+xsize*j] = f(i,j).  This must be of length xsize*ysize.
     */
    void setFunctionParameters(int xsize, int ysize, const std::vector<double>& values);
+    /**
+     * Create a deep copy of the tabulated function
+     */
+    Discrete2DFunction* Copy() const;
 private:
    int xsize, ysize;
    std::vector<double> values;
@@ -333,6 +354,10 @@ public:
     *                  values[i+xsize*j+xsize*ysize*k] = f(i,j,k).  This must be of length xsize*ysize*zsize.
     */
    void setFunctionParameters(int xsize, int ysize, int zsize, const std::vector<double>& values);
+    /**
+     * Create a deep copy of the tabulated function
+     */
+    Discrete3DFunction* Copy() const;
 private:
    int xsize, ysize, zsize;
    std::vector<double> values;

--- a/openmmapi/include/openmm/internal/CustomNonbondedForceImpl.h
+++ b/openmmapi/include/openmm/internal/CustomNonbondedForceImpl.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2013 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2014 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -67,7 +67,6 @@ public:
     */
    static double calcLongRangeCorrection(const CustomNonbondedForce& force, const Context& context);
 private:
-    class TabulatedFunction;
    static double integrateInteraction(Lepton::CompiledExpression& expression, const std::vector<double>& params1, const std::vector<double>& params2,
            const CustomNonbondedForce& force, const Context& context);
    const CustomNonbondedForce& owner;

--- a/openmmapi/src/CustomNonbondedForce.cpp
+++ b/openmmapi/src/CustomNonbondedForce.cpp
@@ -51,6 +51,23 @@ CustomNonbondedForce::CustomNonbondedForce(const string& energy) : energyExpress
    switchingDistance(-1.0), useSwitchingFunction(false), useLongRangeCorrection(false) {
 }
+// Copy constructor: duplicates every field of rhs and deep copies its tabulated functions.
+// The base class is copy-constructed explicitly; without an initializer, a user-written copy
+// constructor default-constructs the base subobject and silently drops its state.
+// NOTE(review): assumes the direct base class is Force and that it is copy-constructible - confirm
+// against CustomNonbondedForce.h.
+CustomNonbondedForce::CustomNonbondedForce(const CustomNonbondedForce& rhs) : Force(rhs) {
+    // Plain value members can simply be assigned.
+    energyExpression = rhs.energyExpression;
+    nonbondedMethod = rhs.nonbondedMethod;
+    cutoffDistance = rhs.cutoffDistance;
+    switchingDistance = rhs.switchingDistance;
+    useSwitchingFunction = rhs.useSwitchingFunction;
+    useLongRangeCorrection = rhs.useLongRangeCorrection;
+    parameters = rhs.parameters;
+    globalParameters = rhs.globalParameters;
+    particles = rhs.particles;
+    exclusions = rhs.exclusions;
+    interactionGroups = rhs.interactionGroups;
+    // The destructor deletes every functions[i].function, so each force needs its own
+    // TabulatedFunction objects; copying the raw pointers would lead to a double delete.
+    for (vector<FunctionInfo>::const_iterator it = rhs.functions.begin(); it != rhs.functions.end(); it++)
+        functions.push_back(FunctionInfo(it->name, it->function->Copy()));
+}
 CustomNonbondedForce::~CustomNonbondedForce() {
    for (int i = 0; i < (int) functions.size(); i++)
        delete functions[i].function;

--- a/openmmapi/src/CustomNonbondedForceImpl.cpp
+++ b/openmmapi/src/CustomNonbondedForceImpl.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2013 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2014 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -37,7 +37,7 @@
 #include "openmm/internal/CustomNonbondedForceImpl.h"
 #include "openmm/internal/SplineFitter.h"
 #include "openmm/kernels.h"
-#include "lepton/CustomFunction.h"
+#include "ReferenceTabulatedFunction.h"
 #include "lepton/ParsedExpression.h"
 #include "lepton/Parser.h"
 #include <cmath>
@@ -137,38 +137,6 @@ void CustomNonbondedForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcCustomNonbondedForceKernel>().copyParametersToContext(context, owner);
 }
-class CustomNonbondedForceImpl::TabulatedFunction : public Lepton::CustomFunction {
-public:
-    TabulatedFunction(double min, double max, const vector<double>& values) :
-            min(min), max(max), values(values) {
-        int numValues = values.size();
-        x.resize(numValues);
-        for (int i = 0; i < numValues; i++)
-            x[i] = min+i*(max-min)/(numValues-1);
-        SplineFitter::createNaturalSpline(x, values, derivs);
-    }
-    int getNumArguments() const {
-        return 1;
-    }
-    double evaluate(const double* arguments) const {
-        double t = arguments[0];
-        if (t < min || t > max)
-            return 0.0;
-        return SplineFitter::evaluateSpline(x, values, derivs, t);
-    }
-    double evaluateDerivative(const double* arguments, const int* derivOrder) const {
-        double t = arguments[0];
-        if (t < min || t > max)
-            return 0.0;
-        return SplineFitter::evaluateSplineDerivative(x, values, derivs, t);
-    }
-    CustomFunction* clone() const {
-        return new TabulatedFunction(min, max, values);
-    }
-    double min, max;
-    vector<double> x, values, derivs;
-};
 double CustomNonbondedForceImpl::calcLongRangeCorrection(const CustomNonbondedForce& force, const Context& context) {
    if (force.getNonbondedMethod() == CustomNonbondedForce::NoCutoff || force.getNonbondedMethod() == CustomNonbondedForce::CutoffNonPeriodic)
        return 0.0;
@@ -176,13 +144,8 @@ double CustomNonbondedForceImpl::calcLongRangeCorrection(const CustomNonbondedFo
    // Parse the energy expression.
    map<string, Lepton::CustomFunction*> functions;
-    for (int i = 0; i < force.getNumFunctions(); i++) {
+    for (int i = 0; i < force.getNumFunctions(); i++)
-        string name;
+        functions[force.getTabulatedFunctionName(i)] = createReferenceTabulatedFunction(force.getTabulatedFunction(i));
-        vector<double> values;
-        double min, max;
-        force.getFunctionParameters(i, name, values, min, max);
-        functions[name] = new TabulatedFunction(min, max, values);
-    }
    Lepton::CompiledExpression expression = Lepton::Parser::parse(force.getEnergyFunction(), functions).createCompiledExpression();
    // Identify all particle classes (defined by parameters), and record the class of each particle.

--- a/openmmapi/src/TabulatedFunction.cpp
+++ b/openmmapi/src/TabulatedFunction.cpp
@@ -61,6 +61,13 @@ void Continuous1DFunction::setFunctionParameters(const vector<double>& values, d
    this->max = max;
 }
+// Create a deep copy of this function.  The stored values are handed straight to the
+// constructor, which keeps its own copy of them, so no temporary vector is needed.
+Continuous1DFunction* Continuous1DFunction::Copy() const {
+    return new Continuous1DFunction(values, min, max);
+}
 Continuous2DFunction::Continuous2DFunction(int xsize, int ysize, const vector<double>& values, double xmin, double xmax, double ymin, double ymax) {
    if (xsize < 2 || ysize < 2)
        throw OpenMMException("Continuous2DFunction: must have at least two points along each axis");
@@ -107,6 +114,13 @@ void Continuous2DFunction::setFunctionParameters(int xsize, int ysize, const vec
    this->ymax = ymax;
 }
+// Create a deep copy of this function.  The constructor takes the values by const
+// reference and keeps its own copy of them, so no temporary vector is needed.
+Continuous2DFunction* Continuous2DFunction::Copy() const {
+    return new Continuous2DFunction(xsize, ysize, values, xmin, xmax, ymin, ymax);
+}
 Continuous3DFunction::Continuous3DFunction(int xsize, int ysize, int zsize, const vector<double>& values, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax) {
    if (xsize < 2 || ysize < 2 || zsize < 2)
        throw OpenMMException("Continuous3DFunction: must have at least two points along each axis");
@@ -166,6 +180,14 @@ void Continuous3DFunction::setFunctionParameters(int xsize, int ysize, int zsize
    this->zmax = zmax;
 }
+// Create a deep copy of this function.  The constructor takes the values by const
+// reference and keeps its own copy of them, so no temporary vector is needed.
+Continuous3DFunction* Continuous3DFunction::Copy() const {
+    return new Continuous3DFunction(xsize, ysize, zsize, values, xmin, xmax, ymin, ymax, zmin, zmax);
+}
 Discrete1DFunction::Discrete1DFunction(const vector<double>& values) {
    this->values = values;
 }
@@ -178,6 +200,13 @@ void Discrete1DFunction::setFunctionParameters(const vector<double>& values) {
    this->values = values;
 }
+Discrete1DFunction* Discrete1DFunction::Copy() const {
+    vector<double> new_vec(values.size());
+    for (size_t i = 0; i < values.size(); i++)
+        new_vec[i] = values[i];
+    return new Discrete1DFunction(new_vec);
+}
 Discrete2DFunction::Discrete2DFunction(int xsize, int ysize, const vector<double>& values) {
    if (values.size() != xsize*ysize)
        throw OpenMMException("Discrete2DFunction: incorrect number of values");
@@ -200,6 +229,13 @@ void Discrete2DFunction::setFunctionParameters(int xsize, int ysize, const vecto
    this->values = values;
 }
+Discrete2DFunction* Discrete2DFunction::Copy() const {
+    vector<double> new_vec(values.size());
+    for (size_t i = 0; i < values.size(); i++)
+        new_vec[i] = values[i];
+    return new Discrete2DFunction(xsize, ysize, new_vec);
+}
 Discrete3DFunction::Discrete3DFunction(int xsize, int ysize, int zsize, const vector<double>& values) {
    if (values.size() != xsize*ysize*zsize)
        throw OpenMMException("Discrete3DFunction: incorrect number of values");
@@ -224,3 +260,10 @@ void Discrete3DFunction::setFunctionParameters(int xsize, int ysize, int zsize,
    this->zsize = zsize;
    this->values = values;
 }
+Discrete3DFunction* Discrete3DFunction::Copy() const {
+    vector<double> new_vec(values.size());
+    for (size_t i = 0; i < values.size(); i++)
+        new_vec[i] = values[i];
+    return new Discrete3DFunction(xsize, ysize, zsize, new_vec);
+}
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -2271,7 +2271,7 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
        defines["USE_CUTOFF"] = "1";
    if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic)
        defines["USE_PERIODIC"] = "1";
-    defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
+    defines["LOCAL_MEMORY_SIZE"] = cu.intToString(max(32, cu.getNonbondedUtilities().getForceThreadBlockSize()));
    double cutoff = force.getCutoffDistance();
    defines["CUTOFF_SQUARED"] = cu.doubleToString(cutoff*cutoff);
    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());

--- a/platforms/cuda/src/kernels/customNonbondedGroups.cu
+++ b/platforms/cuda/src/kernels/customNonbondedGroups.cu
-#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
 typedef struct {
    real x, y, z;
    real q;
@@ -19,7 +17,7 @@ extern "C" __global__ void computeInteractionGroups(
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
    real energy = 0.0f;
-    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+    __shared__ AtomData localData[LOCAL_MEMORY_SIZE];
    const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
    const unsigned int endTile = FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps;
@@ -86,4 +84,4 @@ extern "C" __global__ void computeInteractionGroups(
        atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
 }
\ No newline at end of file
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -2289,7 +2289,7 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
        defines["USE_CUTOFF"] = "1";
    if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic)
        defines["USE_PERIODIC"] = "1";
-    defines["THREAD_BLOCK_SIZE"] = cl.intToString(cl.getNonbondedUtilities().getForceThreadBlockSize());
+    defines["LOCAL_MEMORY_SIZE"] = cl.intToString(max(32, cl.getNonbondedUtilities().getForceThreadBlockSize()));
    double cutoff = force.getCutoffDistance();
    defines["CUTOFF_SQUARED"] = cl.doubleToString(cutoff*cutoff);
    defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());

--- a/platforms/opencl/src/kernels/customNonbondedGroups.cl
+++ b/platforms/opencl/src/kernels/customNonbondedGroups.cl
@@ -2,8 +2,6 @@
 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
 #endif
-#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
 typedef struct {
    real x, y, z;
    real q;
@@ -52,7 +50,7 @@ __kernel void computeInteractionGroups(
    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = get_local_id(0) - tgx;           // block warpIndex
    real energy = 0.0f;
-    __local AtomData localData[THREAD_BLOCK_SIZE];
+    __local AtomData localData[LOCAL_MEMORY_SIZE];
    const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
    const unsigned int endTile = FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps;
@@ -127,4 +125,4 @@ __kernel void computeInteractionGroups(
 #endif
    }
    energyBuffer[get_global_id(0)] += energy;
 }
\ No newline at end of file
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -40,6 +40,7 @@
 #include "CudaForceInfo.h"
 #include "CudaKernelSources.h"
 #include "CudaNonbondedUtilities.h"
+#include "jama_svd.h"
 #include <algorithm>
 #include <cmath>
@@ -796,8 +797,9 @@ private:
 CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : 
        CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false),
        multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL),
-        field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL),
+        field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), inducedDipole(NULL),
-        inducedDipole(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
+        diisCoefficients(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), prevDipoles(NULL), prevDipolesPolar(NULL), prevDipolesGk(NULL),
+        prevDipolesGkPolar(NULL), prevErrors(NULL), diisMatrix(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
        pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL),
        pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeAtomGridIndex(NULL), lastPositions(NULL), sort(NULL), gkKernel(NULL) {
 }
@@ -832,6 +834,20 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
        delete inducedDipolePolar;
    if (inducedDipoleErrors != NULL)
        delete inducedDipoleErrors;
+    if (prevDipoles != NULL)
+        delete prevDipoles;
+    if (prevDipolesPolar != NULL)
+        delete prevDipolesPolar;
+    if (prevDipolesGk != NULL)
+        delete prevDipolesGk;
+    if (prevDipolesGkPolar != NULL)
+        delete prevDipolesGkPolar;
+    if (prevErrors != NULL)
+        delete prevErrors;
+    if (diisMatrix != NULL)
+        delete diisMatrix;
+    if (diisCoefficients != NULL)
+        delete diisCoefficients;
    if (polarizability != NULL)
        delete polarizability;
    if (covalentFlags != NULL)
@@ -959,6 +975,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    inducedDipole = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "inducedDipole");
    inducedDipolePolar = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "inducedDipolePolar");
    inducedDipoleErrors = new CudaArray(cu, cu.getNumThreadBlocks(), sizeof(float2), "inducedDipoleErrors");
+    prevDipoles = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipoles");
+    prevDipolesPolar = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipolesPolar");
+    prevErrors = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevErrors");
+    diisMatrix = new CudaArray(cu, MaxPrevDIISDipoles*MaxPrevDIISDipoles, elementSize, "diisMatrix");
+    // Label the array with its own name; it previously reused the "diisMatrix" label of the array above.
+    diisCoefficients = new CudaArray(cu, MaxPrevDIISDipoles+1, sizeof(float), "diisCoefficients");
    cu.addAutoclearBuffer(*field);
    cu.addAutoclearBuffer(*fieldPolar);
    cu.addAutoclearBuffer(*torque);
@@ -1088,6 +1109,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        defines["GK_FQ"] = cu.doubleToString(3*(1-solventDielectric)/(2+3*solventDielectric));
        fixedThreadMemory += 4*elementSize;
        inducedThreadMemory += 13*elementSize;
+        prevDipolesGk = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipolesGk");
+        prevDipolesGkPolar = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipolesGkPolar");
    }
    int maxThreads = cu.getNonbondedUtilities().getForceThreadBlockSize();
    fixedFieldThreads = min(maxThreads, cu.computeThreadBlockSize(fixedThreadMemory));
@@ -1102,9 +1125,12 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    computeFixedFieldKernel = cu.getKernel(module, "computeFixedField");
    if (maxInducedIterations > 0) {
        defines["THREAD_BLOCK_SIZE"] = cu.intToString(inducedFieldThreads);
+        defines["MAX_PREV_DIIS_DIPOLES"] = cu.intToString(MaxPrevDIISDipoles);
        module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleInducedField, defines);
        computeInducedFieldKernel = cu.getKernel(module, "computeInducedField");
-        updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldBySOR");
+        updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldByDIIS");
+        recordDIISDipolesKernel = cu.getKernel(module, "recordInducedDipolesForDIIS");
+        buildMatrixKernel = cu.getKernel(module, "computeDIISMatrix");
    }
    stringstream electrostaticsSource;
    if (usePME) {
@@ -1421,7 +1447,6 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        // Iterate until the dipoles converge.
-        vector<float2> errors;
        for (int i = 0; i < maxInducedIterations; i++) {
            cu.clearBuffer(*inducedField);
            cu.clearBuffer(*inducedFieldPolar);
@@ -1440,23 +1465,9 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
                    &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
                    &gkKernel->getBornRadii()->getDevicePointer(), &dampingAndThole->getDevicePointer()};
                cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
-                void* updateInducedGkFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
-                    &gkKernel->getField()->getDevicePointer(), &gkKernel->getInducedField()->getDevicePointer(),
-                    &gkKernel->getInducedFieldPolar()->getDevicePointer(), &gkKernel->getInducedDipoles()->getDevicePointer(),
-                    &gkKernel->getInducedDipolesPolar()->getDevicePointer(), &polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer()};
-                cu.executeKernel(updateInducedFieldKernel, updateInducedGkFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
-            }
-            void* updateInducedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &npt, &inducedField->getDevicePointer(),
-                &inducedFieldPolar->getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
-                &polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer()};
-            cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
-            inducedDipoleErrors->download(errors);
-            double total1 = 0.0, total2 = 0.0;
-            for (int j = 0; j < (int) errors.size(); j++) {
-                total1 += errors[j].x;
-                total2 += errors[j].y;
            }
-            if (48.033324*sqrt(max(total1, total2)/cu.getNumAtoms()) < inducedEpsilon)
+            double maxEpsilon = iterateDipolesByDIIS(i);
+            if (maxEpsilon < inducedEpsilon)
                break;
        }
@@ -1568,17 +1579,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
                &inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
            cu.executeKernel(pmeRecordInducedFieldDipolesKernel, pmeRecordInducedFieldDipolesArgs, cu.getNumAtoms());
-            void* updateInducedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &npt, &inducedField->getDevicePointer(),
+            double maxEpsilon = iterateDipolesByDIIS(i);
-                &inducedFieldPolar->getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
+            if (maxEpsilon < inducedEpsilon)
-                &polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer()};
-            cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
-            inducedDipoleErrors->download(errors);
-            double total1 = 0.0, total2 = 0.0;
-            for (int j = 0; j < (int) errors.size(); j++) {
-                total1 += errors[j].x;
-                total2 += errors[j].y;
-            }
-            if (48.033324*sqrt(max(total1, total2)/cu.getNumAtoms()) < inducedEpsilon)
                break;
        }
@@ -1612,6 +1614,88 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
    return 0.0;
 }
+double CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) { // Performs one DIIS update of the induced dipoles for the given 0-based iteration and returns the error measure that execute() compares against inducedEpsilon.
+    void* npt = NULL; // Null pointer passed in the third kernel-argument slot for the non-GK launch (the GK launch passes gkKernel->getField() there instead).
+    bool trueValue = true, falseValue = false; // Addressable copies of the flag passed as the kernel's second-to-last argument.
+    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); // Byte size used below when sizing dynamic shared memory.
+    // Record the dipole and errors into the lists of previous dipoles.
+    // When a GK kernel is present its dipoles are recorded first, with the flag argument false; the non-GK launch below passes true.
+    if (gkKernel != NULL) {
+        void* recordDIISDipolesGkArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &gkKernel->getField()->getDevicePointer(), &gkKernel->getInducedField()->getDevicePointer(),
+            &gkKernel->getInducedFieldPolar()->getDevicePointer(), &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(), 
+            &polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer(), &prevDipolesGk->getDevicePointer(),
+            &prevDipolesGkPolar->getDevicePointer(), &prevErrors->getDevicePointer(), &iteration, &falseValue, &diisMatrix->getDevicePointer()};
+        cu.executeKernel(recordDIISDipolesKernel, recordDIISDipolesGkArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2); // Last argument: two elements of shared memory per thread.
+    }
+    void* recordDIISDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &npt, &inducedField->getDevicePointer(),
+        &inducedFieldPolar->getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
+        &polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer(), &prevDipoles->getDevicePointer(),
+        &prevDipolesPolar->getDevicePointer(), &prevErrors->getDevicePointer(), &iteration, &trueValue, &diisMatrix->getDevicePointer()};
+    cu.executeKernel(recordDIISDipolesKernel, recordDIISDipolesArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
+    float2* errors = (float2*) cu.getPinnedBuffer(); // The error sums are read back from this buffer at the end of the function.
+    inducedDipoleErrors->download(errors, false); // NOTE(review): the 'false' argument presumably requests a non-blocking copy; confirm the copy is guaranteed complete before 'errors' is read below (on iteration 0 no blocking download intervenes).
+    // Determine the coefficients for selecting the new dipoles.
+    int numPrev = (iteration+1 < MaxPrevDIISDipoles ? iteration+1 : MaxPrevDIISDipoles); // Number of stored dipole sets, capped at MaxPrevDIISDipoles.
+    void* buildMatrixArgs[] = {&prevErrors->getDevicePointer(), &iteration, &diisMatrix->getDevicePointer()};
+    int threadBlocks = min(numPrev, cu.getNumThreadBlocks());
+    cu.executeKernel(buildMatrixKernel, buildMatrixArgs, threadBlocks*128, 128, 128*elementSize); // 128 threads per block, one shared-memory element per thread.
+    vector<float> coefficients(MaxPrevDIISDipoles); // Value-initialized to zero; only the first numPrev entries are assigned below.
+    if (iteration == 0)
+        coefficients[0] = 1; // Only one dipole set is stored so far, so it is used unchanged.
+    else {
+        vector<float> matrix;
+        diisMatrix->download(matrix); // NOTE(review): downloaded as float, but the kernels in multipoleInducedField.cu declare the matrix as real*; confirm this is valid when double precision is enabled.
+        int rank = numPrev+1;
+        Array2D<double> b(rank, rank); // Bordered system: b[0][0] = 0, first row and column = -1, remaining entries copied from the downloaded matrix.
+        b[0][0] = 0;
+        for (int i = 1; i < rank; i++)
+            b[i][0] = b[0][i] = -1;
+        for (int i = 0; i < numPrev; i++)
+            for (int j = 0; j < numPrev; j++)
+                b[i+1][j+1] = matrix[i*MaxPrevDIISDipoles+j]; // The downloaded matrix is indexed with a row stride of MaxPrevDIISDipoles.
+        // Solve using SVD.  Since the right hand side is (-1, 0, 0, 0, ...), this is simpler than the general case.
+        // Only row 0 of U contributes, so each solution entry is -sum_j u[0][j]*v[i][j]/s[j].
+        JAMA::SVD<double> svd(b);
+        Array2D<double> u, v;
+        svd.getU(u);
+        svd.getV(v);
+        Array1D<double> s;
+        svd.getSingularValues(s);
+        int effectiveRank = svd.rank(); // The sums below use only the first effectiveRank singular values.
+        for (int i = 1; i < rank; i++) { // Solution entry 0 is skipped; entries 1..numPrev become the coefficients.
+            double d = 0;
+            for (int j = 0; j < effectiveRank; j++)
+                d -= u[0][j]*v[i][j]/s[j];
+            coefficients[i-1] = d;
+        }
+    }
+    diisCoefficients->upload(&coefficients[0]);
+    // Compute the dipoles.
+    // The same coefficients are applied to the regular dipole history and, when present, the GK dipole history.
+    void* updateInducedFieldArgs[] = {&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
+        &prevDipoles->getDevicePointer(), &prevDipolesPolar->getDevicePointer(), &diisCoefficients->getDevicePointer(), &numPrev};
+    cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize);
+    if (gkKernel != NULL) {
+        void* updateInducedFieldGkArgs[] = {&gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
+            &prevDipolesGk->getDevicePointer(), &prevDipolesGkPolar->getDevicePointer(), &diisCoefficients->getDevicePointer(), &numPrev};
+        cu.executeKernel(updateInducedFieldKernel, updateInducedFieldGkArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize);
+    }
+    // Compute the overall error for monitoring convergence.
+    // The x and y components of the downloaded error entries are summed separately.
+    double total1 = 0.0, total2 = 0.0;
+    for (int j = 0; j < inducedDipoleErrors->getSize(); j++) {
+        total1 += errors[j].x;
+        total2 += errors[j].y;
+    }
+    return 48.033324*sqrt(max(total1, total2)/cu.getNumAtoms()); // Scaled square root of the larger total divided by the atom count. NOTE(review): 48.033324 is presumably a unit-conversion factor; confirm its meaning.
+}
 void CudaCalcAmoebaMultipoleForceKernel::ensureMultipolesValid(ContextImpl& context) {
    if (multipolesAreValid) {
        int numParticles = cu.getNumAtoms();

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -375,6 +375,7 @@ private:
        const char* getSortKey() const {return "value.y";}
    };
    void initializeScaleFactors();
+    double iterateDipolesByDIIS(int iteration);
    void ensureMultipolesValid(ContextImpl& context);
    template <class T, class T4, class M4> void computeSystemMultipoleMoments(ContextImpl& context, std::vector<double>& outputMultipoleMoments);
    int numMultipoles, maxInducedIterations;
@@ -399,6 +400,13 @@ private:
    CudaArray* inducedDipole;
    CudaArray* inducedDipolePolar;
    CudaArray* inducedDipoleErrors;
+    CudaArray* prevDipoles;
+    CudaArray* prevDipolesPolar;
+    CudaArray* prevDipolesGk;
+    CudaArray* prevDipolesGkPolar;
+    CudaArray* prevErrors;
+    CudaArray* diisMatrix;
+    CudaArray* diisCoefficients;
    CudaArray* polarizability;
    CudaArray* covalentFlags;
    CudaArray* polarizationGroupFlags;
@@ -419,8 +427,10 @@ private:
    CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel;
    CUfunction pmeGridIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
    CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
+    CUfunction recordDIISDipolesKernel, buildMatrixKernel;
    CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel;
    static const int PmeOrder = 5;
+    static const int MaxPrevDIISDipoles = 20;
 };
 /**

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
@@ -485,7 +485,7 @@ extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__
    buffer[threadIdx.x] = make_real2(sumErrors, sumPolarErrors);
    __syncthreads();
-    for (int offset = 1; offset < blockDim.x; offset *= 2) {   
+    for (int offset = 1; offset < blockDim.x; offset *= 2) {
        if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0) {
            buffer[threadIdx.x].x += buffer[threadIdx.x+offset].x;
            buffer[threadIdx.x].y += buffer[threadIdx.x+offset].y;
@@ -494,4 +494,115 @@ extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__
    }
    if (threadIdx.x == 0)
        errors[blockIdx.x] = make_float2((float) buffer[0].x, (float) buffer[0].y);
 }
\ No newline at end of file
+extern "C" __global__ void recordInducedDipolesForDIIS(const long long* __restrict__ fixedField, const long long* __restrict__ fixedFieldPolar, // Computes new dipoles from the current fields, stores them (and optionally their change) in the history buffers, and writes per-block sums of squared changes to 'errors'.
+        const long long* __restrict__ fixedFieldS, const long long* __restrict__ inducedField, const long long* __restrict__ inducedFieldPolar,
+        const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float* __restrict__ polarizability, float2* __restrict__ errors,
+        real* __restrict__ prevDipoles, real* __restrict__ prevDipolesPolar, real* __restrict__ prevErrors, int iteration, bool recordPrevErrors, real* __restrict__ matrix) {
+    extern __shared__ real2 buffer[]; // One real2 per thread, used for the block-wide error reduction below.
+#ifdef USE_EWALD
+    const real ewaldScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI;
+#else
+    const real ewaldScale = 0;
+#endif
+    const real fieldScale = 1/(real) 0x100000000; // 2^-32; applied to the summed 64-bit integer field values (presumably fixed-point accumulators; confirm).
+    real sumErrors = 0;
+    real sumPolarErrors = 0;
+    for (int atom = blockIdx.x*blockDim.x + threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) { // Each thread starts at its global index and steps by the total number of launched threads.
+        real scale = polarizability[atom];
+        for (int component = 0; component < 3; component++) {
+            int dipoleIndex = 3*atom+component; // Dipoles are indexed with three consecutive components per atom.
+            int fieldIndex = atom+component*PADDED_NUM_ATOMS; // Fields are indexed by component with a stride of PADDED_NUM_ATOMS.
+            if (iteration >= MAX_PREV_DIIS_DIPOLES) {
+                // We have filled up the buffer for previous dipoles, so shift them all over by one.
+                // Slot i is copied into slot i-1, which discards the contents of slot 0.
+                for (int i = 1; i < MAX_PREV_DIIS_DIPOLES; i++) {
+                    int index1 = dipoleIndex+(i-1)*NUM_ATOMS*3;
+                    int index2 = dipoleIndex+i*NUM_ATOMS*3;
+                    prevDipoles[index1] = prevDipoles[index2];
+                    prevDipolesPolar[index1] = prevDipolesPolar[index2];
+                    if (recordPrevErrors)
+                        prevErrors[index1] = prevErrors[index2];
+                }
+            }
+            // Compute the new dipole, and record it along with the error.
+            real oldDipole = inducedDipole[dipoleIndex];
+            real oldDipolePolar = inducedDipolePolar[dipoleIndex];
+            long long fixedS = (fixedFieldS == NULL ? (long long) 0 : fixedFieldS[fieldIndex]); // A NULL fixedFieldS contributes nothing.
+            real newDipole = scale*((fixedField[fieldIndex]+fixedS+inducedField[fieldIndex])*fieldScale+ewaldScale*oldDipole);
+            real newDipolePolar = scale*((fixedFieldPolar[fieldIndex]+fixedS+inducedFieldPolar[fieldIndex])*fieldScale+ewaldScale*oldDipolePolar);
+            int storePrevIndex = dipoleIndex+min(iteration, MAX_PREV_DIIS_DIPOLES-1)*NUM_ATOMS*3; // Slot 'iteration' while the history is filling, otherwise the last slot.
+            prevDipoles[storePrevIndex] = newDipole;
+            prevDipolesPolar[storePrevIndex] = newDipolePolar;
+            if (recordPrevErrors)
+                prevErrors[storePrevIndex] = newDipole-oldDipole; // Only the change in the non-polar dipole is stored in prevErrors.
+            sumErrors += (newDipole-oldDipole)*(newDipole-oldDipole);
+            sumPolarErrors += (newDipolePolar-oldDipolePolar)*(newDipolePolar-oldDipolePolar);
+        }
+    }
+    // Sum the errors over threads and store the total for this block.
+    // Pairwise reduction in shared memory; the bounds check keeps reads inside the block for any blockDim.x.
+    buffer[threadIdx.x] = make_real2(sumErrors, sumPolarErrors);
+    __syncthreads();
+    for (int offset = 1; offset < blockDim.x; offset *= 2) {
+        if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0) {
+            buffer[threadIdx.x].x += buffer[threadIdx.x+offset].x;
+            buffer[threadIdx.x].y += buffer[threadIdx.x+offset].y;
+        }
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+        errors[blockIdx.x] = make_float2((float) buffer[0].x, (float) buffer[0].y); // One float2 per block: (sum of squared changes, sum of squared polar changes).
+    if (iteration >= MAX_PREV_DIIS_DIPOLES && recordPrevErrors && blockIdx.x == 0) { // The condition does not depend on threadIdx, so every thread of block 0 reaches the __syncthreads() inside.
+        // Shift over the existing matrix elements.
+        // The value at offset (threadIdx.x+1)+(i+1)*MAX_PREV_DIIS_DIPOLES is copied to threadIdx.x+i*MAX_PREV_DIIS_DIPOLES, one i at a time with a barrier between steps.
+        for (int i = 0; i < MAX_PREV_DIIS_DIPOLES-1; i++) {
+            if (threadIdx.x < MAX_PREV_DIIS_DIPOLES-1)
+                matrix[threadIdx.x+i*MAX_PREV_DIIS_DIPOLES] = matrix[(threadIdx.x+1)+(i+1)*MAX_PREV_DIIS_DIPOLES];
+            __syncthreads();
+        }
+    }
+}
+extern "C" __global__ void computeDIISMatrix(real* __restrict__ prevErrors, int iteration, real* __restrict__ matrix) { // Fills the matrix entries that pair the error vector at index j (below) with each stored error vector 0..j.
+    extern __shared__ real sumBuffer[]; // One element per thread for the block-wide reduction.
+    int j = min(iteration, MAX_PREV_DIIS_DIPOLES-1); // Error-vector index for this iteration, clamped to the last slot.
+    for (int i = blockIdx.x; i <= j; i += gridDim.x) { // Each block handles indices i = blockIdx.x, blockIdx.x+gridDim.x, ... up to and including j.
+        // All the threads in this thread block work together to compute a single matrix element.
+        // The element is the dot product of error vectors i and j over all 3*NUM_ATOMS components.
+        real sum = 0;
+        for (int index = threadIdx.x; index < NUM_ATOMS*3; index += blockDim.x)
+            sum += prevErrors[index+i*NUM_ATOMS*3]*prevErrors[index+j*NUM_ATOMS*3];
+        sumBuffer[threadIdx.x] = sum;
+        __syncthreads();
+        for (int offset = 1; offset < blockDim.x; offset *= 2) { 
+            if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0)
+                sumBuffer[threadIdx.x] += sumBuffer[threadIdx.x+offset];
+            __syncthreads();
+        }
+        if (threadIdx.x == 0) {
+            matrix[i+MAX_PREV_DIIS_DIPOLES*j] = sumBuffer[0]; // The same value is written at both (i, j) and (j, i).
+            if (i != j)
+                matrix[j+MAX_PREV_DIIS_DIPOLES*i] = sumBuffer[0];
+        }
+    }
+}
+extern "C" __global__ void updateInducedFieldByDIIS(real* __restrict__ inducedDipole, real* __restrict__ inducedDipolePolar, // Overwrites both dipole arrays with a weighted sum of the first numPrev stored dipole sets.
+        const real* __restrict__ prevDipoles, const real* __restrict__ prevDipolesPolar, const float* __restrict__ coefficients, int numPrev) {
+    for (int index = blockIdx.x*blockDim.x + threadIdx.x; index < 3*NUM_ATOMS; index += blockDim.x*gridDim.x) { // One dipole component per step; each thread steps by the total number of launched threads.
+        real sum = 0;
+        real sumPolar = 0;
+        for (int i = 0; i < numPrev; i++) { // Stored set i starts at offset i*3*NUM_ATOMS; the same coefficient is applied to the regular and polar histories.
+            sum += coefficients[i]*prevDipoles[i*3*NUM_ATOMS+index];
+            sumPolar += coefficients[i]*prevDipolesPolar[i*3*NUM_ATOMS+index];
+        }
+        inducedDipole[index] = sum;
+        inducedDipolePolar[index] = sumPolar;
+    }
+}
--- a/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.cpp
+++ b/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.cpp
--- a/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.h
+++ b/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.h
--- a/plugins/drude/platforms/cuda/src/CudaDrudeKernels.cpp
+++ b/plugins/drude/platforms/cuda/src/CudaDrudeKernels.cpp
@@ -126,9 +126,9 @@ void CudaCalcDrudeForceKernel::initialize(const System& system, const DrudeForce
            double a1 = (atoms[i][2] == -1 ? 1 : aniso12);
            double a2 = (atoms[i][3] == -1 || atoms[i][4] == -1 ? 1 : aniso34);
            double a3 = 3-a1-a2;
-            double k3 = charge*charge/(polarizability*a3);
+            double k3 = ONE_4PI_EPS0*charge*charge/(polarizability*a3);
-            double k1 = charge*charge/(polarizability*a1) - k3;
+            double k1 = ONE_4PI_EPS0*charge*charge/(polarizability*a1) - k3;
-            double k2 = charge*charge/(polarizability*a2) - k3;
+            double k2 = ONE_4PI_EPS0*charge*charge/(polarizability*a2) - k3;
            if (atoms[i][2] == -1) {
                atoms[i][2] = 0;
                k1 = 0;
@@ -197,9 +197,9 @@ void CudaCalcDrudeForceKernel::copyParametersToContext(ContextImpl& context, con
            double a1 = (p2 == -1 ? 1 : aniso12);
            double a2 = (p3 == -1 || p4 == -1 ? 1 : aniso34);
            double a3 = 3-a1-a2;
-            double k3 = charge*charge/(polarizability*a3);
+            double k3 = ONE_4PI_EPS0*charge*charge/(polarizability*a3);
-            double k1 = charge*charge/(polarizability*a1) - k3;
+            double k1 = ONE_4PI_EPS0*charge*charge/(polarizability*a1) - k3;
-            double k2 = charge*charge/(polarizability*a2) - k3;
+            double k2 = ONE_4PI_EPS0*charge*charge/(polarizability*a2) - k3;
            if (p2 == -1)
                k1 = 0;
            if (p3 == -1 || p4 == -1)

--- a/plugins/drude/platforms/cuda/src/kernels/drudePairForce.cu
+++ b/plugins/drude/platforms/cuda/src/kernels/drudePairForce.cu
@@ -13,7 +13,7 @@ real u = drudeParams.x*r;
 real screening = 1-(1+0.5f*u)*EXP(-u);
 real pairEnergy = drudeParams.y*screening*rInv;
 energy += pairEnergy;
-real3 f = delta*(pairEnergy*rInv*rInv);
+real3 f = delta*(drudeParams.y*rInv*rInv*(screening*rInv-0.5f*(1+u)*EXP(-u)*drudeParams.x));
 force1 += f;
 force3 -= f;
@@ -26,7 +26,7 @@ u = drudeParams.x*r;
 screening = 1-(1+0.5f*u)*EXP(-u);
 pairEnergy = -drudeParams.y*screening*rInv;
 energy += pairEnergy;
-f = delta*(pairEnergy*rInv*rInv);
+f = delta*(-drudeParams.y*rInv*rInv*(screening*rInv-0.5f*(1+u)*EXP(-u)*drudeParams.x));
 force1 += f;
 force4 -= f;
@@ -39,7 +39,7 @@ u = drudeParams.x*r;
 screening = 1-(1+0.5f*u)*EXP(-u);
 pairEnergy = -drudeParams.y*screening*rInv;
 energy += pairEnergy;
-f = delta*(pairEnergy*rInv*rInv);
+f = delta*(-drudeParams.y*rInv*rInv*(screening*rInv-0.5f*(1+u)*EXP(-u)*drudeParams.x));
 force2 += f;
 force3 -= f;
@@ -52,6 +52,6 @@ u = drudeParams.x*r;
 screening = 1-(1+0.5f*u)*EXP(-u);
 pairEnergy = drudeParams.y*screening*rInv;
 energy += pairEnergy;
-f = delta*(pairEnergy*rInv*rInv);
+f = delta*(drudeParams.y*rInv*rInv*(screening*rInv-0.5f*(1+u)*EXP(-u)*drudeParams.x));
 force2 += f;
 force4 -= f;