Created HippoNonbondedForce (#2296)

* Created API for HIPPO force field * Beginning of reference implementation of HIPPO * Continuing reference implementation of HIPPO * Continuing reference implementation of HIPPO * Continuing reference implementation of HIPPO * Continuing reference implementation of HIPPO * Continuing reference implementation of HIPPO * Continuing reference implementation of HIPPO * Continuing reference implementation of HIPPO * Completed reference of HIPPO with no cutoff * Beginning cutoffs/PME for reference implementation of HIPPO * Continuing PME for reference implementation of HIPPO * Continuing PME for reference implementation of HIPPO * Continuing PME for reference implementation of HIPPO * Completed reference implementation of HIPPO * Cleanup and optimization to HIPPO reference * Further cleanup to HIPPO * Combined direct space interactions into a single loop * Compute direct space interactions in quasi-internal frame * Beginning of CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Finished CUDA implementation of HIPPO * More features and test cases for HippoNonbondedForce * Serialization and Python API for HippoNonbondedForce * Fixed sign error in computing forces

Created HippoNonbondedForce (#2296)
* Created API for HIPPO force field * Beginning of reference implementation of HIPPO * Continuing reference implementation of HIPPO * Continuing reference implementation of HIPPO * Continuing reference implementation of HIPPO * Continuing reference implementation of HIPPO * Continuing reference implementation of HIPPO * Continuing reference implementation of HIPPO * Continuing reference implementation of HIPPO * Completed reference of HIPPO with no cutoff * Beginning cutoffs/PME for reference implementation of HIPPO * Continuing PME for reference implementation of HIPPO * Continuing PME for reference implementation of HIPPO * Continuing PME for reference implementation of HIPPO * Completed reference implementation of HIPPO * Cleanup and optimization to HIPPO reference * Further cleanup to HIPPO * Combined direct space interactions into a single loop * Compute direct space interactions in quasi-internal frame * Beginning of CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Continuing CUDA implementation of HIPPO * Finished CUDA implementation of HIPPO * More features and test cases for HippoNonbondedForce * Serialization and Python API for HippoNonbondedForce * Fixed sign error in computing forces
1eec1e15 · peastman · GitHub · 0b22cac1 · 1eec1e15 · 1eec1e15
Unverified Commit 1eec1e15 authored Apr 09, 2019 by peastman Committed by GitHub Apr 09, 2019
20 changed files
--- a/openmmapi/include/openmm/Vec3.h
+++ b/openmmapi/include/openmm/Vec3.h
@@ -157,6 +157,10 @@ private:
    double data[3];
 };
+static Vec3 operator*(double lhs, Vec3 rhs) {
+    return Vec3(rhs[0]*lhs, rhs[1]*lhs, rhs[2]*lhs);
+}
 template <class CHAR, class TRAITS>
 std::basic_ostream<CHAR,TRAITS>& operator<<(std::basic_ostream<CHAR,TRAITS>& o, const Vec3& v) {
    o<<'['<<v[0]<<", "<<v[1]<<", "<<v[2]<<']';

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -276,9 +276,9 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
            double recipDispersionEnergy = 0.0;
            pme_exec_dpme(pmedata,atomCoordinates,dpmeforces,charges,periodicBoxVectors,&recipDispersionEnergy);
            for (int i = 0; i < numberOfAtoms; i++){
-                forces[i][0] -= 2.0*dpmeforces[i][0];
+                forces[i][0] += dpmeforces[i][0];
-                forces[i][1] -= 2.0*dpmeforces[i][1];
+                forces[i][1] += dpmeforces[i][1];
-                forces[i][2] -= 2.0*dpmeforces[i][2];
+                forces[i][2] += dpmeforces[i][2];
            }
            if (totalEnergy)
                *totalEnergy += recipDispersionEnergy;

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -694,7 +694,6 @@ private:
    CudaArray pmeDispersionBsplineModuliX;
    CudaArray pmeDispersionBsplineModuliY;
    CudaArray pmeDispersionBsplineModuliZ;
-    CudaArray pmeAtomRange;
    CudaArray pmeAtomGridIndex;
    CudaArray pmeEnergyBuffer;
    CudaSort* sort;

--- a/platforms/cuda/include/CudaNonbondedUtilities.h
+++ b/platforms/cuda/include/CudaNonbondedUtilities.h
@@ -277,6 +277,11 @@ public:
     * @param groups    the set of force groups
     */
    void createKernelsForGroups(int groups);
+    /**
+     * Set the source code for the main kernel.  This defaults to the content of nonbonded.cu.  It only needs to be
+     * changed in very unusual circumstances.
+     */
+    void setKernelSource(const std::string& source);
 private:
    class KernelSet;
    class BlockSortTrait;
@@ -311,6 +316,7 @@ private:
    double lastCutoff;
    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList;
    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
+    std::string kernelSource;
 };
 /**
@@ -343,9 +349,10 @@ public:
     * @param numComponents  the number of components in the parameter
     * @param size           the size of the parameter in bytes
     * @param memory         the memory containing the parameter values
+     * @param constant       whether the memory should be marked as constant
     */
-    ParameterInfo(const std::string& name, const std::string& componentType, int numComponents, int size, CUdeviceptr memory) :
+    ParameterInfo(const std::string& name, const std::string& componentType, int numComponents, int size, CUdeviceptr memory, bool constant=true) :
-            name(name), componentType(componentType), numComponents(numComponents), size(size), memory(memory) {
+            name(name), componentType(componentType), numComponents(numComponents), size(size), memory(memory), constant(constant) {
        if (numComponents == 1)
            type = componentType;
        else {
@@ -372,12 +379,16 @@ public:
    CUdeviceptr& getMemory() {
        return memory;
    }
+    bool isConstant() const {
+        return constant;
+    }
 private:
    std::string name;
    std::string componentType;
    std::string type;
    int size, numComponents;
    CUdeviceptr memory;
+    bool constant;
 };
 } // namespace OpenMM

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1782,6 +1782,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                    pmeDefines["GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
                    pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
                    pmeDefines["USE_LJPME"] = "1";
+                    pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
                    double invRCut6 = pow(force.getCutoffDistance(), -6);
                    double dalphaR = dispersionAlpha * force.getCutoffDistance();
                    double dar2 = dalphaR*dalphaR;
@@ -1819,7 +1820,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                    pmeDispersionBsplineModuliY.initialize(cu, dispersionGridSizeY, elementSize, "pmeDispersionBsplineModuliY");
                    pmeDispersionBsplineModuliZ.initialize(cu, dispersionGridSizeZ, elementSize, "pmeDispersionBsplineModuliZ");
                }
-                pmeAtomRange.initialize<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
                pmeAtomGridIndex.initialize<int2>(cu, numParticles, "pmeAtomGridIndex");
                int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
                pmeEnergyBuffer.initialize(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");

--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -73,6 +73,7 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
    CHECK_RESULT(cuMemHostAlloc((void**) &pinnedCountBuffer, 2*sizeof(int), CU_MEMHOSTALLOC_PORTABLE));
    numForceThreadBlocks = 4*multiprocessors;
    forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);
+    setKernelSource(CudaKernelSources::nonbonded);
 }
 CudaNonbondedUtilities::~CudaNonbondedUtilities() {
@@ -510,13 +511,17 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    replacements["ATOM_PARAMETER_DATA"] = localData.str();
    stringstream args;
    for (int i = 0; i < (int) params.size(); i++) {
-        args << ", const ";
+        args << ", ";
+        if (params[i].isConstant())
+            args << "const ";
        args << params[i].getType();
        args << "* __restrict__ global_";
        args << params[i].getName();
    }
    for (int i = 0; i < (int) arguments.size(); i++) {
-        args << ", const ";
+        args << ", ";
+        if (arguments[i].isConstant())
+            args << "const ";
        args << arguments[i].getType();
        args << "* __restrict__ ";
        args << arguments[i].getName();
@@ -710,7 +715,11 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
    if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
-    CUmodule program = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::nonbonded, replacements), defines);
+    CUmodule program = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(kernelSource, replacements), defines);
    CUfunction kernel = context.getKernel(program, "computeNonbonded");
    return kernel;
 }
+void CudaNonbondedUtilities::setKernelSource(const string& source) {
+    kernelSource = source;
+}
--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -22,7 +22,7 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int
 extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex
-#ifdef USE_LJPME
+#ifdef CHARGE_FROM_SIGEPS
        , const float2* __restrict__ sigmaEpsilon
 #else
        , const real* __restrict__ charges
@@ -50,7 +50,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        int atom = pmeAtomGridIndex[i].x;
        real4 pos = posq[atom];
-#ifdef USE_LJPME
+#ifdef CHARGE_FROM_SIGEPS
        const float2 sigEps = sigmaEpsilon[atom];
        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
 #else
@@ -275,7 +275,7 @@ extern "C" __global__
 void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex
-#ifdef USE_LJPME
+#ifdef CHARGE_FROM_SIGEPS
        , const float2* __restrict__ sigmaEpsilon
 #else
        , const real* __restrict__ charges
@@ -352,7 +352,7 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
                }
            }
        }
-#ifdef USE_LJPME
+#ifdef CHARGE_FROM_SIGEPS
        const float2 sigEps = sigmaEpsilon[atom];
        real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
 #else

--- a/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp
@@ -252,17 +252,12 @@ void ReferenceLJCoulombIxn::calculateEwaldIxn(int numberOfAtoms, vector<Vec3>& a
            // Dispersion reciprocal space terms
            pme_init(&pmedata,alphaDispersionEwald,numberOfAtoms,dispersionMeshDim,5,1);
-            std::vector<Vec3> dpmeforces;
+            std::vector<Vec3> dpmeforces(numberOfAtoms);
-            for (int i = 0; i < numberOfAtoms; i++){
+            for (int i = 0; i < numberOfAtoms; i++)
                charges[i] = 8.0*pow(atomParameters[i][SigIndex], 3.0) * atomParameters[i][EpsIndex];
-                dpmeforces.push_back(Vec3());
-            }
            pme_exec_dpme(pmedata,atomCoordinates,dpmeforces,charges,periodicBoxVectors,&recipDispersionEnergy);
-            for (int i = 0; i < numberOfAtoms; i++){
+            for (int i = 0; i < numberOfAtoms; i++)
-                forces[i][0] -= 2.0*dpmeforces[i][0];
+                forces[i] += dpmeforces[i];
-                forces[i][1] -= 2.0*dpmeforces[i][1];
-                forces[i][2] -= 2.0*dpmeforces[i][2];
-            }
            if (totalEnergy)
                *totalEnergy += recipDispersionEnergy;
            pme_destroy(pmedata);

--- a/platforms/reference/src/SimTKReference/ReferencePME.cpp
+++ b/platforms/reference/src/SimTKReference/ReferencePME.cpp
@@ -538,7 +538,7 @@ dpme_reciprocal_convolution(pme_t pme,
    ny = pme->ngrid[1];
    nz = pme->ngrid[2];
-    boxfactor = M_PI*sqrt(M_PI) / (6.0*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
+    boxfactor = -2*M_PI*sqrt(M_PI) / (6.0*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
    esum = 0;
@@ -610,7 +610,7 @@ dpme_reciprocal_convolution(pme_t pme,
        }
    }
    // Remember the C6 energy is attractive, hence the negative sign.
-    *energy = -esum;
+    *energy = 0.5*esum;
 }

--- a/plugins/amoeba/openmmapi/include/OpenMMAmoeba.h
+++ b/plugins/amoeba/openmmapi/include/OpenMMAmoeba.h
@@ -43,5 +43,6 @@
 #include "openmm/AmoebaGeneralizedKirkwoodForce.h"
 #include "openmm/AmoebaVdwForce.h"
 #include "openmm/AmoebaWcaDispersionForce.h"
+#include "openmm/HippoNonbondedForce.h"
 #endif /*AMOEBA_OPENMM_H_*/
--- a/plugins/amoeba/openmmapi/include/openmm/HippoNonbondedForce.h
+++ b/plugins/amoeba/openmmapi/include/openmm/HippoNonbondedForce.h
+#ifndef OPENMM_HIPPO_NONBONDED_FORCE_H_
+#define OPENMM_HIPPO_NONBONDED_FORCE_H_
+/* -------------------------------------------------------------------------- *
+ *                              OpenMMAmoeba                                  *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
+ * Authors: Peter Eastman, Mark Friedrichs                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "openmm/Force.h"
+#include "internal/windowsExportAmoeba.h"
+#include "openmm/Vec3.h"
+#include <map>
+#include <utility>
+#include <vector>
+namespace OpenMM {
+/**
+ * This class implements all nonbonded interactions in the HIPPO force field: electrostatics,
+ * induction, charge transfer, dispersion, and repulsion.  Although some of these are
+ * conceptually distinct, they share parameters in common and are most efficiently computed
+ * together.  For example, the same multipole definitions are used for both electrostatics
+ * and Pauli repulsion.  Therefore, all of them are computed by a single Force object.
+ *
+ * To use it, create a HippoNonbondedForce object, then call addParticle() once for each particle.  After
+ * an entry has been added, you can modify its force field parameters by calling setParticleParameters().
+ * This will have no effect on Contexts that already exist unless you call updateParametersInContext().
+ * 
+ * You also can specify "exceptions", particular pairs of particles whose interactions should be
+ * reduced or completely omitted.  Call addException() to define exceptions.
+ */
+class OPENMM_EXPORT_AMOEBA HippoNonbondedForce : public Force {
+public:
+    enum NonbondedMethod {
+        /**
+         * No cutoff is applied to nonbonded interactions.  The full set of N^2 interactions is computed exactly.
+         * This necessarily means that periodic boundary conditions cannot be used.  This is the default.
+         */
+        NoCutoff = 0,
+        /**
+         * Periodic boundary conditions are used, and Particle-Mesh Ewald (PME) summation is used to compute the interaction of each particle
+         * with all periodic copies of every other particle.
+         */
+        PME = 1
+    };
+    enum ParticleAxisTypes { ZThenX = 0, Bisector = 1, ZBisect = 2, ThreeFold = 3, ZOnly = 4, NoAxisType = 5, LastAxisTypeIndex = 6 };
+    /**
+     * Create a HippoNonbondedForce.
+     */
+    HippoNonbondedForce();
+    /**
+     * Get the number of particles in the potential function.
+     */
+    int getNumParticles() const {
+        return particles.size();
+    }
+    /**
+     * Get the number of exceptions.
+     */
+    int getNumExceptions() const {
+        return exceptions.size();
+    }
+    /**
+     * Get the method used for handling long-range nonbonded interactions.
+     */
+    NonbondedMethod getNonbondedMethod() const;
+    /**
+     * Set the method used for handling long-range nonbonded interactions.
+     */
+    void setNonbondedMethod(NonbondedMethod method);
+    /**
+     * Get the cutoff distance (in nm) being used for nonbonded interactions.  If the NonbondedMethod in use
+     * is NoCutoff, this value will have no effect.
+     *
+     * @return the cutoff distance, measured in nm
+     */
+    double getCutoffDistance() const;
+    /**
+     * Set the cutoff distance (in nm) being used for nonbonded interactions.  If the NonbondedMethod in use
+     * is NoCutoff, this value will have no effect.
+     *
+     * @param distance    the cutoff distance, measured in nm
+     */
+    void setCutoffDistance(double distance);
+    /**
+     * Get the distance at which the switching function begins to reduce the repulsion and charge transfer interactions.  This must be
+     * less than the cutoff distance.
+     */
+    double getSwitchingDistance() const;
+    /**
+     * Set the distance at which the switching function begins to reduce the repulsion and charge transfer interactions.  This must be
+     * less than the cutoff distance.
+     */
+    void setSwitchingDistance(double distance);
+    /**
+     * Get the coefficients for the mu_0, mu_1, mu_2, ..., mu_n terms in the extrapolation
+     * algorithm for induced dipoles.
+     */
+    const std::vector<double>& getExtrapolationCoefficients() const;
+    /**
+     * Set the coefficients for the mu_0, mu_1, mu_2, ..., mu_n terms in the extrapolation
+     * algorithm for induced dipoles.
+     *
+     * @param coefficients      a vector whose mth entry specifies the coefficient for mu_m.  The length of this
+     *                          vector determines how many iterations are performed.
+     */
+    void setExtrapolationCoefficients(const std::vector<double> &coefficients);
+    /**
+     * Get the parameters to use for PME calculations.  If alpha is 0 (the default), these parameters are
+     * ignored and instead their values are chosen based on the Ewald error tolerance.
+     *
+     * @param[out] alpha   the separation parameter
+     * @param[out] nx      the number of grid points along the X axis
+     * @param[out] ny      the number of grid points along the Y axis
+     * @param[out] nz      the number of grid points along the Z axis
+     */
+    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters to use for dispersion PME calculations.  If alpha is 0 (the default),
+     * these parameters are ignored and instead their values are chosen based on the Ewald error tolerance.
+     *
+     * @param[out] alpha   the separation parameter
+     * @param[out] nx      the number of dispersion grid points along the X axis
+     * @param[out] ny      the number of dispersion grid points along the Y axis
+     * @param[out] nz      the number of dispersion grid points along the Z axis
+     */
+    void getDPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Set the parameters to use for PME calculations.  If alpha is 0 (the default), these parameters are
+     * ignored and instead their values are chosen based on the Ewald error tolerance.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void setPMEParameters(double alpha, int nx, int ny, int nz);
+    /**
+     * Set the parameters to use for dispersion PME calculations.  If alpha is 0 (the default),
+     * these parameters are ignored and instead their values are chosen based on the Ewald error tolerance.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void setDPMEParameters(double alpha, int nx, int ny, int nz);
+    /**
+     * Get the parameters being used for PME in a particular Context.  Because some platforms have restrictions
+     * on the allowed grid sizes, the values that are actually used may be slightly different from those
+     * specified with setPmeGridDimensions(), or the standard values calculated based on the Ewald error tolerance.
+     * See the manual for details.
+     *
+     * @param context      the Context for which to get the parameters
+     * @param[out] alpha   the separation parameter
+     * @param[out] nx      the number of grid points along the X axis
+     * @param[out] ny      the number of grid points along the Y axis
+     * @param[out] nz      the number of grid points along the Z axis
+     */
+    void getPMEParametersInContext(const Context& context, double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters being used for dispersion PME in a particular Context.  Because some
+     * platforms have restrictions on the allowed grid sizes, the values that are actually used may be slightly different
+     * from those specified with setPMEParameters(), or the standard values calculated based on the Ewald error tolerance.
+     * See the manual for details.
+     *
+     * @param context      the Context for which to get the parameters
+     * @param[out] alpha   the separation parameter
+     * @param[out] nx      the number of grid points along the X axis
+     * @param[out] ny      the number of grid points along the Y axis
+     * @param[out] nz      the number of grid points along the Z axis
+     */
+    void getDPMEParametersInContext(const Context& context, double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Add the nonbonded force parameters for a particle.  This should be called once for each particle
+     * in the System.  When it is called for the i'th time, it specifies the parameters for the i'th particle.
+     *
+     * @param charge            the particle's charge
+     * @param dipole            the particle's molecular dipole (vector of size 3)
+     * @param quadrupole        the particle's molecular quadrupole (vector of size 9)
+     * @param coreCharge        the charge of the atomic core
+     * @param alpha             controls the width of the particle's electron density
+     * @param epsilon           sets the magnitude of charge transfer
+     * @param damping           sets the length scale for charge transfer
+     * @param c6                the coefficient of the dispersion interaction
+     * @param pauliK            the coefficient of the Pauli repulsion interaction
+     * @param pauliQ            the charge used in computing the Pauli repulsion interaction
+     * @param pauliAlpha        the width of the particle's electron density for computing the Pauli repulsion interaction
+     * @param polarizability    atomic polarizability
+     * @param axisType          the particle's axis type
+     * @param multipoleAtomZ    index of first atom used in defining the local coordinate system for multipoles
+     * @param multipoleAtomX    index of second atom used in defining the local coordinate system for multipoles
+     * @param multipoleAtomY    index of third atom used in defining the local coordinate system for multipoles
+     * 
+     * @return the index of the particle that was added
+     */
+    int addParticle(double charge, const std::vector<double>& dipole, const std::vector<double>& quadrupole, double coreCharge,
+                    double alpha, double epsilon, double damping, double c6, double pauliK, double pauliQ, double pauliAlpha,
+                    double polarizability, int axisType, int multipoleAtomZ, int multipoleAtomX, int multipoleAtomY);
+    /**
+     * Get the nonbonded force parameters for a particle.
+     *
+     * @param index             the index of the particle for which to get parameters
+     * @param charge            the particle's charge
+     * @param dipole            the particle's molecular dipole (vector of size 3)
+     * @param quadrupole        the particle's molecular quadrupole (vector of size 9)
+     * @param coreCharge        the charge of the atomic core
+     * @param alpha             controls the width of the particle's electron density
+     * @param epsilon           sets the magnitude of charge transfer
+     * @param damping           sets the length scale for charge transfer
+     * @param c6                the coefficient of the dispersion interaction
+     * @param pauliK            the coefficient of the Pauli repulsion interaction
+     * @param pauliQ            the charge used in computing the Pauli repulsion interaction
+     * @param pauliAlpha        the width of the particle's electron density for computing the Pauli repulsion interaction
+     * @param polarizability    atomic polarizability
+     * @param axisType          the particle's axis type
+     * @param multipoleAtomZ    index of first atom used in defining the local coordinate system for multipoles
+     * @param multipoleAtomX    index of second atom used in defining the local coordinate system for multipoles
+     * @param multipoleAtomY    index of third atom used in defining the local coordinate system for multipoles
+     */
+    void getParticleParameters(int index, double& charge, std::vector<double>& dipole, std::vector<double>& quadrupole, double& coreCharge,
+                               double& alpha, double& epsilon, double& damping, double& c6, double& pauliK, double& pauliQ, double& pauliAlpha,
+                               double& polarizability, int& axisType, int& multipoleAtomZ, int& multipoleAtomX, int& multipoleAtomY) const;
+    /**
+     * Set the nonbonded force parameters for a particle.
+     *
+     * @param index             the index of the particle for which to set parameters
+     * @param charge            the particle's charge
+     * @param dipole            the particle's molecular dipole (vector of size 3)
+     * @param quadrupole        the particle's molecular quadrupole (vector of size 9)
+     * @param coreCharge        the charge of the atomic core
+     * @param alpha             controls the width of the particle's electron density
+     * @param epsilon           sets the magnitude of charge transfer
+     * @param damping           sets the length scale for charge transfer
+     * @param c6                the coefficient of the dispersion interaction
+     * @param pauliK            the coefficient of the Pauli repulsion interaction
+     * @param pauliQ            the charge used in computing the Pauli repulsion interaction
+     * @param pauliAlpha        the width of the particle's electron density for computing the Pauli repulsion interaction
+     * @param polarizability    atomic polarizability
+     * @param axisType          the particle's axis type
+     * @param multipoleAtomZ    index of first atom used in defining the local coordinate system for multipoles
+     * @param multipoleAtomX    index of second atom used in defining the local coordinate system for multipoles
+     * @param multipoleAtomY    index of third atom used in defining the local coordinate system for multipoles
+     */
+    void setParticleParameters(int index, double charge, const std::vector<double>& dipole, const std::vector<double>& quadrupole, double coreCharge,
+                               double alpha, double epsilon, double damping, double c6, double pauliK, double pauliQ, double pauliAlpha,
+                               double polarizability, int axisType, int multipoleAtomZ, int multipoleAtomX, int multipoleAtomY);
+    /**
+     * Add an interaction to the list of exceptions that should be calculated differently from other interactions.
+     * If all scale factors are set to 0, this will cause the interaction to be completely omitted from
+     * force and energy calculations.
+     *
+     * @param particle1                  the index of the first particle involved in the interaction
+     * @param particle2                  the index of the second particle involved in the interaction
+     * @param multipoleMultipoleScale    the factor by which to scale the Coulomb interaction between fixed multipoles
+     * @param dipoleMultipoleScale       the factor by which to scale the Coulomb interaction between an induced dipole and a fixed multipole
+     * @param dipoleDipoleScale          the factor by which to scale the Coulomb interaction between induced dipoles
+     * @param dispersionScale            the factor by which to scale the dispersion interaction
+     * @param repulsionScale             the factor by which to scale the Pauli repulsion
+     * @param replace                    determines the behavior if there is already an exception for the same two particles.  If true, the existing one is replaced.
+     *                                   If false, an exception is thrown.
+     * @return the index of the exception that was added
+     */
+    int addException(int particle1, int particle2, double multipoleMultipoleScale, double dipoleMultipoleScale, double dipoleDipoleScale,
+                     double dispersionScale, double repulsionScale, bool replace = false);
+    /**
+     * Get the scale factors for an interaction that should be calculated differently from others.
+     *
+     * @param index                      the index of the interaction for which to get parameters
+     * @param particle1                  the index of the first particle involved in the interaction
+     * @param particle2                  the index of the second particle involved in the interaction
+     * @param multipoleMultipoleScale    the factor by which to scale the Coulomb interaction between fixed multipoles
+     * @param dipoleMultipoleScale       the factor by which to scale the Coulomb interaction between an induced dipole and a fixed multipole
+     * @param dipoleDipoleScale          the factor by which to scale the Coulomb interaction between induced dipoles
+     * @param dispersionScale            the factor by which to scale the dispersion interaction
+     * @param repulsionScale             the factor by which to scale the Pauli repulsion
+     */
+    void getExceptionParameters(int index, int& particle1, int& particle2, double& multipoleMultipoleScale, double& dipoleMultipoleScale, double& dipoleDipoleScale,
+                                double& dispersionScale, double& repulsionScale) const;
+    /**
+     * Set the scale factors for an interaction that should be calculated differently from others.
+     * If all scale factors are set to 0, this will cause the interaction to be completely omitted from
+     * force and energy calculations.
+     *
+     * @param index                      the index of the interaction for which to set parameters
+     * @param particle1                  the index of the first particle involved in the interaction
+     * @param particle2                  the index of the second particle involved in the interaction
+     * @param multipoleMultipoleScale    the factor by which to scale the Coulomb interaction between fixed multipoles
+     * @param dipoleMultipoleScale       the factor by which to scale the Coulomb interaction between an induced dipole and a fixed multipole
+     * @param dipoleDipoleScale          the factor by which to scale the Coulomb interaction between induced dipoles
+     * @param dispersionScale            the factor by which to scale the dispersion interaction
+     * @param repulsionScale             the factor by which to scale the Pauli repulsion
+     */
+    void setExceptionParameters(int index, int particle1, int particle2, double multipoleMultipoleScale, double dipoleMultipoleScale, double dipoleDipoleScale,
+                                double dispersionScale, double repulsionScale);
+    /**
+     * Get the error tolerance for Ewald summation.  This corresponds to the fractional error in the forces
+     * which is acceptable.  This value is used to select the grid dimensions and separation (alpha)
+     * parameter so that the average error level will be less than the tolerance.  There is not a
+     * rigorous guarantee that all forces on all atoms will be less than the tolerance, however.
+     *
+     * This can be overridden by explicitly setting an alpha parameter and grid dimensions to use.
+     */
+    double getEwaldErrorTolerance() const;
+    /**
+     * Get the error tolerance for Ewald summation.  This corresponds to the fractional error in the forces
+     * which is acceptable.  This value is used to select the grid dimensions and separation (alpha)
+     * parameter so that the average error level will be less than the tolerance.  There is not a
+     * rigorous guarantee that all forces on all atoms will be less than the tolerance, however.
+     *
+     * This can be overridden by explicitly setting an alpha parameter and grid dimensions to use.
+     */
+    void setEwaldErrorTolerance(double tol);
+    /**
+     * Get the fixed dipole moments of all particles in the global reference frame.
+     *
+     * @param context         the Context for which to get the fixed dipoles
+     * @param[out] dipoles    the fixed dipole moment of particle i is stored into the i'th element
+     */
+    void getLabFramePermanentDipoles(Context& context, std::vector<Vec3>& dipoles);
+    /**
+     * Get the induced dipole moments of all particles.
+     *
+     * @param context         the Context for which to get the induced dipoles
+     * @param[out] dipoles    the induced dipole moment of particle i is stored into the i'th element
+     */
+    void getInducedDipoles(Context& context, std::vector<Vec3>& dipoles);
+    /**
+     * Update the particle and exception parameters in a Context to match those stored in this Force object.  This method
+     * provides an efficient method to update certain parameters in an existing Context without needing to reinitialize it.
+     * Simply call setParticleParameters() to modify this object's parameters, then call updateParametersInContext() to
+     * copy them over to the Context.
+     *
+     * This method has several limitations.  The only information it updates is the parameters of particles and exceptions.
+     * All other aspects of the Force (the nonbonded method, the cutoff distance, etc.) are unaffected and can only be
+     * changed by reinitializing the Context.  Furthermore, only the scale factors for an exception  can be changed; the
+     * pair of particles involved in the exception cannot change.  Finally, this method cannot be used to add new
+     * particles or exceptions, only to change the parameters of existing ones.
+     */
+    void updateParametersInContext(Context& context);
+    /**
+     * Returns whether or not this force makes use of periodic boundary
+     * conditions.
+     *
+     * @returns true if nonbondedMethod uses PBC and false otherwise
+     */
+    bool usesPeriodicBoundaryConditions() const {
+        return nonbondedMethod == HippoNonbondedForce::PME;
+    }
+protected:
+    ForceImpl* createImpl() const;
+private:
+    class ParticleInfo;
+    class ExceptionInfo;
+    NonbondedMethod nonbondedMethod;
+    double cutoffDistance, switchingDistance;
+    double ewaldErrorTol;
+    double alpha, dalpha;
+    int nx, ny, nz, dnx, dny, dnz;
+    std::vector<ParticleInfo> particles;
+    std::vector<ExceptionInfo> exceptions;
+    std::map<std::pair<int, int>, int> exceptionMap;
+    std::vector<double> extrapolationCoefficients;
+};
+/**
+ * This is an internal class used to record information about a particle.
+ * @private
+ */
+class HippoNonbondedForce::ParticleInfo {
+public:
+    int axisType, multipoleAtomZ, multipoleAtomX, multipoleAtomY;
+    double charge, coreCharge, alpha, epsilon, damping, c6, pauliK, pauliQ, pauliAlpha, polarizability;
+    std::vector<double> dipole, quadrupole;
+    ParticleInfo() {
+        axisType = multipoleAtomZ = multipoleAtomX = multipoleAtomY = -1;
+        charge = coreCharge = alpha = epsilon = damping = c6 = pauliK = pauliQ = pauliAlpha = polarizability = 0.0;
+        dipole.resize(3);
+        quadrupole.resize(9);
+    }
+    ParticleInfo(double charge, const std::vector<double>& dipole, const std::vector<double>& quadrupole, double coreCharge,
+                 double alpha, double epsilon, double damping, double c6, double pauliK, double pauliQ, double pauliAlpha,
+                 double polarizability, int axisType, int multipoleAtomZ, int multipoleAtomX, int multipoleAtomY) :
+        charge(charge), dipole(dipole), quadrupole(quadrupole), coreCharge(coreCharge), alpha(alpha), epsilon(epsilon),
+        damping(damping), c6(c6), pauliK(pauliK), pauliQ(pauliQ), pauliAlpha(pauliAlpha), polarizability(polarizability),
+        axisType(axisType), multipoleAtomZ(multipoleAtomZ), multipoleAtomX(multipoleAtomX), multipoleAtomY(multipoleAtomY) {
+    }
+};
+/**
+ * This is an internal class used to record information about an exception.
+ * @private
+ */
+class HippoNonbondedForce::ExceptionInfo {
+public:
+    int particle1, particle2;
+    double multipoleMultipoleScale, dipoleMultipoleScale, dipoleDipoleScale, dispersionScale, repulsionScale;
+    ExceptionInfo() {
+        particle1 = particle2 = -1;
+        multipoleMultipoleScale = dipoleMultipoleScale = dipoleDipoleScale = dispersionScale = repulsionScale = 0.0;
+    }
+    ExceptionInfo(int particle1, int particle2, double multipoleMultipoleScale, double dipoleMultipoleScale, double dipoleDipoleScale, double dispersionScale, double repulsionScale) :
+        particle1(particle1), particle2(particle2), multipoleMultipoleScale(multipoleMultipoleScale), dipoleMultipoleScale(dipoleMultipoleScale),
+        dipoleDipoleScale(dipoleDipoleScale), dispersionScale(dispersionScale), repulsionScale(repulsionScale) {
+    }
+};
+} // namespace OpenMM
+#endif /*OPENMM_HIPPO_NONBONDED_FORCE_H_*/
--- a/plugins/amoeba/openmmapi/include/openmm/amoebaKernels.h
+++ b/plugins/amoeba/openmmapi/include/openmm/amoebaKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2018 Stanford University and the Authors.      *
 * Authors:                                                                   *
 * Contributors:                                                              *
 *                                                                            *
@@ -495,6 +495,61 @@ public:
    virtual void copyParametersToContext(ContextImpl& context, const AmoebaWcaDispersionForce& force) = 0;
 };
+/**
+ * This kernel is invoked by HippoNonbondedForce to calculate the forces acting on the system and the energy of the system.
+ */
+class CalcHippoNonbondedForceKernel : public KernelImpl {
+public:
+    static std::string Name() {
+        return "CalcHippoNonbondedForce";
+    }
+    CalcHippoNonbondedForceKernel(std::string name, const Platform& platform) : KernelImpl(name, platform) {
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the MultipoleForce this kernel will be used for
+     */
+    virtual void initialize(const System& system, const HippoNonbondedForce& force) = 0;
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    virtual double execute(ContextImpl& context, bool includeForces, bool includeEnergy) = 0;
+    virtual void getLabFramePermanentDipoles(ContextImpl& context, std::vector<Vec3>& dipoles) = 0;
+    virtual void getInducedDipoles(ContextImpl& context, std::vector<Vec3>& dipoles) = 0;
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the AmoebaMultipoleForce to copy the parameters from
+     */
+    virtual void copyParametersToContext(ContextImpl& context, const HippoNonbondedForce& force) = 0;
+    /**
+     * Get the parameters being used for PME.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    virtual void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const = 0;
+    /**
+     * Get the parameters being used for dispersion PME.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    virtual void getDPMEParameters(double& alpha, int& nx, int& ny, int& nz) const = 0;
+};
 } // namespace OpenMM
 #endif /*AMOEBA_OPENMM_KERNELS_H*/
--- a/plugins/amoeba/openmmapi/include/openmm/internal/HippoNonbondedForceImpl.h
+++ b/plugins/amoeba/openmmapi/include/openmm/internal/HippoNonbondedForceImpl.h
+#ifndef OPENMM_HIPPO_NONBONDED_FORCE_IMPL_H_
+#define OPENMM_HIPPO_NONBONDED_FORCE_IMPL_H_
+/* -------------------------------------------------------------------------- *
+ *                                OpenMMAmoeba                                *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2018 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "openmm/internal/ForceImpl.h"
+#include "openmm/HippoNonbondedForce.h"
+#include "openmm/Kernel.h"
+#include "openmm/Vec3.h"
+#include <string>
+namespace OpenMM {
+/**
+ * This is the internal implementation of HippoNonbondedForce.
+ */
+class OPENMM_EXPORT_AMOEBA HippoNonbondedForceImpl : public ForceImpl {
+public:
+    HippoNonbondedForceImpl(const HippoNonbondedForce& owner);
+    ~HippoNonbondedForceImpl();
+    void initialize(ContextImpl& context);
+    const HippoNonbondedForce& getOwner() const {
+        return owner;
+    }
+    void updateContextState(ContextImpl& context, bool& forcesInvalid) {
+        // This force field doesn't update the state directly.
+    }
+    double calcForcesAndEnergy(ContextImpl& context, bool includeForces, bool includeEnergy, int groups);
+    std::map<std::string, double> getDefaultParameters() {
+        return std::map<std::string, double>(); // This force doesn't define any parameters.
+    }
+    std::vector<std::string> getKernelNames();
+    void getLabFramePermanentDipoles(ContextImpl& context, std::vector<Vec3>& dipoles);
+    void getInducedDipoles(ContextImpl& context, std::vector<Vec3>& dipoles);
+    void updateParametersInContext(ContextImpl& context);
+    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    void getDPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+private:
+    const HippoNonbondedForce& owner;
+    Kernel kernel;
+};
+} // namespace OpenMM
+#endif /*OPENMM_HIPPO_NONBONDED_FORCE_IMPL_H_*/
--- a/plugins/amoeba/openmmapi/src/HippoNonbondedForce.cpp
+++ b/plugins/amoeba/openmmapi/src/HippoNonbondedForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                OpenMMAmoeba                                *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2018 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "openmm/Force.h"
+#include "openmm/OpenMMException.h"
+#include "openmm/HippoNonbondedForce.h"
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/internal/HippoNonbondedForceImpl.h"
+#include <sstream>
+using namespace OpenMM;
+using namespace std;
+HippoNonbondedForce::HippoNonbondedForce() : nonbondedMethod(NoCutoff), cutoffDistance(1.0), switchingDistance(0.9),
+        ewaldErrorTol(1e-4), alpha(0.0), dalpha(0.0), nx(0), ny(0), nz(0), dnx(0), dny(0), dnz(0) {
+    extrapolationCoefficients = {0.042, 0.635, 0.414};
+}
+HippoNonbondedForce::NonbondedMethod HippoNonbondedForce::getNonbondedMethod() const {
+    return nonbondedMethod;
+}
+void HippoNonbondedForce::setNonbondedMethod(HippoNonbondedForce::NonbondedMethod method) {
+    if (method < 0 || method > 1)
+        throw OpenMMException("HippoNonbondedForce: Illegal value for nonbonded method");
+    nonbondedMethod = method;
+}
+double HippoNonbondedForce::getCutoffDistance() const {
+    return cutoffDistance;
+}
+void HippoNonbondedForce::setCutoffDistance(double distance) {
+    cutoffDistance = distance;
+}
+double HippoNonbondedForce::getSwitchingDistance() const {
+    return switchingDistance;
+}
+void HippoNonbondedForce::setSwitchingDistance(double distance) {
+    switchingDistance = distance;
+}
+const std::vector<double> & HippoNonbondedForce::getExtrapolationCoefficients() const {
+    return extrapolationCoefficients;
+}
+void HippoNonbondedForce::setExtrapolationCoefficients(const std::vector<double> &coefficients) {
+    extrapolationCoefficients = coefficients;
+}
+void HippoNonbondedForce::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    alpha = this->alpha;
+    nx = this->nx;
+    ny = this->ny;
+    nz = this->nz;
+}
+void HippoNonbondedForce::getDPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    alpha = this->dalpha;
+    nx = this->dnx;
+    ny = this->dny;
+    nz = this->dnz;
+}
+void HippoNonbondedForce::setPMEParameters(double alpha, int nx, int ny, int nz) {
+    this->alpha = alpha;
+    this->nx = nx;
+    this->ny = ny;
+    this->nz = nz;
+}
+void HippoNonbondedForce::setDPMEParameters(double alpha, int nx, int ny, int nz) {
+    this->dalpha = alpha;
+    this->dnx = nx;
+    this->dny = ny;
+    this->dnz = nz;
+}
+void HippoNonbondedForce::getPMEParametersInContext(const Context& context, double& alpha, int& nx, int& ny, int& nz) const {
+    dynamic_cast<const HippoNonbondedForceImpl&>(getImplInContext(context)).getPMEParameters(alpha, nx, ny, nz);
+}
+void HippoNonbondedForce::getDPMEParametersInContext(const Context& context, double& alpha, int& nx, int& ny, int& nz) const {
+    dynamic_cast<const HippoNonbondedForceImpl&>(getImplInContext(context)).getDPMEParameters(alpha, nx, ny, nz);
+}
+double HippoNonbondedForce::getEwaldErrorTolerance() const {
+    return ewaldErrorTol;
+}
+void HippoNonbondedForce::setEwaldErrorTolerance(double tol) {
+    ewaldErrorTol = tol;
+}
+int HippoNonbondedForce::addParticle(double charge, const std::vector<double>& dipole, const std::vector<double>& quadrupole, double coreCharge,
+                                     double alpha, double epsilon, double damping, double c6, double pauliK, double pauliQ, double pauliAlpha,
+                                     double polarizability, int axisType, int multipoleAtomZ, int multipoleAtomX, int multipoleAtomY) {
+    particles.push_back(ParticleInfo(charge, dipole, quadrupole, coreCharge, alpha, epsilon, damping, c6, pauliK, pauliQ, pauliAlpha,
+                                     polarizability, axisType, multipoleAtomZ, multipoleAtomX, multipoleAtomY));
+    return particles.size()-1;
+}
+void HippoNonbondedForce::getParticleParameters(int index, double& charge, std::vector<double>& dipole, std::vector<double>& quadrupole, double& coreCharge,
+                                                double& alpha, double& epsilon, double& damping, double& c6, double& pauliK, double& pauliQ, double& pauliAlpha,
+                                                double& polarizability, int& axisType, int& multipoleAtomZ, int& multipoleAtomX, int& multipoleAtomY) const {
+    charge = particles[index].charge;
+    dipole = particles[index].dipole;
+    quadrupole = particles[index].quadrupole;
+    coreCharge = particles[index].coreCharge;
+    alpha = particles[index].alpha;
+    epsilon = particles[index].epsilon;
+    damping = particles[index].damping;
+    c6 = particles[index].c6;
+    pauliK = particles[index].pauliK;
+    pauliQ = particles[index].pauliQ;
+    pauliAlpha = particles[index].pauliAlpha;
+    polarizability = particles[index].polarizability;
+    axisType = particles[index].axisType;
+    multipoleAtomZ = particles[index].multipoleAtomZ;
+    multipoleAtomX = particles[index].multipoleAtomX;
+    multipoleAtomY = particles[index].multipoleAtomY;
+}
+void HippoNonbondedForce::setParticleParameters(int index, double charge, const std::vector<double>& dipole, const std::vector<double>& quadrupole, double coreCharge,
+                                                double alpha, double epsilon, double damping, double c6, double pauliK, double pauliQ, double pauliAlpha,
+                                                double polarizability, int axisType, int multipoleAtomZ, int multipoleAtomX, int multipoleAtomY) {
+    particles[index].charge = charge;
+    particles[index].dipole = dipole;
+    particles[index].quadrupole = quadrupole;
+    particles[index].coreCharge = coreCharge;
+    particles[index].alpha = alpha;
+    particles[index].epsilon = epsilon;
+    particles[index].damping = damping;
+    particles[index].c6 = c6;
+    particles[index].pauliK = pauliK;
+    particles[index].pauliQ = pauliQ;
+    particles[index].pauliAlpha = pauliAlpha;
+    particles[index].polarizability = polarizability;
+    particles[index].axisType = axisType;
+    particles[index].multipoleAtomZ = multipoleAtomZ;
+    particles[index].multipoleAtomX = multipoleAtomX;
+    particles[index].multipoleAtomY = multipoleAtomY;
+}
+int HippoNonbondedForce::addException(int particle1, int particle2, double multipoleMultipoleScale, double dipoleMultipoleScale, double dipoleDipoleScale,
+        double dispersionScale, double repulsionScale, bool replace) {
+    map<pair<int, int>, int>::iterator iter = exceptionMap.find(pair<int, int>(particle1, particle2));
+    int newIndex;
+    if (iter == exceptionMap.end())
+        iter = exceptionMap.find(pair<int, int>(particle2, particle1));
+    if (iter != exceptionMap.end()) {
+        if (!replace) {
+            stringstream msg;
+            msg << "HippoNonbondedForce: There is already an exception for particles ";
+            msg << particle1;
+            msg << " and ";
+            msg << particle2;
+            throw OpenMMException(msg.str());
+        }
+        exceptions[iter->second] = ExceptionInfo(particle1, particle2, multipoleMultipoleScale, dipoleMultipoleScale, dipoleDipoleScale, dispersionScale, repulsionScale);
+        newIndex = iter->second;
+        exceptionMap.erase(iter->first);
+    }
+    else {
+        exceptions.push_back(ExceptionInfo(particle1, particle2, multipoleMultipoleScale, dipoleMultipoleScale, dipoleDipoleScale, dispersionScale, repulsionScale));
+        newIndex = exceptions.size()-1;
+    }
+    exceptionMap[pair<int, int>(particle1, particle2)] = newIndex;
+    return newIndex;
+}
+void HippoNonbondedForce::getExceptionParameters(int index, int& particle1, int& particle2, double& multipoleMultipoleScale, double& dipoleMultipoleScale, double& dipoleDipoleScale,
+        double& dispersionScale, double& repulsionScale) const {
+    ASSERT_VALID_INDEX(index, exceptions);
+    particle1 = exceptions[index].particle1;
+    particle2 = exceptions[index].particle2;
+    multipoleMultipoleScale = exceptions[index].multipoleMultipoleScale;
+    dipoleMultipoleScale = exceptions[index].dipoleMultipoleScale;
+    dipoleDipoleScale = exceptions[index].dipoleDipoleScale;
+    dispersionScale = exceptions[index].dispersionScale;
+    repulsionScale = exceptions[index].repulsionScale;
+}
+void HippoNonbondedForce::setExceptionParameters(int index, int particle1, int particle2, double multipoleMultipoleScale, double dipoleMultipoleScale, double dipoleDipoleScale,
+        double dispersionScale, double repulsionScale) {
+    ASSERT_VALID_INDEX(index, exceptions);
+    exceptions[index].particle1 = particle1;
+    exceptions[index].particle2 = particle2;
+    exceptions[index].multipoleMultipoleScale = multipoleMultipoleScale;
+    exceptions[index].dipoleMultipoleScale = dipoleMultipoleScale;
+    exceptions[index].dipoleDipoleScale = dipoleDipoleScale;
+    exceptions[index].dispersionScale = dispersionScale;
+    exceptions[index].repulsionScale = repulsionScale;
+}
+void HippoNonbondedForce::getInducedDipoles(Context& context, vector<Vec3>& dipoles) {
+    dynamic_cast<HippoNonbondedForceImpl&>(getImplInContext(context)).getInducedDipoles(getContextImpl(context), dipoles);
+}
+void HippoNonbondedForce::getLabFramePermanentDipoles(Context& context, vector<Vec3>& dipoles) {
+    dynamic_cast<HippoNonbondedForceImpl&>(getImplInContext(context)).getLabFramePermanentDipoles(getContextImpl(context), dipoles);
+}
+ForceImpl* HippoNonbondedForce::createImpl()  const {
+    return new HippoNonbondedForceImpl(*this);
+}
+void HippoNonbondedForce::updateParametersInContext(Context& context) {
+    dynamic_cast<HippoNonbondedForceImpl&>(getImplInContext(context)).updateParametersInContext(getContextImpl(context));
+}
--- a/plugins/amoeba/openmmapi/src/HippoNonbondedForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/HippoNonbondedForceImpl.cpp
+/* -------------------------------------------------------------------------- *
+ *                               OpenMMAmoeba                                 *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2018 Stanford University and the Authors.           *
+ * Authors: Peter Eastman, Mark Friedrichs                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/HippoNonbondedForceImpl.h"
+#include "openmm/amoebaKernels.h"
+using namespace OpenMM;
+using namespace std;
+HippoNonbondedForceImpl::HippoNonbondedForceImpl(const HippoNonbondedForce& owner) : owner(owner) {
+}
+HippoNonbondedForceImpl::~HippoNonbondedForceImpl() {
+}
+void HippoNonbondedForceImpl::initialize(ContextImpl& context) {
+    const System& system = context.getSystem();
+    int numParticles = system.getNumParticles();
+    if (owner.getNumParticles() != numParticles)
+        throw OpenMMException("HippoNonbondedForce must have exactly as many particles as the System it belongs to.");
+    // check cutoff < 0.5*boxSize
+    if (owner.getNonbondedMethod() == HippoNonbondedForce::PME) {
+        Vec3 boxVectors[3];
+        system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+        double cutoff = owner.getCutoffDistance();
+        if (cutoff > 0.5*boxVectors[0][0] || cutoff > 0.5*boxVectors[1][1] || cutoff > 0.5*boxVectors[2][2])
+            throw OpenMMException("HippoNonbondedForce: The cutoff distance cannot be greater than half the periodic box size.");
+    }
+    double quadrupoleValidationTolerance = 1.0e-05;
+    for (int i = 0; i < numParticles; i++) {
+        int axisType, multipoleAtomZ, multipoleAtomX, multipoleAtomY;
+        double charge, coreCharge, alpha, epsilon, damping, c6, pauliK, pauliQ, pauliAlpha, polarizability;
+        vector<double> dipole, quadrupole;
+        owner.getParticleParameters(i, charge, dipole, quadrupole, coreCharge,
+                               alpha, epsilon, damping, c6, pauliK, pauliQ, pauliAlpha,
+                               polarizability, axisType, multipoleAtomZ, multipoleAtomX, multipoleAtomY);
+       // check quadrupole is traceless and symmetric
+       double trace = fabs(quadrupole[0] + quadrupole[4] + quadrupole[8]);
+       if (trace > quadrupoleValidationTolerance) {
+             std::stringstream buffer;
+             buffer << "HippoNonbondedForce: qudarupole for particle=" << i;
+             buffer << " has nonzero trace: " << trace << "; AMOEBA plugin assumes traceless quadrupole.";
+             throw OpenMMException(buffer.str());
+       }
+       if (fabs(quadrupole[1] - quadrupole[3]) > quadrupoleValidationTolerance ) {
+             std::stringstream buffer;
+             buffer << "HippoNonbondedForce: XY and YX components of quadrupole for particle=" << i;
+             buffer << "  are not equal: [" << quadrupole[1] << " " << quadrupole[3] << "];";
+             buffer << " AMOEBA plugin assumes symmetric quadrupole tensor.";
+             throw OpenMMException(buffer.str());
+       }
+       if (fabs(quadrupole[2] - quadrupole[6]) > quadrupoleValidationTolerance ) {
+             std::stringstream buffer;
+             buffer << "HippoNonbondedForce: XZ and ZX components of quadrupole for particle=" << i;
+             buffer << "  are not equal: [" << quadrupole[2] << " " << quadrupole[6] << "];";
+             buffer << " AMOEBA plugin assumes symmetric quadrupole tensor.";
+             throw OpenMMException(buffer.str());
+       }
+       if (fabs(quadrupole[5] - quadrupole[7]) > quadrupoleValidationTolerance ) {
+             std::stringstream buffer;
+             buffer << "HippoNonbondedForce: YZ and ZY components of quadrupole for particle=" << i;
+             buffer << "  are not equal: [" << quadrupole[5] << " " << quadrupole[7] << "];";
+             buffer << " AMOEBA plugin assumes symmetric quadrupole tensor.";
+             throw OpenMMException(buffer.str());
+       }
+       // only 'Z-then-X', 'Bisector', Z-Bisect, ThreeFold  currently handled
+        if (axisType != HippoNonbondedForce::ZThenX     && axisType != HippoNonbondedForce::Bisector &&
+            axisType != HippoNonbondedForce::ZBisect    && axisType != HippoNonbondedForce::ThreeFold &&
+            axisType != HippoNonbondedForce::ZOnly      && axisType != HippoNonbondedForce::NoAxisType) {
+             std::stringstream buffer;
+             buffer << "HippoNonbondedForce: axis type=" << axisType;
+             buffer << " not currently handled - only axisTypes[ ";
+             buffer << HippoNonbondedForce::ZThenX   << ", " << HippoNonbondedForce::Bisector  << ", ";
+             buffer << HippoNonbondedForce::ZBisect  << ", " << HippoNonbondedForce::ThreeFold << ", ";
+             buffer << HippoNonbondedForce::NoAxisType;
+             buffer << "] (ZThenX, Bisector, Z-Bisect, ThreeFold, NoAxisType) currently handled .";
+             throw OpenMMException(buffer.str());
+        }
+        if (axisType != HippoNonbondedForce::NoAxisType && (multipoleAtomZ < 0 || multipoleAtomZ >= numParticles)) {
+            std::stringstream buffer;
+            buffer << "HippoNonbondedForce: invalid z axis particle: " << multipoleAtomZ;
+            throw OpenMMException(buffer.str());
+        }
+        if (axisType != HippoNonbondedForce::NoAxisType && axisType != HippoNonbondedForce::ZOnly &&
+                (multipoleAtomX < 0 || multipoleAtomX >= numParticles)) {
+            std::stringstream buffer;
+            buffer << "HippoNonbondedForce: invalid x axis particle: " << multipoleAtomX;
+            throw OpenMMException(buffer.str());
+        }
+        if ((axisType == HippoNonbondedForce::ZBisect || axisType == HippoNonbondedForce::ThreeFold) &&
+                (multipoleAtomY < 0 || multipoleAtomY >= numParticles)) {
+            std::stringstream buffer;
+            buffer << "HippoNonbondedForce: invalid y axis particle: " << multipoleAtomY;
+            throw OpenMMException(buffer.str());
+        }
+    }
+    kernel = context.getPlatform().createKernel(CalcHippoNonbondedForceKernel::Name(), context);
+    kernel.getAs<CalcHippoNonbondedForceKernel>().initialize(context.getSystem(), owner);
+}
+double HippoNonbondedForceImpl::calcForcesAndEnergy(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
+    if ((groups&(1<<owner.getForceGroup())) != 0)
+        return kernel.getAs<CalcHippoNonbondedForceKernel>().execute(context, includeForces, includeEnergy);
+    return 0.0;
+}
+std::vector<std::string> HippoNonbondedForceImpl::getKernelNames() {
+    std::vector<std::string> names;
+    names.push_back(CalcHippoNonbondedForceKernel::Name());
+    return names;
+}
+void HippoNonbondedForceImpl::getLabFramePermanentDipoles(ContextImpl& context, vector<Vec3>& dipoles) {
+    kernel.getAs<CalcHippoNonbondedForceKernel>().getLabFramePermanentDipoles(context, dipoles);
+}
+void HippoNonbondedForceImpl::getInducedDipoles(ContextImpl& context, vector<Vec3>& dipoles) {
+    kernel.getAs<CalcHippoNonbondedForceKernel>().getInducedDipoles(context, dipoles);
+}
+void HippoNonbondedForceImpl::updateParametersInContext(ContextImpl& context) {
+    kernel.getAs<CalcHippoNonbondedForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
+}
+void HippoNonbondedForceImpl::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    kernel.getAs<CalcHippoNonbondedForceKernel>().getPMEParameters(alpha, nx, ny, nz);
+}
+void HippoNonbondedForceImpl::getDPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    kernel.getAs<CalcHippoNonbondedForceKernel>().getDPMEParameters(alpha, nx, ny, nz);
+}
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
 * Authors: Mark Friedrichs, Peter Eastman                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -59,6 +59,7 @@ extern "C" OPENMM_EXPORT void registerKernelFactories() {
        platform.registerKernelFactory(CalcAmoebaGeneralizedKirkwoodForceKernel::Name(), factory);
        platform.registerKernelFactory(CalcAmoebaVdwForceKernel::Name(), factory);
        platform.registerKernelFactory(CalcAmoebaWcaDispersionForceKernel::Name(), factory);
+        platform.registerKernelFactory(CalcHippoNonbondedForceKernel::Name(), factory);
    }
    catch (...) {
        // Ignore.  The CUDA platform isn't available.
@@ -112,5 +113,8 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
    if (name == CalcAmoebaWcaDispersionForceKernel::Name())
        return new CudaCalcAmoebaWcaDispersionForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcHippoNonbondedForceKernel::Name())
+        return new CudaCalcHippoNonbondedForceKernel(name, platform, cu, context.getSystem());
    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
 }
\ No newline at end of file
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -40,7 +40,6 @@
 #include "CudaFFT3D.h"
 #include "CudaForceInfo.h"
 #include "CudaKernelSources.h"
-#include "CudaNonbondedUtilities.h"
 #include "jama_lu.h"
 #include <algorithm>
@@ -766,13 +765,11 @@ private:
 CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : 
        CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false), hasCreatedEvent(false),
-        sort(NULL), gkKernel(NULL) {
+        gkKernel(NULL) {
 }
 CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
    cu.setAsCurrent();
-    if (sort != NULL)
-        delete sort;
    if (hasInitializedFFT)
        cufftDestroy(fft);
    if (hasCreatedEvent)
@@ -1121,13 +1118,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
        pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
        pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
-        pmeIgrid.initialize<int4>(cu, numMultipoles, "pmeIgrid");
        pmePhi.initialize(cu, 20*numMultipoles, elementSize, "pmePhi");
        pmePhid.initialize(cu, 10*numMultipoles, elementSize, "pmePhid");
        pmePhip.initialize(cu, 10*numMultipoles, elementSize, "pmePhip");
        pmePhidp.initialize(cu, 20*numMultipoles, elementSize, "pmePhidp");
        pmeCphi.initialize(cu, 10*numMultipoles, elementSize, "pmeCphi");
-        sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
        cufftResult result = cufftPlan3d(&fft, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2Z : CUFFT_C2C);
        if (result != CUFFT_SUCCESS)
            throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
@@ -1420,10 +1415,10 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms());
        void* finishSpreadArgs[] = {&pmeGrid.getDevicePointer()};
-        if (cu.getUseDoublePrecision())
+        if (cu.getUseDoublePrecision()) {
            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid.getSize());
-        if (cu.getUseDoublePrecision())
            cufftExecZ2Z(fft, (double2*) pmeGrid.getDevicePointer(), (double2*) pmeGrid.getDevicePointer(), CUFFT_FORWARD);
+        }
        else
            cufftExecC2C(fft, (float2*) pmeGrid.getDevicePointer(), (float2*) pmeGrid.getDevicePointer(), CUFFT_FORWARD);
        void* pmeConvolutionArgs[] = {&pmeGrid.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(),
@@ -1466,10 +1461,10 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            &pmeGrid.getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
-        if (cu.getUseDoublePrecision())
+        if (cu.getUseDoublePrecision()) {
            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid.getSize());
-        if (cu.getUseDoublePrecision())
            cufftExecZ2Z(fft, (double2*) pmeGrid.getDevicePointer(), (double2*) pmeGrid.getDevicePointer(), CUFFT_FORWARD);
+        }
        else
            cufftExecC2C(fft, (float2*) pmeGrid.getDevicePointer(), (float2*) pmeGrid.getDevicePointer(), CUFFT_FORWARD);
        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
@@ -1614,9 +1609,8 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
        if (cu.getUseDoublePrecision()) {
            void* finishSpreadArgs[] = {&pmeGrid.getDevicePointer()};
            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid.getSize());
-        }
-        if (cu.getUseDoublePrecision())
            cufftExecZ2Z(fft, (double2*) pmeGrid.getDevicePointer(), (double2*) pmeGrid.getDevicePointer(), CUFFT_FORWARD);
+        }
        else
            cufftExecC2C(fft, (float2*) pmeGrid.getDevicePointer(), (float2*) pmeGrid.getDevicePointer(), CUFFT_FORWARD);
        void* pmeConvolutionArgs[] = {&pmeGrid.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(),
@@ -1710,12 +1704,12 @@ void CudaCalcAmoebaMultipoleForceKernel::computeExtrapolatedDipoles(void** recip
    // Start by storing the direct dipoles as PT0
    if (gkKernel == NULL) {
-        void* initArgs[] = {&inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &extrapolatedDipole.getDevicePointer(),
+        void* initArgs[] = {&inducedDipole.getDevicePointer(), &extrapolatedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(),
            &extrapolatedDipolePolar.getDevicePointer(), &inducedDipoleFieldGradient.getDevicePointer(), &inducedDipoleFieldGradientPolar.getDevicePointer()};
        cu.executeKernel(initExtrapolatedKernel, initArgs, extrapolatedDipole.getSize());
    }
    else {
-        void* initArgs[] = {&inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &extrapolatedDipole.getDevicePointer(),
+        void* initArgs[] = {&inducedDipole.getDevicePointer(), &extrapolatedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(),
            &extrapolatedDipolePolar.getDevicePointer(), &inducedDipoleFieldGradient.getDevicePointer(), &inducedDipoleFieldGradientPolar.getDevicePointer(),
            &gkKernel->getInducedDipoles().getDevicePointer(), &gkKernel->getInducedDipolesPolar().getDevicePointer(), &extrapolatedDipoleGk.getDevicePointer(),
            &extrapolatedDipoleGkPolar.getDevicePointer(), &inducedDipoleFieldGradientGk.getDevicePointer(), &inducedDipoleFieldGradientGkPolar.getDevicePointer()};
@@ -1727,16 +1721,17 @@ void CudaCalcAmoebaMultipoleForceKernel::computeExtrapolatedDipoles(void** recip
    for (int order = 1; order < maxExtrapolationOrder; ++order) {
        computeInducedField(recipBoxVectorPointer);
        if (gkKernel == NULL) {
-            void* iterateArgs[] = {&order, &inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &extrapolatedDipole.getDevicePointer(),
+            void* iterateArgs[] = {&order, &inducedDipole.getDevicePointer(), &extrapolatedDipole.getDevicePointer(), &inducedField.getDevicePointer(),
-                &extrapolatedDipolePolar.getDevicePointer(), &inducedDipoleFieldGradient.getDevicePointer(), &inducedDipoleFieldGradientPolar.getDevicePointer(),
+                &inducedDipolePolar.getDevicePointer(), &extrapolatedDipolePolar.getDevicePointer(), &inducedFieldPolar.getDevicePointer(),
-                &inducedField.getDevicePointer(), &inducedFieldPolar.getDevicePointer(), &extrapolatedDipoleFieldGradient.getDevicePointer(), &extrapolatedDipoleFieldGradientPolar.getDevicePointer(),
+                &inducedDipoleFieldGradient.getDevicePointer(), &inducedDipoleFieldGradientPolar.getDevicePointer(),
-                &polarizability.getDevicePointer()};
+                &extrapolatedDipoleFieldGradient.getDevicePointer(), &extrapolatedDipoleFieldGradientPolar.getDevicePointer(), &polarizability.getDevicePointer()};
            cu.executeKernel(iterateExtrapolatedKernel, iterateArgs, extrapolatedDipole.getSize());
        }
        else {
-            void* iterateArgs[] = {&order, &inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &extrapolatedDipole.getDevicePointer(),
+            void* iterateArgs[] = {&order, &inducedDipole.getDevicePointer(), &extrapolatedDipole.getDevicePointer(), &inducedField.getDevicePointer(),
-                &extrapolatedDipolePolar.getDevicePointer(), &inducedDipoleFieldGradient.getDevicePointer(), &inducedDipoleFieldGradientPolar.getDevicePointer(),
+                &inducedDipolePolar.getDevicePointer(), &extrapolatedDipolePolar.getDevicePointer(), &inducedFieldPolar.getDevicePointer(),
-                &inducedField.getDevicePointer(), &inducedFieldPolar.getDevicePointer(), &extrapolatedDipoleFieldGradient.getDevicePointer(), &extrapolatedDipoleFieldGradientPolar.getDevicePointer(),
+                &inducedDipoleFieldGradient.getDevicePointer(), &inducedDipoleFieldGradientPolar.getDevicePointer(),
+                &extrapolatedDipoleFieldGradient.getDevicePointer(), &extrapolatedDipoleFieldGradientPolar.getDevicePointer(),
                &gkKernel->getInducedDipoles().getDevicePointer(), &gkKernel->getInducedDipolesPolar().getDevicePointer(), &extrapolatedDipoleGk.getDevicePointer(),
                &extrapolatedDipoleGkPolar.getDevicePointer(), &inducedDipoleFieldGradientGk.getDevicePointer(), &inducedDipoleFieldGradientGkPolar.getDevicePointer(),
                &gkKernel->getInducedField().getDevicePointer(), &gkKernel->getInducedFieldPolar().getDevicePointer(),
@@ -1749,12 +1744,12 @@ void CudaCalcAmoebaMultipoleForceKernel::computeExtrapolatedDipoles(void** recip
    // Take a linear combination of the µ_(n) components to form the total dipole
    if (gkKernel == NULL) {
-        void* computeArgs[] = {&inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &extrapolatedDipole.getDevicePointer(),
+        void* computeArgs[] = {&inducedDipole.getDevicePointer(), &extrapolatedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(),
                &extrapolatedDipolePolar.getDevicePointer()};
        cu.executeKernel(computeExtrapolatedKernel, computeArgs, extrapolatedDipole.getSize());
    }
    else {
-        void* computeArgs[] = {&inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &extrapolatedDipole.getDevicePointer(),
+        void* computeArgs[] = {&inducedDipole.getDevicePointer(), &extrapolatedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(),
                &extrapolatedDipolePolar.getDevicePointer(), &gkKernel->getInducedDipoles().getDevicePointer(), &gkKernel->getInducedDipolesPolar().getDevicePointer(),
                &extrapolatedDipoleGk.getDevicePointer(), &extrapolatedDipoleGkPolar.getDevicePointer()};
        cu.executeKernel(computeExtrapolatedKernel, computeArgs, extrapolatedDipole.getSize());
@@ -2593,3 +2588,1153 @@ void CudaCalcAmoebaWcaDispersionForceKernel::copyParametersToContext(ContextImpl
    totalMaximumDispersionEnergy = AmoebaWcaDispersionForceImpl::getTotalMaximumDispersionEnergy(force);
    cu.invalidateMolecules();
 }
+/* -------------------------------------------------------------------------- *
+ *                           HippoNonbondedForce                              *
+ * -------------------------------------------------------------------------- */
+class CudaCalcHippoNonbondedForceKernel::ForceInfo : public CudaForceInfo {
+public:
+    ForceInfo(const HippoNonbondedForce& force) : force(force) {
+    }
+    bool areParticlesIdentical(int particle1, int particle2) {
+        double charge1, coreCharge1, alpha1, epsilon1, damping1, c61, pauliK1, pauliQ1, pauliAlpha1, polarizability1;
+        double charge2, coreCharge2, alpha2, epsilon2, damping2, c62, pauliK2, pauliQ2, pauliAlpha2, polarizability2;
+        int axisType1, multipoleZ1, multipoleX1, multipoleY1;
+        int axisType2, multipoleZ2, multipoleX2, multipoleY2;
+        vector<double> dipole1, dipole2, quadrupole1, quadrupole2;
+        force.getParticleParameters(particle1, charge1, dipole1, quadrupole1, coreCharge1, alpha1, epsilon1, damping1, c61, pauliK1, pauliQ1, pauliAlpha1,
+                                    polarizability1, axisType1, multipoleZ1, multipoleX1, multipoleY1);
+        force.getParticleParameters(particle2, charge2, dipole2, quadrupole2, coreCharge2, alpha2, epsilon2, damping2, c62, pauliK2, pauliQ2, pauliAlpha2,
+                                    polarizability2, axisType2, multipoleZ2, multipoleX2, multipoleY2);
+        if (charge1 != charge2 || coreCharge1 != coreCharge2 || alpha1 != alpha2 || epsilon1 != epsilon1 || damping1 != damping2 || c61 != c62 ||
+                pauliK1 != pauliK2 || pauliQ1 != pauliQ2 || pauliAlpha1 != pauliAlpha2 || polarizability1 != polarizability2 || axisType1 != axisType2) {
+            return false;
+        }
+        for (int i = 0; i < dipole1.size(); ++i)
+            if (dipole1[i] != dipole2[i])
+                return false;
+        for (int i = 0; i < quadrupole1.size(); ++i)
+            if (quadrupole1[i] != quadrupole2[i])
+                return false;
+        return true;
+    }
+    int getNumParticleGroups() {
+        return force.getNumExceptions();
+    }
+    void getParticlesInGroup(int index, vector<int>& particles) {
+        int particle1, particle2;
+        double multipoleMultipoleScale, dipoleMultipoleScale, dipoleDipoleScale, dispersionScale, repulsionScale;
+        force.getExceptionParameters(index, particle1, particle2, multipoleMultipoleScale, dipoleMultipoleScale, dipoleDipoleScale, dispersionScale, repulsionScale);
+        particles.resize(2);
+        particles[0] = particle1;
+        particles[1] = particle2;
+    }
+    bool areGroupsIdentical(int group1, int group2) {
+        int particle1, particle2;
+        double multipoleMultipoleScale1, dipoleMultipoleScale1, dipoleDipoleScale1, dispersionScale1, repulsionScale1;
+        double multipoleMultipoleScale2, dipoleMultipoleScale2, dipoleDipoleScale2, dispersionScale2, repulsionScale2;
+        force.getExceptionParameters(group1, particle1, particle2, multipoleMultipoleScale1, dipoleMultipoleScale1, dipoleDipoleScale1, dispersionScale1, repulsionScale1);
+        force.getExceptionParameters(group2, particle1, particle2, multipoleMultipoleScale2, dipoleMultipoleScale2, dipoleDipoleScale2, dispersionScale2, repulsionScale2);
+        return (multipoleMultipoleScale1 == multipoleMultipoleScale2 && dipoleMultipoleScale1 == dipoleMultipoleScale2 &&
+                dipoleDipoleScale1 == dipoleDipoleScale2 && dispersionScale1 == dispersionScale2 && repulsionScale1 == repulsionScale2);
+    }
+private:
+    const HippoNonbondedForce& force;
+};
+class CudaCalcHippoNonbondedForceKernel::TorquePostComputation : public CudaContext::ForcePostComputation {
+public:
+    TorquePostComputation(CudaCalcHippoNonbondedForceKernel& owner) : owner(owner) {
+    }
+    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        owner.addTorquesToForces();
+        return 0.0;
+    }
+private:
+    CudaCalcHippoNonbondedForceKernel& owner;
+};
+CudaCalcHippoNonbondedForceKernel::CudaCalcHippoNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : 
+        CalcHippoNonbondedForceKernel(name, platform), cu(cu), system(system), sort(NULL), hasInitializedKernels(false), hasInitializedFFT(false), multipolesAreValid(false) {
+}
+CudaCalcHippoNonbondedForceKernel::~CudaCalcHippoNonbondedForceKernel() {
+    cu.setAsCurrent();
+    if (sort != NULL)
+        delete sort;
+    if (hasInitializedFFT) {
+        cufftDestroy(fftForward);
+        cufftDestroy(fftBackward);
+        cufftDestroy(dfftForward);
+        cufftDestroy(dfftBackward);
+    }
+}
+void CudaCalcHippoNonbondedForceKernel::initialize(const System& system, const HippoNonbondedForce& force) {
+    cu.setAsCurrent();
+    extrapolationCoefficients = force.getExtrapolationCoefficients();
+    usePME = (force.getNonbondedMethod() == HippoNonbondedForce::PME);
+    // Initialize particle parameters.
+    numParticles = force.getNumParticles();
+    vector<double> coreChargeVec, valenceChargeVec, alphaVec, epsilonVec, dampingVec, c6Vec, pauliKVec, pauliQVec, pauliAlphaVec, polarizabilityVec;
+    vector<double> localDipolesVec, localQuadrupolesVec;
+    vector<int4> multipoleParticlesVec;
+    vector<vector<int> > exclusions(numParticles);
+    for (int i = 0; i < numParticles; i++) {
+        double charge, coreCharge, alpha, epsilon, damping, c6, pauliK, pauliQ, pauliAlpha, polarizability;
+        int axisType, atomX, atomY, atomZ;
+        vector<double> dipole, quadrupole;
+        force.getParticleParameters(i, charge, dipole, quadrupole, coreCharge, alpha, epsilon, damping, c6, pauliK, pauliQ, pauliAlpha,
+                                    polarizability, axisType, atomZ, atomX, atomY);
+        coreChargeVec.push_back(coreCharge);
+        valenceChargeVec.push_back(charge-coreCharge);
+        alphaVec.push_back(alpha);
+        epsilonVec.push_back(epsilon);
+        dampingVec.push_back(damping);
+        c6Vec.push_back(c6);
+        pauliKVec.push_back(pauliK);
+        pauliQVec.push_back(pauliQ);
+        pauliAlphaVec.push_back(pauliAlpha);
+        polarizabilityVec.push_back(polarizability);
+        multipoleParticlesVec.push_back(make_int4(atomX, atomY, atomZ, axisType));
+        for (int j = 0; j < 3; j++)
+            localDipolesVec.push_back(dipole[j]);
+        localQuadrupolesVec.push_back(quadrupole[0]);
+        localQuadrupolesVec.push_back(quadrupole[1]);
+        localQuadrupolesVec.push_back(quadrupole[2]);
+        localQuadrupolesVec.push_back(quadrupole[4]);
+        localQuadrupolesVec.push_back(quadrupole[5]);
+        exclusions[i].push_back(i);
+    }
+    int paddedNumAtoms = cu.getPaddedNumAtoms();
+    for (int i = numParticles; i < paddedNumAtoms; i++) {
+        coreChargeVec.push_back(0);
+        valenceChargeVec.push_back(0);
+        alphaVec.push_back(0);
+        epsilonVec.push_back(0);
+        dampingVec.push_back(0);
+        c6Vec.push_back(0);
+        pauliKVec.push_back(0);
+        pauliQVec.push_back(0);
+        pauliAlphaVec.push_back(0);
+        polarizabilityVec.push_back(0);
+        multipoleParticlesVec.push_back(make_int4(0, 0, 0, 0));
+        for (int j = 0; j < 3; j++)
+            localDipolesVec.push_back(0);
+        for (int j = 0; j < 5; j++)
+            localQuadrupolesVec.push_back(0);
+    }
+    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+    coreCharge.initialize(cu, paddedNumAtoms, elementSize, "coreCharge");
+    valenceCharge.initialize(cu, paddedNumAtoms, elementSize, "valenceCharge");
+    alpha.initialize(cu, paddedNumAtoms, elementSize, "alpha");
+    epsilon.initialize(cu, paddedNumAtoms, elementSize, "epsilon");
+    damping.initialize(cu, paddedNumAtoms, elementSize, "damping");
+    c6.initialize(cu, paddedNumAtoms, elementSize, "c6");
+    pauliK.initialize(cu, paddedNumAtoms, elementSize, "pauliK");
+    pauliQ.initialize(cu, paddedNumAtoms, elementSize, "pauliQ");
+    pauliAlpha.initialize(cu, paddedNumAtoms, elementSize, "pauliAlpha");
+    polarizability.initialize(cu, paddedNumAtoms, elementSize, "polarizability");
+    multipoleParticles.initialize<int4>(cu, paddedNumAtoms, "multipoleParticles");
+    localDipoles.initialize(cu, 3*paddedNumAtoms, elementSize, "localDipoles");
+    localQuadrupoles.initialize(cu, 5*paddedNumAtoms, elementSize, "localQuadrupoles");
+    lastPositions.initialize(cu, cu.getPosq().getSize(), cu.getPosq().getElementSize(), "lastPositions");
+    coreCharge.upload(coreChargeVec, true);
+    valenceCharge.upload(valenceChargeVec, true);
+    alpha.upload(alphaVec, true);
+    epsilon.upload(epsilonVec, true);
+    damping.upload(dampingVec, true);
+    c6.upload(c6Vec, true);
+    pauliK.upload(pauliKVec, true);
+    pauliQ.upload(pauliQVec, true);
+    pauliAlpha.upload(pauliAlphaVec, true);
+    polarizability.upload(polarizabilityVec, true);
+    multipoleParticles.upload(multipoleParticlesVec);
+    localDipoles.upload(localDipolesVec, true);
+    localQuadrupoles.upload(localQuadrupolesVec, true);
+    // Create workspace arrays.
+    labDipoles.initialize(cu, paddedNumAtoms, 3*elementSize, "dipole");
+    labQuadrupoles[0].initialize(cu, paddedNumAtoms, elementSize, "qXX");
+    labQuadrupoles[1].initialize(cu, paddedNumAtoms, elementSize, "qXY");
+    labQuadrupoles[2].initialize(cu, paddedNumAtoms, elementSize, "qXZ");
+    labQuadrupoles[3].initialize(cu, paddedNumAtoms, elementSize, "qYY");
+    labQuadrupoles[4].initialize(cu, paddedNumAtoms, elementSize, "qYZ");
+    fracDipoles.initialize(cu, paddedNumAtoms, 3*elementSize, "fracDipoles");
+    fracQuadrupoles.initialize(cu, 6*paddedNumAtoms, elementSize, "fracQuadrupoles");
+    field.initialize(cu, 3*paddedNumAtoms, sizeof(long long), "field");
+    inducedField.initialize(cu, 3*paddedNumAtoms, sizeof(long long), "inducedField");
+    torque.initialize(cu, 3*paddedNumAtoms, sizeof(long long), "torque");
+    inducedDipole.initialize(cu, paddedNumAtoms, 3*elementSize, "inducedDipole");
+    int numOrders = extrapolationCoefficients.size();
+    extrapolatedDipole.initialize(cu, 3*numParticles*numOrders, elementSize, "extrapolatedDipole");
+    extrapolatedPhi.initialize(cu, 10*numParticles*numOrders, elementSize, "extrapolatedPhi");
+    cu.addAutoclearBuffer(field);
+    cu.addAutoclearBuffer(torque);
+    // Record exceptions and exclusions.
+    vector<double> exceptionScaleVec[5];
+    vector<int2> exceptionAtomsVec;
+    for (int i = 0; i < force.getNumExceptions(); i++) {
+        int particle1, particle2;
+        double multipoleMultipoleScale, dipoleMultipoleScale, dipoleDipoleScale, dispersionScale, repulsionScale;
+        force.getExceptionParameters(i, particle1, particle2, multipoleMultipoleScale, dipoleMultipoleScale, dipoleDipoleScale, dispersionScale, repulsionScale);
+        exclusions[particle1].push_back(particle2);
+        exclusions[particle2].push_back(particle1);
+        if (usePME || multipoleMultipoleScale != 0 || dipoleMultipoleScale != 0 || dipoleDipoleScale != 0 || dispersionScale != 0 || repulsionScale != 0) {
+            exceptionAtomsVec.push_back(make_int2(particle1, particle2));
+            exceptionScaleVec[0].push_back(multipoleMultipoleScale);
+            exceptionScaleVec[1].push_back(dipoleMultipoleScale);
+            exceptionScaleVec[2].push_back(dipoleDipoleScale);
+            exceptionScaleVec[3].push_back(dispersionScale);
+            exceptionScaleVec[4].push_back(repulsionScale);
+        }
+    }
+    if (exceptionAtomsVec.size() > 0) {
+        exceptionAtoms.initialize<int2>(cu, exceptionAtomsVec.size(), "exceptionAtoms");
+        exceptionAtoms.upload(exceptionAtomsVec);
+        for (int i = 0; i < 5; i++) {
+            exceptionScales[i].initialize(cu, exceptionAtomsVec.size(), elementSize, "exceptionScales");
+            exceptionScales[i].upload(exceptionScaleVec[i], true);
+        }
+    }
+    // Create the kernels.
+    bool useShuffle = (cu.getComputeCapability() >= 3.0 && !cu.getUseDoublePrecision());
+    map<string, string> defines;
+    defines["HIPPO"] = "1";
+    defines["NUM_ATOMS"] = cu.intToString(numParticles);
+    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+    defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
+    defines["ENERGY_SCALE_FACTOR"] = cu.doubleToString(138.9354558456);
+    if (useShuffle)
+        defines["USE_SHUFFLE"] = "";
+    maxExtrapolationOrder = extrapolationCoefficients.size();
+    defines["MAX_EXTRAPOLATION_ORDER"] = cu.intToString(maxExtrapolationOrder);
+    stringstream coefficients;
+    for (int i = 0; i < maxExtrapolationOrder; i++) {
+        if (i > 0)
+            coefficients << ",";
+        double sum = 0;
+        for (int j = i; j < maxExtrapolationOrder; j++)
+            sum += extrapolationCoefficients[j];
+        coefficients << cu.doubleToString(sum);
+    }
+    defines["EXTRAPOLATION_COEFFICIENTS_SUM"] = coefficients.str();
+    cutoff = force.getCutoffDistance();
+    if (usePME) {
+        int nx, ny, nz;
+        force.getPMEParameters(pmeAlpha, nx, ny, nz);
+        if (nx == 0 || pmeAlpha == 0) {
+            NonbondedForce nb;
+            nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
+            nb.setCutoffDistance(force.getCutoffDistance());
+            NonbondedForceImpl::calcPMEParameters(system, nb, pmeAlpha, gridSizeX, gridSizeY, gridSizeZ, false);
+            gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
+            gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
+            gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ);
+        } else {
+            gridSizeX = CudaFFT3D::findLegalDimension(nx);
+            gridSizeY = CudaFFT3D::findLegalDimension(ny);
+            gridSizeZ = CudaFFT3D::findLegalDimension(nz);
+        }
+        force.getDPMEParameters(dpmeAlpha, nx, ny, nz);
+        if (nx == 0 || dpmeAlpha == 0) {
+            NonbondedForce nb;
+            nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
+            nb.setCutoffDistance(force.getCutoffDistance());
+            NonbondedForceImpl::calcPMEParameters(system, nb, dpmeAlpha, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
+            dispersionGridSizeX = CudaFFT3D::findLegalDimension(dispersionGridSizeX);
+            dispersionGridSizeY = CudaFFT3D::findLegalDimension(dispersionGridSizeY);
+            dispersionGridSizeZ = CudaFFT3D::findLegalDimension(dispersionGridSizeZ);
+        } else {
+            dispersionGridSizeX = CudaFFT3D::findLegalDimension(nx);
+            dispersionGridSizeY = CudaFFT3D::findLegalDimension(ny);
+            dispersionGridSizeZ = CudaFFT3D::findLegalDimension(nz);
+        }
+        defines["EWALD_ALPHA"] = cu.doubleToString(pmeAlpha);
+        defines["SQRT_PI"] = cu.doubleToString(sqrt(M_PI));
+        defines["USE_EWALD"] = "";
+        defines["USE_CUTOFF"] = "";
+        defines["USE_PERIODIC"] = "";
+        defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
+    }
+    CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::hippoMultipoles, defines);
+    computeMomentsKernel = cu.getKernel(module, "computeLabFrameMoments");
+    recordInducedDipolesKernel = cu.getKernel(module, "recordInducedDipoles");
+    mapTorqueKernel = cu.getKernel(module, "mapTorqueToForce");
+    module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleInducedField, defines);
+    initExtrapolatedKernel = cu.getKernel(module, "initExtrapolatedDipoles");
+    iterateExtrapolatedKernel = cu.getKernel(module, "iterateExtrapolatedDipoles");
+    computeExtrapolatedKernel = cu.getKernel(module, "computeExtrapolatedDipoles");
+    polarizationEnergyKernel = cu.getKernel(module, "computePolarizationEnergy");
+    // Set up PME.
+    if (usePME) {
+        // Create the PME kernels.
+        map<string, string> pmeDefines;
+        pmeDefines["HIPPO"] = "1";
+        pmeDefines["EWALD_ALPHA"] = cu.doubleToString(pmeAlpha);
+        pmeDefines["DISPERSION_EWALD_ALPHA"] = cu.doubleToString(dpmeAlpha);
+        pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
+        pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
+        pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+        pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(138.9354558456);
+        pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
+        pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
+        pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
+        pmeDefines["M_PI"] = cu.doubleToString(M_PI);
+        pmeDefines["SQRT_PI"] = cu.doubleToString(sqrt(M_PI));
+        pmeDefines["EXTRAPOLATION_COEFFICIENTS_SUM"] = coefficients.str();
+        pmeDefines["MAX_EXTRAPOLATION_ORDER"] = cu.intToString(maxExtrapolationOrder);
+        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines);
+        pmeTransformMultipolesKernel = cu.getKernel(module, "transformMultipolesToFractionalCoordinates");
+        pmeTransformPotentialKernel = cu.getKernel(module, "transformPotentialToCartesianCoordinates");
+        pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles");
+        pmeSpreadInducedDipolesKernel = cu.getKernel(module, "gridSpreadInducedDipoles");
+        pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
+        pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
+        pmeFixedPotentialKernel = cu.getKernel(module, "computeFixedPotentialFromGrid");
+        pmeInducedPotentialKernel = cu.getKernel(module, "computeInducedPotentialFromGrid");
+        pmeFixedForceKernel = cu.getKernel(module, "computeFixedMultipoleForceAndEnergy");
+        pmeInducedForceKernel = cu.getKernel(module, "computeInducedDipoleForceAndEnergy");
+        pmeRecordInducedFieldDipolesKernel = cu.getKernel(module, "recordInducedFieldDipoles");
+        pmeSelfEnergyKernel = cu.getKernel(module, "calculateSelfEnergyAndTorque");
+        // Create the dispersion PME kernels.
+        pmeDefines["EWALD_ALPHA"] = cu.doubleToString(dpmeAlpha);
+        pmeDefines["EPSILON_FACTOR"] = "1";
+        pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
+        pmeDefines["GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
+        pmeDefines["GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
+        pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dpmeAlpha*dpmeAlpha));
+        pmeDefines["CHARGE"] = "charges[atom]";
+        pmeDefines["USE_LJPME"] = "1";
+        module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
+        dpmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
+        dpmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
+        dpmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
+        dpmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
+        dpmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
+        dpmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
+        // Create required data structures.
+        int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
+        int gridElements = gridSizeX*gridSizeY*roundedZSize;
+        roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
+        gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
+        pmeGrid1.initialize(cu, gridElements, elementSize, "pmeGrid1");
+        pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
+        cu.addAutoclearBuffer(pmeGrid1);
+        pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
+        pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
+        pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
+        dpmeBsplineModuliX.initialize(cu, dispersionGridSizeX, elementSize, "dpmeBsplineModuliX");
+        dpmeBsplineModuliY.initialize(cu, dispersionGridSizeY, elementSize, "dpmeBsplineModuliY");
+        dpmeBsplineModuliZ.initialize(cu, dispersionGridSizeZ, elementSize, "dpmeBsplineModuliZ");
+        pmePhi.initialize(cu, 20*numParticles, elementSize, "pmePhi");
+        pmePhidp.initialize(cu, 20*numParticles, elementSize, "pmePhidp");
+        pmeCphi.initialize(cu, 10*numParticles, elementSize, "pmeCphi");
+        pmeAtomGridIndex.initialize<int2>(cu, numParticles, "pmeAtomGridIndex");
+        sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
+        cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
+        if (result != CUFFT_SUCCESS)
+            throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
+        result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
+        if (result != CUFFT_SUCCESS)
+            throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
+        result = cufftPlan3d(&dfftForward, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
+        if (result != CUFFT_SUCCESS)
+            throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
+        result = cufftPlan3d(&dfftBackward, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
+        if (result != CUFFT_SUCCESS)
+            throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
+        hasInitializedFFT = true;
+        // Initialize the B-spline moduli.
+        double data[PmeOrder];
+        double x = 0.0;
+        data[0] = 1.0 - x;
+        data[1] = x;
+        for (int i = 2; i < PmeOrder; i++) {
+            double denom = 1.0/i;
+            data[i] = x*data[i-1]*denom;
+            for (int j = 1; j < i; j++)
+                data[i-j] = ((x+j)*data[i-j-1] + ((i-j+1)-x)*data[i-j])*denom;
+            data[0] = (1.0-x)*data[0]*denom;
+        }
+        int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
+        vector<double> bsplines_data(maxSize+1, 0.0);
+        for (int i = 2; i <= PmeOrder+1; i++)
+            bsplines_data[i] = data[i-2];
+        for (int dim = 0; dim < 3; dim++) {
+            int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
+            vector<double> moduli(ndata);
+            // get the modulus of the discrete Fourier transform
+            double factor = 2.0*M_PI/ndata;
+            for (int i = 0; i < ndata; i++) {
+                double sc = 0.0;
+                double ss = 0.0;
+                for (int j = 1; j <= ndata; j++) {
+                    double arg = factor*i*(j-1);
+                    sc += bsplines_data[j]*cos(arg);
+                    ss += bsplines_data[j]*sin(arg);
+                }
+                moduli[i] = sc*sc+ss*ss;
+            }
+            // Fix for exponential Euler spline interpolation failure.
+            double eps = 1.0e-7;
+            if (moduli[0] < eps)
+                moduli[0] = 0.9*moduli[1];
+            for (int i = 1; i < ndata-1; i++)
+                if (moduli[i] < eps)
+                    moduli[i] = 0.9*(moduli[i-1]+moduli[i+1]);
+            if (moduli[ndata-1] < eps)
+                moduli[ndata-1] = 0.9*moduli[ndata-2];
+            // Compute and apply the optimal zeta coefficient.
+            int jcut = 50;
+            for (int i = 1; i <= ndata; i++) {
+                int k = i - 1;
+                if (i > ndata/2)
+                    k = k - ndata;
+                double zeta;
+                if (k == 0)
+                    zeta = 1.0;
+                else {
+                    double sum1 = 1.0;
+                    double sum2 = 1.0;
+                    factor = M_PI*k/ndata;
+                    for (int j = 1; j <= jcut; j++) {
+                        double arg = factor/(factor+M_PI*j);
+                        sum1 += pow(arg, PmeOrder);
+                        sum2 += pow(arg, 2*PmeOrder);
+                    }
+                    for (int j = 1; j <= jcut; j++) {
+                        double arg = factor/(factor-M_PI*j);
+                        sum1 += pow(arg, PmeOrder);
+                        sum2 += pow(arg, 2*PmeOrder);
+                    }
+                    zeta = sum2/sum1;
+                }
+                moduli[i-1] = moduli[i-1]*zeta*zeta;
+            }
+            if (cu.getUseDoublePrecision()) {
+                if (dim == 0)
+                    pmeBsplineModuliX.upload(moduli);
+                else if (dim == 1)
+                    pmeBsplineModuliY.upload(moduli);
+                else
+                    pmeBsplineModuliZ.upload(moduli);
+            }
+            else {
+                vector<float> modulif(ndata);
+                for (int i = 0; i < ndata; i++)
+                    modulif[i] = (float) moduli[i];
+                if (dim == 0)
+                    pmeBsplineModuliX.upload(modulif);
+                else if (dim == 1)
+                    pmeBsplineModuliY.upload(modulif);
+                else
+                    pmeBsplineModuliZ.upload(modulif);
+            }
+        }
+        // Initialize the b-spline moduli for dispersion PME.
+        maxSize = max(max(dispersionGridSizeX, dispersionGridSizeY), dispersionGridSizeZ);
+        vector<double> ddata(PmeOrder);
+        bsplines_data.resize(maxSize);
+        data[PmeOrder-1] = 0.0;
+        data[1] = 0.0;
+        data[0] = 1.0;
+        for (int i = 3; i < PmeOrder; i++) {
+            double div = 1.0/(i-1.0);
+            data[i-1] = 0.0;
+            for (int j = 1; j < (i-1); j++)
+                data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
+            data[0] = div*data[0];
+        }
+        // Differentiate.
+        ddata[0] = -data[0];
+        for (int i = 1; i < PmeOrder; i++)
+            ddata[i] = data[i-1]-data[i];
+        double div = 1.0/(PmeOrder-1);
+        data[PmeOrder-1] = 0.0;
+        for (int i = 1; i < (PmeOrder-1); i++)
+            data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
+        data[0] = div*data[0];
+        for (int i = 0; i < maxSize; i++)
+            bsplines_data[i] = 0.0;
+        for (int i = 1; i <= PmeOrder; i++)
+            bsplines_data[i] = data[i-1];
+        // Evaluate the actual bspline moduli for X/Y/Z.
+        for(int dim = 0; dim < 3; dim++) {
+            int ndata = (dim == 0 ? dispersionGridSizeX : dim == 1 ? dispersionGridSizeY : dispersionGridSizeZ);
+            vector<double> moduli(ndata);
+            for (int i = 0; i < ndata; i++) {
+                double sc = 0.0;
+                double ss = 0.0;
+                for (int j = 0; j < ndata; j++) {
+                    double arg = (2.0*M_PI*i*j)/ndata;
+                    sc += bsplines_data[j]*cos(arg);
+                    ss += bsplines_data[j]*sin(arg);
+                }
+                moduli[i] = sc*sc+ss*ss;
+            }
+            for (int i = 0; i < ndata; i++)
+                if (moduli[i] < 1.0e-7)
+                    moduli[i] = (moduli[i-1]+moduli[i+1])*0.5;
+            if (dim == 0)
+                dpmeBsplineModuliX.upload(moduli, true);
+            else if (dim == 1)
+                dpmeBsplineModuliY.upload(moduli, true);
+            else
+                dpmeBsplineModuliZ.upload(moduli, true);
+        }
+    }
+    // Add the interaction to the default nonbonded kernel.
+    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
+    nb.setKernelSource(CudaAmoebaKernelSources::hippoInteractionHeader+CudaAmoebaKernelSources::hippoNonbonded);
+    nb.addArgument(CudaNonbondedUtilities::ParameterInfo("torqueBuffers", "unsigned long long", 1, torque.getElementSize(), torque.getDevicePointer(), false));
+    nb.addArgument(CudaNonbondedUtilities::ParameterInfo("extrapolatedDipole", "real3", 1, extrapolatedDipole.getElementSize(), extrapolatedDipole.getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("coreCharge", "real", 1, coreCharge.getElementSize(), coreCharge.getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("valenceCharge", "real", 1, valenceCharge.getElementSize(), valenceCharge.getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("alpha", "real", 1, alpha.getElementSize(), alpha.getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("epsilon", "real", 1, epsilon.getElementSize(), epsilon.getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("damping", "real", 1, damping.getElementSize(), damping.getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("c6", "real", 1, c6.getElementSize(), c6.getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("pauliK", "real", 1, pauliK.getElementSize(), pauliK.getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("pauliQ", "real", 1, pauliQ.getElementSize(), pauliQ.getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("pauliAlpha", "real", 1, pauliAlpha.getElementSize(), pauliAlpha.getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("dipole", "real", 3, labDipoles.getElementSize(), labDipoles.getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("inducedDipole", "real", 3, inducedDipole.getElementSize(), inducedDipole.getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("qXX", "real", 1, labQuadrupoles[0].getElementSize(), labQuadrupoles[0].getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("qXY", "real", 1, labQuadrupoles[1].getElementSize(), labQuadrupoles[1].getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("qXZ", "real", 1, labQuadrupoles[2].getElementSize(), labQuadrupoles[2].getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("qYY", "real", 1, labQuadrupoles[3].getElementSize(), labQuadrupoles[3].getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("qYZ", "real", 1, labQuadrupoles[4].getElementSize(), labQuadrupoles[4].getDevicePointer()));
+    map<string, string> replacements;
+    replacements["ENERGY_SCALE_FACTOR"] = cu.doubleToString(138.9354558456);
+    replacements["SWITCH_CUTOFF"] = cu.doubleToString(force.getSwitchingDistance());
+    replacements["SWITCH_C3"] = cu.doubleToString(10/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 3.0));
+    replacements["SWITCH_C4"] = cu.doubleToString(15/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 4.0));
+    replacements["SWITCH_C5"] = cu.doubleToString(6/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 5.0));
+    replacements["MAX_EXTRAPOLATION_ORDER"] = cu.intToString(maxExtrapolationOrder);
+    replacements["EXTRAPOLATION_COEFFICIENTS_SUM"] = coefficients.str();
+    replacements["USE_EWALD"] = (usePME ? "1" : "0");
+    replacements["PME_ALPHA"] = (usePME ? cu.doubleToString(pmeAlpha) : "0");
+    replacements["DPME_ALPHA"] = (usePME ? cu.doubleToString(dpmeAlpha) : "0");
+    replacements["SQRT_PI"] = cu.doubleToString(sqrt(M_PI));
+    string interactionSource = cu.replaceStrings(CudaAmoebaKernelSources::hippoInteraction, replacements);
+    nb.addInteraction(usePME, usePME, true, force.getCutoffDistance(), exclusions, interactionSource, force.getForceGroup());
+    nb.setUsePadding(false);
+    // Create the kernel for computing exceptions.
+    if (exceptionAtoms.isInitialized()) {
+        replacements["COMPUTE_INTERACTION"] = interactionSource;
+        string exceptionsSrc = CudaKernelSources::vectorOps+CudaAmoebaKernelSources::hippoInteractionHeader+CudaAmoebaKernelSources::hippoNonbondedExceptions;
+        exceptionsSrc = cu.replaceStrings(exceptionsSrc, replacements);
+        defines["NUM_EXCEPTIONS"] = cu.intToString(exceptionAtoms.getSize());
+        module = cu.createModule(exceptionsSrc, defines);
+        computeExceptionsKernel = cu.getKernel(module, "computeNonbondedExceptions");
+    }
+    cu.addForce(new ForceInfo(force));
+    cu.addPostComputation(new TorquePostComputation(*this));
+}
+void CudaCalcHippoNonbondedForceKernel::createFieldKernel(const string& interactionSrc, vector<CudaArray*> params,
+            CudaArray& fieldBuffer, CUfunction& kernel, vector<void*>& args, CUfunction& exceptionKernel, vector<void*>& exceptionArgs,
+            CudaArray& exceptionScale) {
+    // Create the kernel source.
+    map<string, string> replacements;
+    replacements["COMPUTE_FIELD"] = interactionSrc;
+    stringstream extraArgs, atomParams, loadLocal1, loadLocal2, load1, load2, load3;
+    for (auto param : params) {
+        string name = param->getName();
+        string type = (param->getElementSize() == 4 || param->getElementSize() == 8 ? "real" : "real3");
+        extraArgs << ", const " << type << "* __restrict__ " << name;
+        atomParams << type << " " << name << ";\n";
+        loadLocal1 << "localData[localAtomIndex]." << name << " = " << name << "1;\n";
+        loadLocal2 << "localData[localAtomIndex]." << name << " = " << name << "[j];\n";
+        load1 << type << " " << name << "1 = " << name << "[atom1];\n";
+        load2 << type << " " << name << "2 = localData[atom2]." << name << ";\n";
+        load3 << type << " " << name << "2 = " << name << "[atom2];\n";
+    }
+    replacements["PARAMETER_ARGUMENTS"] = extraArgs.str();
+    replacements["ATOM_PARAMETER_DATA"] = atomParams.str();
+    replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
+    replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
+    replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
+    replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
+    replacements["LOAD_ATOM2_PARAMETERS_FROM_GLOBAL"] = load3.str();
+    string src = cu.replaceStrings(CudaAmoebaKernelSources::hippoComputeField, replacements);
+    // Set defines and create the kernel.
+    map<string, string> defines;
+    if (usePME) {
+        defines["USE_CUTOFF"] = "1";
+        defines["USE_PERIODIC"] = "1";
+        defines["USE_EWALD"] = "1";
+        defines["PME_ALPHA"] = cu.doubleToString(pmeAlpha);
+        defines["SQRT_PI"] = cu.doubleToString(sqrt(M_PI));
+    }
+    defines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
+    defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
+    defines["CUTOFF"] = cu.doubleToString(cutoff);
+    defines["CUTOFF_SQUARED"] = cu.doubleToString(cutoff*cutoff);
+    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+    defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
+    defines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
+    defines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(cu.getNonbondedUtilities().getExclusionTiles().getSize());
+    defines["NUM_EXCEPTIONS"] = cu.intToString(exceptionAtoms.isInitialized() ? exceptionAtoms.getSize() : 0);
+    CUmodule module = cu.createModule(CudaKernelSources::vectorOps+src, defines);
+    kernel = cu.getKernel(module, "computeField");
+    // Build the list of arguments.
+    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
+    args.push_back(&cu.getPosq().getDevicePointer());
+    args.push_back(&cu.getNonbondedUtilities().getExclusions().getDevicePointer());
+    args.push_back(&cu.getNonbondedUtilities().getExclusionTiles().getDevicePointer());
+    args.push_back(&fieldBuffer.getDevicePointer());
+    if (nb.getUseCutoff()) {
+        args.push_back(&nb.getInteractingTiles().getDevicePointer());
+        args.push_back(&nb.getInteractionCount().getDevicePointer());
+        args.push_back(cu.getPeriodicBoxSizePointer());
+        args.push_back(cu.getInvPeriodicBoxSizePointer());
+        args.push_back(cu.getPeriodicBoxVecXPointer());
+        args.push_back(cu.getPeriodicBoxVecYPointer());
+        args.push_back(cu.getPeriodicBoxVecZPointer());
+        args.push_back(&maxTiles);
+        args.push_back(&nb.getBlockCenters().getDevicePointer());
+        args.push_back(&nb.getBlockBoundingBoxes().getDevicePointer());
+        args.push_back(&nb.getInteractingAtoms().getDevicePointer());
+    }
+    else
+        args.push_back(&maxTiles);
+    for (auto param : params)
+        args.push_back(&param->getDevicePointer());
+    // If there are any exceptions, build the kernel and arguments to compute them.
+    if (exceptionAtoms.isInitialized()) {
+        exceptionKernel = cu.getKernel(module, "computeFieldExceptions");
+        exceptionArgs.push_back(&cu.getPosq().getDevicePointer());
+        exceptionArgs.push_back(&fieldBuffer.getDevicePointer());
+        exceptionArgs.push_back(&exceptionAtoms.getDevicePointer());
+        exceptionArgs.push_back(&exceptionScale.getDevicePointer());
+        if (nb.getUseCutoff()) {
+            exceptionArgs.push_back(cu.getPeriodicBoxSizePointer());
+            exceptionArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+            exceptionArgs.push_back(cu.getPeriodicBoxVecXPointer());
+            exceptionArgs.push_back(cu.getPeriodicBoxVecYPointer());
+            exceptionArgs.push_back(cu.getPeriodicBoxVecZPointer());
+        }
+        for (auto param : params)
+            exceptionArgs.push_back(&param->getDevicePointer());
+    }
+}
+double CudaCalcHippoNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
+    if (!hasInitializedKernels) {
+        hasInitializedKernels = true;
+        // These kernels can't be compiled in initialize(), because the nonbonded utilities object
+        // has not yet been initialized then.
+        maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
+        createFieldKernel(CudaAmoebaKernelSources::hippoFixedField, {&coreCharge, &valenceCharge, &alpha, &labDipoles, &labQuadrupoles[0],
+                &labQuadrupoles[1], &labQuadrupoles[2], &labQuadrupoles[3], &labQuadrupoles[4]}, field, fixedFieldKernel, fixedFieldArgs,
+                fixedFieldExceptionKernel, fixedFieldExceptionArgs, exceptionScales[1]);
+        createFieldKernel(CudaAmoebaKernelSources::hippoMutualField, {&alpha, &inducedDipole}, inducedField, mutualFieldKernel, mutualFieldArgs,
+                mutualFieldExceptionKernel, mutualFieldExceptionArgs, exceptionScales[2]);
+        if (exceptionAtoms.isInitialized()) {
+            computeExceptionsArgs.push_back(&cu.getForce().getDevicePointer());
+            computeExceptionsArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
+            computeExceptionsArgs.push_back(&torque.getDevicePointer());
+            computeExceptionsArgs.push_back(&cu.getPosq().getDevicePointer());
+            computeExceptionsArgs.push_back(&extrapolatedDipole.getDevicePointer());
+            computeExceptionsArgs.push_back(&exceptionAtoms.getDevicePointer());
+            computeExceptionsArgs.push_back(&exceptionScales[0].getDevicePointer());
+            computeExceptionsArgs.push_back(&exceptionScales[1].getDevicePointer());
+            computeExceptionsArgs.push_back(&exceptionScales[2].getDevicePointer());
+            computeExceptionsArgs.push_back(&exceptionScales[3].getDevicePointer());
+            computeExceptionsArgs.push_back(&exceptionScales[4].getDevicePointer());
+            computeExceptionsArgs.push_back(&coreCharge.getDevicePointer());
+            computeExceptionsArgs.push_back(&valenceCharge.getDevicePointer());
+            computeExceptionsArgs.push_back(&alpha.getDevicePointer());
+            computeExceptionsArgs.push_back(&epsilon.getDevicePointer());
+            computeExceptionsArgs.push_back(&damping.getDevicePointer());
+            computeExceptionsArgs.push_back(&c6.getDevicePointer());
+            computeExceptionsArgs.push_back(&pauliK.getDevicePointer());
+            computeExceptionsArgs.push_back(&pauliQ.getDevicePointer());
+            computeExceptionsArgs.push_back(&pauliAlpha.getDevicePointer());
+            computeExceptionsArgs.push_back(&labDipoles.getDevicePointer());
+            computeExceptionsArgs.push_back(&inducedDipole.getDevicePointer());
+            computeExceptionsArgs.push_back(&labQuadrupoles[0].getDevicePointer());
+            computeExceptionsArgs.push_back(&labQuadrupoles[1].getDevicePointer());
+            computeExceptionsArgs.push_back(&labQuadrupoles[2].getDevicePointer());
+            computeExceptionsArgs.push_back(&labQuadrupoles[3].getDevicePointer());
+            computeExceptionsArgs.push_back(&labQuadrupoles[4].getDevicePointer());
+            computeExceptionsArgs.push_back(&extrapolatedDipole.getDevicePointer());
+            if (nb.getUseCutoff()) {
+                computeExceptionsArgs.push_back(cu.getPeriodicBoxSizePointer());
+                computeExceptionsArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+                computeExceptionsArgs.push_back(cu.getPeriodicBoxVecXPointer());
+                computeExceptionsArgs.push_back(cu.getPeriodicBoxVecYPointer());
+                computeExceptionsArgs.push_back(cu.getPeriodicBoxVecZPointer());
+            }
+        }
+    }
+    // Make sure the arrays for the neighbor list haven't been recreated.
+    if (nb.getUseCutoff()) {
+        if (maxTiles < nb.getInteractingTiles().getSize()) {
+            maxTiles = nb.getInteractingTiles().getSize();
+            fixedFieldArgs[4] = &nb.getInteractingTiles().getDevicePointer();
+            fixedFieldArgs[14] = &nb.getInteractingAtoms().getDevicePointer();
+            mutualFieldArgs[4] = &nb.getInteractingTiles().getDevicePointer();
+            mutualFieldArgs[14] = &nb.getInteractingAtoms().getDevicePointer();
+        }
+    }
+    // Compute the lab frame moments.
+    void* computeMomentsArgs[] = {&cu.getPosq().getDevicePointer(), &multipoleParticles.getDevicePointer(),
+        &localDipoles.getDevicePointer(), &localQuadrupoles.getDevicePointer(),
+        &labDipoles.getDevicePointer(), &labQuadrupoles[0].getDevicePointer(),
+        &labQuadrupoles[1].getDevicePointer(), &labQuadrupoles[2].getDevicePointer(),
+        &labQuadrupoles[3].getDevicePointer(), &labQuadrupoles[4].getDevicePointer()};
+    cu.executeKernel(computeMomentsKernel, computeMomentsArgs, cu.getNumAtoms());
+    void* recipBoxVectorPointer[3];
+    if (usePME) {
+        // Compute reciprocal box vectors.
+        Vec3 boxVectors[3];
+        cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+        double determinant = boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2];
+        double scale = 1.0/determinant;
+        double3 recipBoxVectors[3];
+        recipBoxVectors[0] = make_double3(boxVectors[1][1]*boxVectors[2][2]*scale, 0, 0);
+        recipBoxVectors[1] = make_double3(-boxVectors[1][0]*boxVectors[2][2]*scale, boxVectors[0][0]*boxVectors[2][2]*scale, 0);
+        recipBoxVectors[2] = make_double3((boxVectors[1][0]*boxVectors[2][1]-boxVectors[1][1]*boxVectors[2][0])*scale, -boxVectors[0][0]*boxVectors[2][1]*scale, boxVectors[0][0]*boxVectors[1][1]*scale);
+        float3 recipBoxVectorsFloat[3];
+        if (cu.getUseDoublePrecision()) {
+            recipBoxVectorPointer[0] = &recipBoxVectors[0];
+            recipBoxVectorPointer[1] = &recipBoxVectors[1];
+            recipBoxVectorPointer[2] = &recipBoxVectors[2];
+        }
+        else {
+            recipBoxVectorsFloat[0] = make_float3((float) recipBoxVectors[0].x, 0, 0);
+            recipBoxVectorsFloat[1] = make_float3((float) recipBoxVectors[1].x, (float) recipBoxVectors[1].y, 0);
+            recipBoxVectorsFloat[2] = make_float3((float) recipBoxVectors[2].x, (float) recipBoxVectors[2].y, (float) recipBoxVectors[2].z);
+            recipBoxVectorPointer[0] = &recipBoxVectorsFloat[0];
+            recipBoxVectorPointer[1] = &recipBoxVectorsFloat[1];
+            recipBoxVectorPointer[2] = &recipBoxVectorsFloat[2];
+        }
+        // Reciprocal space calculation for electrostatics.
+        void* pmeTransformMultipolesArgs[] = {&labDipoles.getDevicePointer(), &labQuadrupoles[0].getDevicePointer(),
+            &labQuadrupoles[1].getDevicePointer(), &labQuadrupoles[2].getDevicePointer(),
+            &labQuadrupoles[3].getDevicePointer(), &labQuadrupoles[4].getDevicePointer(),
+            &fracDipoles.getDevicePointer(), &fracQuadrupoles.getDevicePointer(),
+            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(pmeTransformMultipolesKernel, pmeTransformMultipolesArgs, cu.getNumAtoms());
+        void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &fracDipoles.getDevicePointer(), &fracQuadrupoles.getDevicePointer(),
+            &pmeGrid1.getDevicePointer(), &coreCharge.getDevicePointer(), &valenceCharge.getDevicePointer(),
+            cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms());
+        if (cu.getUseDoublePrecision()) {
+            void* finishSpreadArgs[] = {&pmeGrid1.getDevicePointer()};
+            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid1.getSize());
+            cufftExecD2Z(fftForward, (double*) pmeGrid1.getDevicePointer(), (double2*) pmeGrid2.getDevicePointer());
+        }
+        else
+            cufftExecR2C(fftForward, (float*) pmeGrid1.getDevicePointer(), (float2*) pmeGrid2.getDevicePointer());
+        void* pmeConvolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(),
+            &pmeBsplineModuliZ.getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
+        if (cu.getUseDoublePrecision())
+            cufftExecZ2D(fftBackward, (double2*) pmeGrid2.getDevicePointer(), (double*) pmeGrid1.getDevicePointer());
+        else
+            cufftExecC2R(fftBackward, (float2*) pmeGrid2.getDevicePointer(), (float*) pmeGrid1.getDevicePointer());
+        void* pmeFixedPotentialArgs[] = {&pmeGrid1.getDevicePointer(), &pmePhi.getDevicePointer(), &field.getDevicePointer(),
+            &cu.getPosq().getDevicePointer(), &labDipoles.getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(),
+            cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms());
+        void* pmeTransformFixedPotentialArgs[] = {&pmePhi.getDevicePointer(), &pmeCphi.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(pmeTransformPotentialKernel, pmeTransformFixedPotentialArgs, cu.getNumAtoms());
+        void* pmeFixedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque.getDevicePointer(),
+            &cu.getEnergyBuffer().getDevicePointer(), &labDipoles.getDevicePointer(), &coreCharge.getDevicePointer(),
+            &valenceCharge.getDevicePointer(), &labQuadrupoles[0].getDevicePointer(),
+            &labQuadrupoles[1].getDevicePointer(), &labQuadrupoles[2].getDevicePointer(),
+            &labQuadrupoles[3].getDevicePointer(), &labQuadrupoles[4].getDevicePointer(),
+            &fracDipoles.getDevicePointer(), &fracQuadrupoles.getDevicePointer(), &pmePhi.getDevicePointer(), &pmeCphi.getDevicePointer(),
+            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(pmeFixedForceKernel, pmeFixedForceArgs, cu.getNumAtoms());
+        // Reciprocal space calculation for dispersion.
+        void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(dpmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
+        sort->sort(pmeAtomGridIndex);
+        cu.clearBuffer(pmeGrid2);
+        void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
+                &c6.getDevicePointer()};
+        cu.executeKernel(dpmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
+        void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
+        cu.executeKernel(dpmeFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
+        if (cu.getUseDoublePrecision())
+            cufftExecD2Z(dfftForward, (double*) pmeGrid1.getDevicePointer(), (double2*) pmeGrid2.getDevicePointer());
+        else
+            cufftExecR2C(dfftForward, (float*) pmeGrid1.getDevicePointer(), (float2*) pmeGrid2.getDevicePointer());
+        if (includeEnergy) {
+            void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
+                    &dpmeBsplineModuliX.getDevicePointer(), &dpmeBsplineModuliY.getDevicePointer(), &dpmeBsplineModuliZ.getDevicePointer(),
+                    cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+            cu.executeKernel(dpmeEvalEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
+        }
+        void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
+                &dpmeBsplineModuliX.getDevicePointer(), &dpmeBsplineModuliY.getDevicePointer(), &dpmeBsplineModuliZ.getDevicePointer(),
+                cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(dpmeConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
+        if (cu.getUseDoublePrecision())
+            cufftExecZ2D(dfftBackward, (double2*) pmeGrid2.getDevicePointer(), (double*) pmeGrid1.getDevicePointer());
+        else
+            cufftExecC2R(dfftBackward, (float2*) pmeGrid2.getDevicePointer(), (float*)  pmeGrid1.getDevicePointer());
+        void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
+                &c6.getDevicePointer()};
+        cu.executeKernel(dpmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
+    }
+    // Compute the field from fixed multipoles.
+    cu.executeKernel(fixedFieldKernel, &fixedFieldArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
+    if (fixedFieldExceptionArgs.size() > 0)
+        cu.executeKernel(fixedFieldExceptionKernel, &fixedFieldExceptionArgs[0], exceptionAtoms.getSize());
+    // Iterate the induced dipoles.
+    computeExtrapolatedDipoles(recipBoxVectorPointer);
+    // Add the polarization energy.
+    if (includeEnergy) {
+        void* polarizationEnergyArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &inducedDipole.getDevicePointer(),
+            &extrapolatedDipole.getDevicePointer(), &polarizability.getDevicePointer()};
+        cu.executeKernel(polarizationEnergyKernel, polarizationEnergyArgs, cu.getNumAtoms());
+    }
+    // Compute the forces due to the reciprocal space PME calculation for induced dipoles.
+    if (usePME) {
+        void* pmeTransformInducedPotentialArgs[] = {&pmePhidp.getDevicePointer(), &pmeCphi.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(pmeTransformPotentialKernel, pmeTransformInducedPotentialArgs, cu.getNumAtoms());
+        void* pmeInducedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque.getDevicePointer(),
+            &cu.getEnergyBuffer().getDevicePointer(), &labDipoles.getDevicePointer(), &coreCharge.getDevicePointer(),
+            &valenceCharge.getDevicePointer(), &extrapolatedDipole.getDevicePointer(), &extrapolatedPhi.getDevicePointer(),
+            &labQuadrupoles[0].getDevicePointer(), &labQuadrupoles[1].getDevicePointer(), &labQuadrupoles[2].getDevicePointer(),
+            &labQuadrupoles[3].getDevicePointer(), &labQuadrupoles[4].getDevicePointer(), &fracDipoles.getDevicePointer(),
+            &fracQuadrupoles.getDevicePointer(), &inducedDipole.getDevicePointer(), &pmePhi.getDevicePointer(), &pmePhidp.getDevicePointer(),
+            &pmeCphi.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(pmeInducedForceKernel, pmeInducedForceArgs, cu.getNumAtoms());
+        void* pmeSelfEnergyArgs[] = {&torque.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
+            &labDipoles.getDevicePointer(), &coreCharge.getDevicePointer(), &valenceCharge.getDevicePointer(), &c6.getDevicePointer(),
+            &inducedDipole.getDevicePointer(), &labQuadrupoles[0].getDevicePointer(), &labQuadrupoles[1].getDevicePointer(),
+            &labQuadrupoles[2].getDevicePointer(), &labQuadrupoles[3].getDevicePointer(), &labQuadrupoles[4].getDevicePointer()};
+        cu.executeKernel(pmeSelfEnergyKernel, pmeSelfEnergyArgs, cu.getNumAtoms());
+    }
+    // Compute nonbonded exceptions.
+    if (exceptionAtoms.isInitialized())
+        cu.executeKernel(computeExceptionsKernel, &computeExceptionsArgs[0], exceptionAtoms.getSize());
+    // Record the current atom positions so we can tell later if they have changed.
+    cu.getPosq().copyTo(lastPositions);
+    multipolesAreValid = true;
+    return 0.0;
+}
+void CudaCalcHippoNonbondedForceKernel::computeInducedField(void** recipBoxVectorPointer, int optOrder) {
+    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
+    cu.clearBuffer(inducedField);
+    cu.executeKernel(mutualFieldKernel, &mutualFieldArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
+    if (mutualFieldExceptionArgs.size() > 0)
+        cu.executeKernel(mutualFieldExceptionKernel, &mutualFieldExceptionArgs[0], exceptionAtoms.getSize());
+    if (usePME) {
+        cu.clearBuffer(pmeGrid1);
+        void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole.getDevicePointer(),
+            &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
+        if (cu.getUseDoublePrecision()) {
+            void* finishSpreadArgs[] = {&pmeGrid1.getDevicePointer()};
+            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid1.getSize());
+            cufftExecD2Z(fftForward, (double*) pmeGrid1.getDevicePointer(), (double2*) pmeGrid2.getDevicePointer());
+        }
+        else
+            cufftExecR2C(fftForward, (float*) pmeGrid1.getDevicePointer(), (float2*) pmeGrid2.getDevicePointer());
+        void* pmeConvolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(),
+            &pmeBsplineModuliZ.getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
+        if (cu.getUseDoublePrecision())
+            cufftExecZ2D(fftBackward, (double2*) pmeGrid2.getDevicePointer(), (double*) pmeGrid1.getDevicePointer());
+        else
+            cufftExecC2R(fftBackward, (float2*) pmeGrid2.getDevicePointer(), (float*) pmeGrid1.getDevicePointer());
+        void* pmeInducedPotentialArgs[] = {&pmeGrid1.getDevicePointer(), &extrapolatedPhi.getDevicePointer(), &optOrder,
+            &pmePhidp.getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(),
+            cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
+        void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhidp.getDevicePointer(), &inducedField.getDevicePointer(),
+            &inducedDipole.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+        cu.executeKernel(pmeRecordInducedFieldDipolesKernel, pmeRecordInducedFieldDipolesArgs, cu.getNumAtoms());
+    }
+}
+void CudaCalcHippoNonbondedForceKernel::computeExtrapolatedDipoles(void** recipBoxVectorPointer) {
+    // Start by storing the direct dipoles as PT0
+    void* recordInducedDipolesArgs[] = {&field.getDevicePointer(), &inducedDipole.getDevicePointer(), &polarizability.getDevicePointer()};
+    cu.executeKernel(recordInducedDipolesKernel, recordInducedDipolesArgs, cu.getNumAtoms());
+    void* initArgs[] = {&inducedDipole.getDevicePointer(), &extrapolatedDipole.getDevicePointer()};
+    cu.executeKernel(initExtrapolatedKernel, initArgs, extrapolatedDipole.getSize());
+    // Recursively apply alpha.Tau to the µ_(n) components to generate µ_(n+1), and store the result
+    for (int order = 1; order < maxExtrapolationOrder; ++order) {
+        computeInducedField(recipBoxVectorPointer, order-1);
+        void* iterateArgs[] = {&order, &inducedDipole.getDevicePointer(), &extrapolatedDipole.getDevicePointer(), &inducedField.getDevicePointer(), &polarizability.getDevicePointer()};
+        cu.executeKernel(iterateExtrapolatedKernel, iterateArgs, extrapolatedDipole.getSize());
+    }
+    // Take a linear combination of the µ_(n) components to form the total dipole
+    void* computeArgs[] = {&inducedDipole.getDevicePointer(), &extrapolatedDipole.getDevicePointer()};
+    cu.executeKernel(computeExtrapolatedKernel, computeArgs, extrapolatedDipole.getSize());
+    computeInducedField(recipBoxVectorPointer, maxExtrapolationOrder-1);
+}
+void CudaCalcHippoNonbondedForceKernel::addTorquesToForces() {
+    void* mapTorqueArgs[] = {&cu.getForce().getDevicePointer(), &torque.getDevicePointer(),
+        &cu.getPosq().getDevicePointer(), &multipoleParticles.getDevicePointer()};
+    cu.executeKernel(mapTorqueKernel, mapTorqueArgs, cu.getNumAtoms());
+}
+void CudaCalcHippoNonbondedForceKernel::getInducedDipoles(ContextImpl& context, vector<Vec3>& dipoles) {
+    ensureMultipolesValid(context);
+    int numParticles = cu.getNumAtoms();
+    dipoles.resize(numParticles);
+    const vector<int>& order = cu.getAtomIndex();
+    if (cu.getUseDoublePrecision()) {
+        vector<double3> d;
+        inducedDipole.download(d);
+        for (int i = 0; i < numParticles; i++)
+            dipoles[order[i]] = Vec3(d[i].x, d[i].y, d[i].z);
+    }
+    else {
+        vector<float3> d;
+        inducedDipole.download(d);
+        for (int i = 0; i < numParticles; i++)
+            dipoles[order[i]] = Vec3(d[i].x, d[i].y, d[i].z);
+    }
+}
+void CudaCalcHippoNonbondedForceKernel::ensureMultipolesValid(ContextImpl& context) {
+    if (multipolesAreValid) {
+        int numParticles = cu.getNumAtoms();
+        if (cu.getUseDoublePrecision()) {
+            vector<double4> pos1, pos2;
+            cu.getPosq().download(pos1);
+            lastPositions.download(pos2);
+            for (int i = 0; i < numParticles; i++)
+                if (pos1[i].x != pos2[i].x || pos1[i].y != pos2[i].y || pos1[i].z != pos2[i].z) {
+                    multipolesAreValid = false;
+                    break;
+                }
+        }
+        else {
+            vector<float4> pos1, pos2;
+            cu.getPosq().download(pos1);
+            lastPositions.download(pos2);
+            for (int i = 0; i < numParticles; i++)
+                if (pos1[i].x != pos2[i].x || pos1[i].y != pos2[i].y || pos1[i].z != pos2[i].z) {
+                    multipolesAreValid = false;
+                    break;
+                }
+        }
+    }
+    if (!multipolesAreValid)
+        context.calcForcesAndEnergy(false, false, -1);
+}
+void CudaCalcHippoNonbondedForceKernel::getLabFramePermanentDipoles(ContextImpl& context, vector<Vec3>& dipoles) {
+    ensureMultipolesValid(context);
+    int numParticles = cu.getNumAtoms();
+    dipoles.resize(numParticles);
+    const vector<int>& order = cu.getAtomIndex();
+    if (cu.getUseDoublePrecision()) {
+        vector<double3> labDipoleVec;
+        labDipoles.download(labDipoleVec);
+        for (int i = 0; i < numParticles; i++)
+            dipoles[order[i]] = Vec3(labDipoleVec[i].x, labDipoleVec[i].y, labDipoleVec[i].z);
+    }
+    else {
+        vector<float3> labDipoleVec;
+        labDipoles.download(labDipoleVec);
+        for (int i = 0; i < numParticles; i++)
+            dipoles[order[i]] = Vec3(labDipoleVec[i].x, labDipoleVec[i].y, labDipoleVec[i].z);
+    }
+}
+void CudaCalcHippoNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const HippoNonbondedForce& force) {
+    // Make sure the new parameters are acceptable.
+    cu.setAsCurrent();
+    if (force.getNumParticles() != cu.getNumAtoms())
+        throw OpenMMException("updateParametersInContext: The number of particles has changed");
+    // Record the per-particle parameters.
+    vector<double> coreChargeVec, valenceChargeVec, alphaVec, epsilonVec, dampingVec, c6Vec, pauliKVec, pauliQVec, pauliAlphaVec, polarizabilityVec;
+    vector<double> localDipolesVec, localQuadrupolesVec;
+    vector<int4> multipoleParticlesVec;
+    for (int i = 0; i < numParticles; i++) {
+        double charge, coreCharge, alpha, epsilon, damping, c6, pauliK, pauliQ, pauliAlpha, polarizability;
+        int axisType, atomX, atomY, atomZ;
+        vector<double> dipole, quadrupole;
+        force.getParticleParameters(i, charge, dipole, quadrupole, coreCharge, alpha, epsilon, damping, c6, pauliK, pauliQ, pauliAlpha,
+                                    polarizability, axisType, atomZ, atomX, atomY);
+        coreChargeVec.push_back(coreCharge);
+        valenceChargeVec.push_back(charge-coreCharge);
+        alphaVec.push_back(alpha);
+        epsilonVec.push_back(epsilon);
+        dampingVec.push_back(damping);
+        c6Vec.push_back(c6);
+        pauliKVec.push_back(pauliK);
+        pauliQVec.push_back(pauliQ);
+        pauliAlphaVec.push_back(pauliAlpha);
+        polarizabilityVec.push_back(polarizability);
+        multipoleParticlesVec.push_back(make_int4(atomX, atomY, atomZ, axisType));
+        for (int j = 0; j < 3; j++)
+            localDipolesVec.push_back(dipole[j]);
+        localQuadrupolesVec.push_back(quadrupole[0]);
+        localQuadrupolesVec.push_back(quadrupole[1]);
+        localQuadrupolesVec.push_back(quadrupole[2]);
+        localQuadrupolesVec.push_back(quadrupole[4]);
+        localQuadrupolesVec.push_back(quadrupole[5]);
+    }
+    int paddedNumAtoms = cu.getPaddedNumAtoms();
+    for (int i = numParticles; i < paddedNumAtoms; i++) {
+        coreChargeVec.push_back(0);
+        valenceChargeVec.push_back(0);
+        alphaVec.push_back(0);
+        epsilonVec.push_back(0);
+        dampingVec.push_back(0);
+        c6Vec.push_back(0);
+        pauliKVec.push_back(0);
+        pauliQVec.push_back(0);
+        pauliAlphaVec.push_back(0);
+        polarizabilityVec.push_back(0);
+        multipoleParticlesVec.push_back(make_int4(0, 0, 0, 0));
+        for (int j = 0; j < 3; j++)
+            localDipolesVec.push_back(0);
+        for (int j = 0; j < 5; j++)
+            localQuadrupolesVec.push_back(0);
+    }
+    coreCharge.upload(coreChargeVec, true);
+    valenceCharge.upload(valenceChargeVec, true);
+    alpha.upload(alphaVec, true);
+    epsilon.upload(epsilonVec, true);
+    damping.upload(dampingVec, true);
+    c6.upload(c6Vec, true);
+    pauliK.upload(pauliKVec, true);
+    pauliQ.upload(pauliQVec, true);
+    pauliAlpha.upload(pauliAlphaVec, true);
+    polarizability.upload(polarizabilityVec, true);
+    multipoleParticles.upload(multipoleParticlesVec);
+    localDipoles.upload(localDipolesVec, true);
+    localQuadrupoles.upload(localQuadrupolesVec, true);
+    // Record the per-exception parameters.
+    vector<double> exceptionScaleVec[5];
+    vector<int2> exceptionAtomsVec;
+    for (int i = 0; i < force.getNumExceptions(); i++) {
+        int particle1, particle2;
+        double multipoleMultipoleScale, dipoleMultipoleScale, dipoleDipoleScale, dispersionScale, repulsionScale;
+        force.getExceptionParameters(i, particle1, particle2, multipoleMultipoleScale, dipoleMultipoleScale, dipoleDipoleScale, dispersionScale, repulsionScale);
+        if (usePME || multipoleMultipoleScale != 0 || dipoleMultipoleScale != 0 || dipoleDipoleScale != 0 || dispersionScale != 0 || repulsionScale != 0) {
+            exceptionAtomsVec.push_back(make_int2(particle1, particle2));
+            exceptionScaleVec[0].push_back(multipoleMultipoleScale);
+            exceptionScaleVec[1].push_back(dipoleMultipoleScale);
+            exceptionScaleVec[2].push_back(dipoleDipoleScale);
+            exceptionScaleVec[3].push_back(dispersionScale);
+            exceptionScaleVec[4].push_back(repulsionScale);
+        }
+    }
+    if (exceptionAtomsVec.size() > 0) {
+        if (!exceptionAtoms.isInitialized() || exceptionAtoms.getSize() != exceptionAtomsVec.size())
+            throw OpenMMException("updateParametersInContext: The number of exceptions has changed");
+        exceptionAtoms.upload(exceptionAtomsVec);
+        for (int i = 0; i < 5; i++)
+            exceptionScales[i].upload(exceptionScaleVec[i], true);
+    }
+    else if (exceptionAtoms.isInitialized())
+        throw OpenMMException("updateParametersInContext: The number of exceptions has changed");
+    cu.invalidateMolecules();
+    multipolesAreValid = false;
+}
+void CudaCalcHippoNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    alpha = pmeAlpha;
+    nx = gridSizeX;
+    ny = gridSizeY;
+    nz = gridSizeZ;
+}
+void CudaCalcHippoNonbondedForceKernel::getDPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    alpha = dpmeAlpha;
+    nx = dispersionGridSizeX;
+    ny = dispersionGridSizeY;
+    nz = dispersionGridSizeZ;
+}
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -32,6 +32,7 @@
 #include "openmm/System.h"
 #include "CudaArray.h"
 #include "CudaContext.h"
+#include "CudaNonbondedUtilities.h"
 #include "CudaSort.h"
 #include <cufft.h>
@@ -381,16 +382,6 @@ public:
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class ForceInfo;
-    class SortTrait : public CudaSort::SortTrait {
-        int getDataSize() const {return 8;}
-        int getKeySize() const {return 4;}
-        const char* getDataType() const {return "int2";}
-        const char* getKeyType() const {return "int";}
-        const char* getMinKey() const {return "(-2147483647 - 1)";}
-        const char* getMaxKey() const {return "2147483647";}
-        const char* getMaxValue() const {return "make_int2(2147483647, 2147483647)";}
-        const char* getSortKey() const {return "value.y";}
-    };
    void initializeScaleFactors();
    void computeInducedField(void** recipBoxVectorPointer);
    bool iterateDipolesByDIIS(int iteration);
@@ -451,14 +442,12 @@ private:
    CudaArray pmeBsplineModuliX;
    CudaArray pmeBsplineModuliY;
    CudaArray pmeBsplineModuliZ;
-    CudaArray pmeIgrid;
    CudaArray pmePhi;
    CudaArray pmePhid;
    CudaArray pmePhip;
    CudaArray pmePhidp;
    CudaArray pmeCphi;
    CudaArray lastPositions;
-    CudaSort* sort;
    cufftHandle fft;
    CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel;
    CUfunction pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
@@ -630,6 +619,136 @@ private:
    CUfunction forceKernel;
 };
+/**
+ * This kernel is invoked by HippoNonbondedForce to calculate the forces acting on the system and the energy of the system.
+ */
+class CudaCalcHippoNonbondedForceKernel : public CalcHippoNonbondedForceKernel {
+public:
+    CudaCalcHippoNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system);
+    ~CudaCalcHippoNonbondedForceKernel();
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param force      the HippoNonbondedForce this kernel will be used for
+     */
+    void initialize(const System& system, const HippoNonbondedForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Get the induced dipole moments of all particles.
+     * 
+     * @param context    the Context for which to get the induced dipoles
+     * @param dipoles    the induced dipole moment of particle i is stored into the i'th element
+     */
+    void getInducedDipoles(ContextImpl& context, std::vector<Vec3>& dipoles);
+    /**
+     * Get the fixed dipole moments of all particles in the global reference frame.
+     * 
+     * @param context    the Context for which to get the fixed dipoles
+     * @param dipoles    the fixed dipole moment of particle i is stored into the i'th element
+     */
+    void getLabFramePermanentDipoles(ContextImpl& context, std::vector<Vec3>& dipoles);
+    /** 
+     * Calculate the electrostatic potential given vector of grid coordinates.
+     *
+     * @param context                      context
+     * @param inputGrid                    input grid coordinates
+     * @param outputElectrostaticPotential output potential 
+     */
+    void getElectrostaticPotential(ContextImpl& context, const std::vector< Vec3 >& inputGrid,
+                                   std::vector< double >& outputElectrostaticPotential);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the HippoNonbondedForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const HippoNonbondedForce& force);
+    /**
+     * Get the parameters being used for PME.
+     * 
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters being used for dispersion PME.
+     * 
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getDPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+private:
+    class ForceInfo;
+    class TorquePostComputation;
+    class SortTrait : public CudaSort::SortTrait {
+        int getDataSize() const {return 8;}
+        int getKeySize() const {return 4;}
+        const char* getDataType() const {return "int2";}
+        const char* getKeyType() const {return "int";}
+        const char* getMinKey() const {return "(-2147483647-1)";}
+        const char* getMaxKey() const {return "2147483647";}
+        const char* getMaxValue() const {return "make_int2(2147483647, 2147483647)";}
+        const char* getSortKey() const {return "value.y";}
+    };
+    void computeInducedField(void** recipBoxVectorPointer, int optOrder);
+    void computeExtrapolatedDipoles(void** recipBoxVectorPointer);
+    void ensureMultipolesValid(ContextImpl& context);
+    void addTorquesToForces();
+    void createFieldKernel(const std::string& interactionSrc, std::vector<CudaArray*> params, CudaArray& fieldBuffer,
+        CUfunction& kernel, std::vector<void*>& args, CUfunction& exceptionKernel, std::vector<void*>& exceptionArgs,
+        CudaArray& exceptionScale);
+    int numParticles, maxExtrapolationOrder, maxTiles;
+    int gridSizeX, gridSizeY, gridSizeZ;
+    int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
+    double pmeAlpha, dpmeAlpha, cutoff;
+    bool usePME, hasInitializedKernels, hasInitializedFFT, multipolesAreValid;
+    std::vector<double> extrapolationCoefficients;
+    CudaContext& cu;
+    const System& system;
+    CudaArray multipoleParticles;
+    CudaArray coreCharge, valenceCharge, alpha, epsilon, damping, c6, pauliK, pauliQ, pauliAlpha, polarizability;
+    CudaArray localDipoles, labDipoles, fracDipoles;
+    CudaArray localQuadrupoles, labQuadrupoles[5], fracQuadrupoles;
+    CudaArray field;
+    CudaArray inducedField;
+    CudaArray torque;
+    CudaArray inducedDipole;
+    CudaArray extrapolatedDipole, extrapolatedPhi;
+    CudaArray pmeGrid1, pmeGrid2;
+    CudaArray pmeAtomGridIndex;
+    CudaArray pmeBsplineModuliX, pmeBsplineModuliY, pmeBsplineModuliZ;
+    CudaArray dpmeBsplineModuliX, dpmeBsplineModuliY, dpmeBsplineModuliZ;
+    CudaArray pmePhi, pmePhidp, pmeCphi;
+    CudaArray lastPositions;
+    CudaArray exceptionScales[5];
+    CudaArray exceptionAtoms;
+    CudaSort* sort;
+    cufftHandle fftForward, fftBackward, dfftForward, dfftBackward;
+    CUfunction computeMomentsKernel, fixedFieldKernel, fixedFieldExceptionKernel, mutualFieldKernel, mutualFieldExceptionKernel, computeExceptionsKernel;
+    CUfunction recordInducedDipolesKernel, mapTorqueKernel;
+    CUfunction pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
+    CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel;
+    CUfunction pmeSelfEnergyKernel;
+    CUfunction dpmeGridIndexKernel, dpmeSpreadChargeKernel, dpmeFinishSpreadChargeKernel, dpmeEvalEnergyKernel, dpmeConvolutionKernel, dpmeInterpolateForceKernel;
+    CUfunction initExtrapolatedKernel, iterateExtrapolatedKernel, computeExtrapolatedKernel, polarizationEnergyKernel;
+    CUfunction pmeTransformMultipolesKernel, pmeTransformPotentialKernel;
+    std::vector<void*> fixedFieldArgs, fixedFieldExceptionArgs, mutualFieldArgs, mutualFieldExceptionArgs, computeExceptionsArgs;
+    static const int PmeOrder = 5;
+};
 } // namespace OpenMM
 #endif /*AMOEBA_OPENMM_CUDAKERNELS_H*/
--- a/plugins/amoeba/platforms/cuda/src/kernels/hippoComputeField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/hippoComputeField.cu
+__device__ void computeDirectFieldDampingFactors(real alpha, real r, real& fdamp3, real& fdamp5, real& fdamp7) {
+    real ar = alpha*r;
+    real ar2 = ar*ar;
+    real ar3 = ar2*ar;
+    real ar4 = ar2*ar2;
+    real expAR = EXP(-ar);
+    real one = 1;
+    fdamp3 = 1 - (1 + ar + ar2*(one/2))*expAR;
+    fdamp5 = 1 - (1 + ar + ar2*(one/2) + ar3*(one/6))*expAR;
+    fdamp7 = 1 - (1 + ar + ar2*(one/2) + ar3*(one/6) + ar4*(one/30))*expAR;
+}
+__device__ void computeMutualFieldDampingFactors(real alphaI, real alphaJ, real r, real& fdamp3, real& fdamp5) {
+    real arI = alphaI*r;
+    real arI2 = arI*arI;
+    real arI3 = arI2*arI;
+    real expARI = EXP(-arI);
+    real one = 1;
+    real seven = 7;
+    if (alphaI == alphaJ) {
+        real arI4 = arI3*arI;
+        real arI5 = arI4*arI;
+        fdamp3 = 1 - (1 + arI + arI2*(one/2) + arI3*(seven/48) + arI4*(one/48))*expARI;
+        fdamp5 = 1 - (1 + arI + arI2*(one/2) + arI3*(one/6) + arI4*(one/24) + arI5*(one/144))*expARI;
+    }
+    else {
+        real arJ = alphaJ*r;
+        real arJ2 = arJ*arJ;
+        real arJ3 = arJ2*arJ;
+        real expARJ = EXP(-arJ);
+        real aI2 = alphaI*alphaI;
+        real aJ2 = alphaJ*alphaJ;
+        real A = aJ2/(aJ2-aI2);
+        real B = aI2/(aI2-aJ2);
+        real A2 = A*A;
+        real B2 = B*B;
+        fdamp3 = 1 - A2*(1 + arI + arI2*(one/2))*expARI -
+                     B2*(1 + arJ + arJ2*(one/2))*expARJ -
+                     2*A2*B*(1 + arI)*expARI -
+                     2*B2*A*(1 + arJ)*expARJ;
+        fdamp5 = 1 - A2*(1 + arI + arI2*(one/2) + arI3*(one/6))*expARI -
+                     B2*(1 + arJ + arJ2*(one/2) + arJ3*(one/6))*expARJ -
+                     2*A2*B*(1 + arI + arI2*(one/3))*expARI -
+                     2*B2*A*(1 + arJ + arJ2*(one/3))*expARJ;
+    }
+}
+typedef struct {
+    real3 pos;
+    real3 field;
+    ATOM_PARAMETER_DATA
+} AtomData;
+/**
+ * Compute the electrostatic field.
+ */
+extern "C" __global__ void computeField(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
+        const ushort2* __restrict__ exclusionTiles, unsigned long long* __restrict__ fieldBuffers,
+#ifdef USE_CUTOFF
+        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
+        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
+#else
+        unsigned int numTiles
+#endif
+        PARAMETER_ARGUMENTS) {
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+    const unsigned int tbx = threadIdx.x - tgx;
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+    // First loop: process tiles that contain exclusions.
+    const unsigned int firstExclusionTile = warp*NUM_TILES_WITH_EXCLUSIONS/totalWarps;
+    const unsigned int lastExclusionTile = (warp+1)*NUM_TILES_WITH_EXCLUSIONS/totalWarps;
+    for (int tile = firstExclusionTile; tile < lastExclusionTile; tile++) {
+        const ushort2 tileIndices = exclusionTiles[tile];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
+        real3 field = make_real3(0);
+        unsigned int atom1 = x*TILE_SIZE + tgx;
+        real4 pos1 = posq[atom1];
+        LOAD_ATOM1_PARAMETERS
+        unsigned int excl = exclusions[tile*TILE_SIZE+tgx];
+        if (x == y) {
+            // This tile is on the diagonal.
+            const unsigned int localAtomIndex = threadIdx.x;
+            localData[localAtomIndex].pos = make_real3(pos1.x, pos1.y, pos1.z);
+            LOAD_LOCAL_PARAMETERS_FROM_1
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                int atom2 = tbx+j;
+                real3 pos2 = localData[atom2].pos;
+                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+#ifdef USE_PERIODIC
+                APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                if (r2 < CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = r2*invR;
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+j;
+                    real3 tempField1 = make_real3(0);
+                    real3 tempField2 = make_real3(0);
+                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
+                    if (!isExcluded && atom1 != atom2) {
+                        COMPUTE_FIELD
+                    }
+                    field += tempField1;
+#ifdef USE_CUTOFF
+                }
+#endif
+                excl >>= 1;
+            }
+        }
+        else {
+            // This is an off-diagonal tile.
+            const unsigned int localAtomIndex = threadIdx.x;
+            unsigned int j = y*TILE_SIZE + tgx;
+            real4 tempPosq = posq[j];
+            localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
+            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+            localData[localAtomIndex].field = make_real3(0);
+            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+            unsigned int tj = tgx;
+            for (j = 0; j < TILE_SIZE; j++) {
+                int atom2 = tbx+tj;
+                real3 pos2 = localData[atom2].pos;
+                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+#ifdef USE_PERIODIC
+                APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                if (r2 < CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = r2*invR;
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+tj;
+                    real3 tempField1 = make_real3(0);
+                    real3 tempField2 = make_real3(0);
+                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
+                    if (!isExcluded) {
+                        COMPUTE_FIELD
+                    }
+                    field += tempField1;
+                    localData[tbx+tj].field += tempField2;
+#ifdef USE_CUTOFF
+                }
+#endif
+                excl >>= 1;
+                tj = (tj + 1) & (TILE_SIZE - 1);
+            }
+        }
+        // Write results.
+        unsigned int offset1 = x*TILE_SIZE + tgx;
+        atomicAdd(&fieldBuffers[offset1], static_cast<unsigned long long>((long long) (field.x*0x100000000)));
+        atomicAdd(&fieldBuffers[offset1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (field.y*0x100000000)));
+        atomicAdd(&fieldBuffers[offset1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (field.z*0x100000000)));
+        if (x != y) {
+            unsigned int offset2 = y*TILE_SIZE + tgx;
+            atomicAdd(&fieldBuffers[offset2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
+            atomicAdd(&fieldBuffers[offset2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
+            atomicAdd(&fieldBuffers[offset2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
+        }
+    }
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
+    int tile = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
+    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
+#else
+    int tile = (int) (warp*(long long)numTiles/totalWarps);
+    int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
+#endif
+    int skipBase = 0;
+    int currentSkipIndex = tbx;
+    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
+    __shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
+    skipTiles[threadIdx.x] = -1;
+    while (tile < end) {
+        real3 field = make_real3(0);
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        x = tiles[tile];
+        real4 blockSizeX = blockSize[x];
+        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
+                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
+                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
+#else
+        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*tile));
+        x = (tile-y*NUM_BLOCKS+y*(y+1)/2);
+        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+            y += (x < y ? -1 : 1);
+            x = (tile-y*NUM_BLOCKS+y*(y+1)/2);
+        }
+        // Skip over tiles that have exclusions, since they were already processed.
+        while (skipTiles[tbx+TILE_SIZE-1] < tile) {
+            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                ushort2 tile = exclusionTiles[skipBase+tgx];
+                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+            }
+            else
+                skipTiles[threadIdx.x] = end;
+            skipBase += TILE_SIZE;            
+            currentSkipIndex = tbx;
+        }
+        while (skipTiles[currentSkipIndex] < tile)
+            currentSkipIndex++;
+        includeTile = (skipTiles[currentSkipIndex] != tile);
+#endif
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            real4 pos1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+            const unsigned int localAtomIndex = threadIdx.x;
+#ifdef USE_CUTOFF
+            unsigned int j = interactingAtoms[tile*TILE_SIZE+tgx];
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
+#endif
+            atomIndices[threadIdx.x] = j;
+            if (j < PADDED_NUM_ATOMS) {
+                real4 tempPosq = posq[j];
+                localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
+                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                localData[localAtomIndex].field = make_real3(0);
+            }
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
+                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x].pos, blockCenterX)
+                unsigned int tj = tgx;
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj;
+                    real3 pos2 = localData[atom2].pos;
+                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    if (r2 < CUTOFF_SQUARED) {
+                        real invR = RSQRT(r2);
+                        real r = r2*invR;
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj];
+                        real3 tempField1 = make_real3(0);
+                        real3 tempField2 = make_real3(0);
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_FIELD
+                        }
+                        field += tempField1;
+                        localData[tbx+tj].field += tempField2;
+                    }
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                }
+            }
+            else
+#endif
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.
+                unsigned int tj = tgx;
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj;
+                    real3 pos2 = localData[atom2].pos;
+                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+#ifdef USE_PERIODIC
+                    APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = r2*invR;
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj];
+                        real3 tempField1 = make_real3(0);
+                        real3 tempField2 = make_real3(0);
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_FIELD
+                        }
+                        field += tempField1;
+                        localData[tbx+tj].field += tempField2;
+#ifdef USE_CUTOFF
+                    }
+#endif
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                }
+            }
+            // Write results.
+            atomicAdd(&fieldBuffers[atom1], static_cast<unsigned long long>((long long) (field.x*0x100000000)));
+            atomicAdd(&fieldBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (field.y*0x100000000)));
+            atomicAdd(&fieldBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (field.z*0x100000000)));
+#ifdef USE_CUTOFF
+            unsigned int atom2 = atomIndices[threadIdx.x];
+#else
+            unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+            if (atom2 < PADDED_NUM_ATOMS) {
+                atomicAdd(&fieldBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
+                atomicAdd(&fieldBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
+                atomicAdd(&fieldBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
+            }
+        }
+        tile++;
+    }
+}
+#define COMPUTING_EXCEPTIONS
+/**
+ * Compute the electrostatic field from nonbonded exceptions.
+ */
+extern "C" __global__ void computeFieldExceptions(const real4* __restrict__ posq, unsigned long long* __restrict__ fieldBuffers,
+        const int2* __restrict__ exceptionAtoms, const real* __restrict__ exceptionScale
+#ifdef USE_CUTOFF
+        , real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
+#endif
+        PARAMETER_ARGUMENTS) {
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_EXCEPTIONS; index += blockDim.x*gridDim.x) {
+        int2 atoms = exceptionAtoms[index];
+        int atom1 = atoms.x;
+        int atom2 = atoms.y;
+        real4 pos1 = posq[atom1];
+        real4 pos2 = posq[atom2];
+        LOAD_ATOM1_PARAMETERS
+        LOAD_ATOM2_PARAMETERS_FROM_GLOBAL
+        real scale = exceptionScale[index];
+        real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+#ifdef USE_PERIODIC
+        APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+        if (r2 < CUTOFF_SQUARED) {
+#endif
+            real invR = RSQRT(r2);
+            real r = r2*invR;
+            real3 tempField1 = make_real3(0);
+            real3 tempField2 = make_real3(0);
+            COMPUTE_FIELD
+            atomicAdd(&fieldBuffers[atom1], static_cast<unsigned long long>((long long) (tempField1.x*0x100000000)));
+            atomicAdd(&fieldBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (tempField1.y*0x100000000)));
+            atomicAdd(&fieldBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (tempField1.z*0x100000000)));
+            atomicAdd(&fieldBuffers[atom2], static_cast<unsigned long long>((long long) (tempField2.x*0x100000000)));
+            atomicAdd(&fieldBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (tempField2.y*0x100000000)));
+            atomicAdd(&fieldBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (tempField2.z*0x100000000)));
+#ifdef USE_CUTOFF
+        }
+#endif
+    }
+}
\ No newline at end of file
--- a/plugins/amoeba/platforms/cuda/src/kernels/hippoFixedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/hippoFixedField.cu
+real invR2 = invR*invR;
+real invR3 = invR*invR2;
+real invR5 = invR3*invR2;
+real invR7 = invR5*invR2;
+#if USE_EWALD
+// Calculate the error function damping terms.
+real ralpha = PME_ALPHA*r;
+real bn0 = erfc(ralpha)*invR;
+real alsq2 = 2*PME_ALPHA*PME_ALPHA;
+real alsq2n = 1/(SQRT_PI*PME_ALPHA);
+real exp2a = EXP(-(ralpha*ralpha));
+alsq2n *= alsq2;
+real bn1 = (bn0+alsq2n*exp2a)*invR2;
+alsq2n *= alsq2;
+real bn2 = (3*bn1+alsq2n*exp2a)*invR2;
+alsq2n *= alsq2;
+real bn3 = (5*bn2+alsq2n*exp2a)*invR2;
+#endif
+// Calculate the field at particle 1 due to multipoles at particle 2
+real fdamp3, fdamp5, fdamp7;
+computeDirectFieldDampingFactors(alpha2, r, fdamp3, fdamp5, fdamp7);
+#ifndef COMPUTING_EXCEPTIONS
+real scale = 1;
+#endif
+#ifdef USE_EWALD
+real rr3 = bn1 - (1-scale)*invR3;
+real rr3j = bn1 - (1-scale*fdamp3)*invR3;
+real rr5j = bn2 - (1-scale*fdamp5)*3*invR5;
+real rr7j = bn3 - (1-scale*fdamp7)*15*invR7;
+#else
+real rr3 = scale*invR3;
+real rr3j = scale*fdamp3*invR3;
+real rr5j = scale*3*fdamp5*invR5;
+real rr7j = scale*15*fdamp7*invR7;
+#endif
+real qZZ2 = -qXX2-qYY2;
+real3 qDotDelta2 = make_real3(delta.x*qXX2 + delta.y*qXY2 + delta.z*qXZ2,
+                              delta.x*qXY2 + delta.y*qYY2 + delta.z*qYZ2,
+                              delta.x*qXZ2 + delta.y*qYZ2 + delta.z*qZZ2);
+real dipoleDelta2 = dot(dipole2, delta);
+real qdpoleDelta2 = dot(qDotDelta2, delta);
+real factor2 = rr3*coreCharge2 + rr3j*valenceCharge2 - rr5j*dipoleDelta2 + rr7j*qdpoleDelta2;
+tempField1 = -delta*factor2 - dipole2*rr3j + qDotDelta2*2*rr5j;
+// Calculate the field at particle 2 due to multipoles at particle 1
+computeDirectFieldDampingFactors(alpha1, r, fdamp3, fdamp5, fdamp7);
+#ifdef USE_EWALD
+real rr3i = bn1 - (1-scale*fdamp3)*invR3;
+real rr5i = bn2 - (1-scale*fdamp5)*3*invR5;
+real rr7i = bn3 - (1-scale*fdamp7)*15*invR7;
+#else
+real rr3i = scale*fdamp3*invR3;
+real rr5i = scale*3*fdamp5*invR5;
+real rr7i = scale*15*fdamp7*invR7;
+#endif
+real qZZ1 = -qXX1-qYY1;
+real3 qDotDelta1 = make_real3(delta.x*qXX1 + delta.y*qXY1 + delta.z*qXZ1,
+                              delta.x*qXY1 + delta.y*qYY1 + delta.z*qYZ1,
+                              delta.x*qXZ1 + delta.y*qYZ1 + delta.z*qZZ1);
+real dipoleDelta1 = dot(dipole1, delta);
+real qdpoleDelta1 = dot(qDotDelta1, delta);
+real factor1 = rr3*coreCharge1 + rr3i*valenceCharge1 + rr5i*dipoleDelta1 + rr7i*qdpoleDelta1;
+tempField2 = delta*factor1 - dipole1*rr3i - qDotDelta1*2*rr5i;