Merge pull request #1 from peastman/ljpme

Cleanup to LJ PME code

Merge pull request #1 from peastman/ljpme
Cleanup to LJ PME code
3b6925ae · Andy Simmonett · GitHub · 5a8a8aa9 · f7a102fb · 3b6925ae
Commit 3b6925ae authored Jan 26, 2017 by Andy Simmonett Committed by GitHub Jan 26, 2017
20 changed files
--- a/platforms/reference/include/ReferenceVariableStochasticDynamics.h
+++ b/platforms/reference/include/ReferenceVariableStochasticDynamics.h

-/* Portions copyright (c) 2006-2012 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2016 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -35,7 +35,7 @@ class ReferenceVariableStochasticDynamics : public ReferenceDynamics {

      std::vector<OpenMM::RealVec> xPrime;
      std::vector<RealOpenMM> inverseMasses;
-      RealOpenMM _tau, _accuracy;
+      RealOpenMM friction, _accuracy;

   public:

@@ -44,13 +44,13 @@ class ReferenceVariableStochasticDynamics : public ReferenceDynamics {
         Constructor

         @param numberOfAtoms  number of atoms
-         @param tau            viscosity
+         @param friction       friction coefficient
         @param temperature    temperature
         @param accuracy       required accuracy

         --------------------------------------------------------------------------------------- */

-       ReferenceVariableStochasticDynamics(int numberOfAtoms, RealOpenMM tau, RealOpenMM temperature, RealOpenMM accuracy);
+       ReferenceVariableStochasticDynamics(int numberOfAtoms, RealOpenMM friction, RealOpenMM temperature, RealOpenMM accuracy);

      /**---------------------------------------------------------------------------------------

@@ -62,13 +62,11 @@ class ReferenceVariableStochasticDynamics : public ReferenceDynamics {

      /**---------------------------------------------------------------------------------------

-         Get tau
-
-         @return tau
+         Get friction coefficient

         --------------------------------------------------------------------------------------- */

-      RealOpenMM getTau() const;
+      RealOpenMM getFriction() const;
      
      /**---------------------------------------------------------------------------------------


--- a/platforms/reference/src/ReferenceKernels.cpp
+++ b/platforms/reference/src/ReferenceKernels.cpp
@@ -969,9 +969,17 @@ void ReferenceCalcNonbondedForceKernel::initialize(const System& system, const N
    }
    else if (nonbondedMethod == PME) {
        double alpha;
-        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2]);
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2], false);
        ewaldAlpha = (RealOpenMM) alpha;
    }
+    else if (nonbondedMethod == LJPME) {
+        double alpha;
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2], false);
+        ewaldAlpha = (RealOpenMM) alpha;
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, dispersionGridSize[0], dispersionGridSize[1], dispersionGridSize[2], true);
+        ewaldDispersionAlpha = (RealOpenMM) alpha;
+        useSwitchingFunction = false;
+    }
    rfDielectric = (RealOpenMM)force.getReactionFieldDielectric();
    if (force.getUseDispersionCorrection())
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
@@ -987,11 +995,12 @@ double ReferenceCalcNonbondedForceKernel::execute(ContextImpl& context, bool inc
    bool periodic = (nonbondedMethod == CutoffPeriodic);
    bool ewald  = (nonbondedMethod == Ewald);
    bool pme  = (nonbondedMethod == PME);
+    bool ljpme = (nonbondedMethod == LJPME);
    if (nonbondedMethod != NoCutoff) {
-        computeNeighborListVoxelHash(*neighborList, numParticles, posData, exclusions, extractBoxVectors(context), periodic || ewald || pme, nonbondedCutoff, 0.0);
+        computeNeighborListVoxelHash(*neighborList, numParticles, posData, exclusions, extractBoxVectors(context), periodic || ewald || pme || ljpme, nonbondedCutoff, 0.0);
        clj.setUseCutoff(nonbondedCutoff, *neighborList, rfDielectric);
    }
-    if (periodic || ewald || pme) {
+    if (periodic || ewald || pme || ljpme) {
        RealVec* boxVectors = extractBoxVectors(context);
        double minAllowedSize = 1.999999*nonbondedCutoff;
        if (boxVectors[0][0] < minAllowedSize || boxVectors[1][1] < minAllowedSize || boxVectors[2][2] < minAllowedSize)
@@ -1002,6 +1011,10 @@ double ReferenceCalcNonbondedForceKernel::execute(ContextImpl& context, bool inc
        clj.setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
    if (pme)
        clj.setUsePME(ewaldAlpha, gridSize);
+    if (ljpme){
+        clj.setUsePME(ewaldAlpha, gridSize);
+        clj.setUseLJPME(ewaldDispersionAlpha, dispersionGridSize);
+    }
    if (useSwitchingFunction)
        clj.setUseSwitchingFunction(switchingDistance);
    clj.calculatePairIxn(numParticles, posData, particleParamArray, exclusions, 0, forceData, 0, includeEnergy ? &energy : NULL, includeDirect, includeReciprocal);
@@ -1059,14 +1072,23 @@ void ReferenceCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& con
 }

 void ReferenceCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
-    if (nonbondedMethod != PME)
-        throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
+    if (nonbondedMethod != PME && nonbondedMethod != LJPME)
+        throw OpenMMException("getPMEParametersInContext: This Context is not using PME or LJPME");
    alpha = ewaldAlpha;
    nx = gridSize[0];
    ny = gridSize[1];
    nz = gridSize[2];
 }

+void ReferenceCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    if (nonbondedMethod != LJPME)
+        throw OpenMMException("getPMEParametersInContext: This Context is not using LJPME");
+    alpha = ewaldDispersionAlpha;
+    nx = dispersionGridSize[0];
+    ny = dispersionGridSize[1];
+    nz = dispersionGridSize[2];
+}
+
 ReferenceCalcCustomNonbondedForceKernel::~ReferenceCalcCustomNonbondedForceKernel() {
    disposeRealArray(particleParamArray, numParticles);
    if (neighborList != NULL)
@@ -2053,11 +2075,10 @@ void ReferenceIntegrateLangevinStepKernel::execute(ContextImpl& context, const L
        
        if (dynamics)
            delete dynamics;
-        RealOpenMM tau = static_cast<RealOpenMM>(friction == 0.0 ? 0.0 : 1.0/friction);
        dynamics = new ReferenceStochasticDynamics(
                context.getSystem().getNumParticles(), 
                static_cast<RealOpenMM>(stepSize), 
-                static_cast<RealOpenMM>(tau), 
+                static_cast<RealOpenMM>(friction), 
                static_cast<RealOpenMM>(temperature));
        dynamics->setReferenceConstraintAlgorithm(&extractConstraints(context));
        prevTemp = temperature;
@@ -2142,8 +2163,7 @@ double ReferenceIntegrateVariableLangevinStepKernel::execute(ContextImpl& contex

        if (dynamics)
            delete dynamics;
-        RealOpenMM tau = static_cast<RealOpenMM>(friction == 0.0 ? 0.0 : 1.0/friction);
-        dynamics = new ReferenceVariableStochasticDynamics(context.getSystem().getNumParticles(), (RealOpenMM) tau, (RealOpenMM) temperature, (RealOpenMM) errorTol);
+        dynamics = new ReferenceVariableStochasticDynamics(context.getSystem().getNumParticles(), (RealOpenMM) friction, (RealOpenMM) temperature, (RealOpenMM) errorTol);
        dynamics->setReferenceConstraintAlgorithm(&extractConstraints(context));
        prevTemp = temperature;
        prevFriction = friction;

--- a/platforms/reference/src/SimTKReference/ReferenceCustomDynamics.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceCustomDynamics.cpp
@@ -78,11 +78,6 @@ ReferenceCustomDynamics::ReferenceCustomDynamics(int numberOfAtoms, const Custom
        string expression;
        integrator.getComputationStep(i, stepType[i], stepVariable[i], expression);
    }
-    kineticEnergyExpression = Parser::parse(integrator.getKineticEnergyExpression()).optimize().createCompiledExpression();
-    expressionSet.registerExpression(kineticEnergyExpression);
-    kineticEnergyNeedsForce = false;
-    if (kineticEnergyExpression.getVariables().find("f") != kineticEnergyExpression.getVariables().end())
-        kineticEnergyNeedsForce = true;
 }

 /**---------------------------------------------------------------------------------------
@@ -98,6 +93,28 @@ void ReferenceCustomDynamics::initialize(ContextImpl& context, vector<RealOpenMM
    // Some initialization can't be done in the constructor, since we need a ContextImpl from which to get the list of
    // Context parameters.  Instead, we do it the first time update() or computeKineticEnergy() is called.

+    std::map<std::string, double*> variableLocations;
+    variableLocations["x"] = &x;
+    variableLocations["v"] = &v;
+    variableLocations["m"] = &m;
+    variableLocations["f"] = &f;
+    variableLocations["energy"] = &energy;
+    variableLocations["gaussian"] = &gaussian;
+    variableLocations["uniform"] = &uniform;
+    perDofVariable.resize(integrator.getNumPerDofVariables());
+    for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
+        variableLocations[integrator.getPerDofVariableName(i)] = &perDofVariable[i];
+    for (int i = 0; i < 32; i++) {
+        stringstream fname;
+        fname << "f" << i;
+        variableLocations[fname.str()] = &f;
+        stringstream ename;
+        ename << "energy" << i;
+        variableLocations[ename.str()] = &energy;
+    }
+    
+    // Parse the expressions.
+    
    int numSteps = stepType.size();
    vector<int> forceGroup;
    vector<vector<ParsedExpression> > expressions;
@@ -107,37 +124,25 @@ void ReferenceCustomDynamics::initialize(ContextImpl& context, vector<RealOpenMM
        stepExpressions[i].resize(expressions[i].size());
        for (int j = 0; j < (int) expressions[i].size(); j++) {
            stepExpressions[i][j] = ParsedExpression(replaceDerivFunctions(expressions[i][j].getRootNode(), context)).createCompiledExpression();
+            stepExpressions[i][j].setVariableLocations(variableLocations);
            expressionSet.registerExpression(stepExpressions[i][j]);
        }
        if (stepType[i] == CustomIntegrator::WhileBlockStart)
            blockEnd[blockEnd[i]] = i; // Record where to branch back to.
    }
+    kineticEnergyExpression = Parser::parse(integrator.getKineticEnergyExpression()).optimize().createCompiledExpression();
+    kineticEnergyExpression.setVariableLocations(variableLocations);
+    expressionSet.registerExpression(kineticEnergyExpression);
+    kineticEnergyNeedsForce = false;
+    if (kineticEnergyExpression.getVariables().find("f") != kineticEnergyExpression.getVariables().end())
+        kineticEnergyNeedsForce = true;

-    // Record the variable names and flags for the force and energy in each step.
+    // Record the force group flags for each step.

    forceGroupFlags.resize(numSteps, -1);
-    fIndex = expressionSet.getVariableIndex("f");
-    energyIndex = expressionSet.getVariableIndex("energy");
-    forceVariableIndex.resize(numSteps, fIndex);
-    energyVariableIndex.resize(numSteps, energyIndex);
-    vector<string> forceGroupName;
-    vector<string> energyGroupName;
-    for (int i = 0; i < 32; i++) {
-        stringstream fname;
-        fname << "f" << i;
-        forceGroupName.push_back(fname.str());
-        stringstream ename;
-        ename << "energy" << i;
-        energyGroupName.push_back(ename.str());
-    }
-    for (int i = 0; i < numSteps; i++) {
-        if (needsForces[i] && forceGroup[i] > -1)
-            forceVariableIndex[i] = expressionSet.getVariableIndex(forceGroupName[forceGroup[i]]);
-        if (needsEnergy[i] && forceGroup[i] > -1)
-            energyVariableIndex[i] = expressionSet.getVariableIndex(energyGroupName[forceGroup[i]]);
+    for (int i = 0; i < numSteps; i++)
        if (forceGroup[i] > -1)
            forceGroupFlags[i] = 1<<forceGroup[i];
-    }

    // Build the list of inverse masses.

@@ -150,13 +155,10 @@ void ReferenceCustomDynamics::initialize(ContextImpl& context, vector<RealOpenMM
            inverseMasses[i] = 1.0/masses[i];
    }

-    // Record indices of other variables.
+    // Record indices of variables.

    xIndex = expressionSet.getVariableIndex("x");
    vIndex = expressionSet.getVariableIndex("v");
-    mIndex = expressionSet.getVariableIndex("m");
-    gaussianIndex = expressionSet.getVariableIndex("gaussian");
-    uniformIndex = expressionSet.getVariableIndex("uniform");
    for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
        perDofVariableIndex.push_back(expressionSet.getVariableIndex(integrator.getPerDofVariableName(i)));
    for (int i = 0; i < stepVariable.size(); i++)
@@ -222,15 +224,14 @@ void ReferenceCustomDynamics::update(ContextImpl& context, int numberOfAtoms, ve
            }
            forcesAreValid = true;
        }
-        expressionSet.setVariable(energyVariableIndex[step], energy);
        
        // Execute the step.

        int nextStep = step+1;
        switch (stepType[step]) {
            case CustomIntegrator::ComputeGlobal: {
-                expressionSet.setVariable(uniformIndex, SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber());
-                expressionSet.setVariable(gaussianIndex, SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
+                uniform = SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber();
+                gaussian = SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
                RealOpenMM result = stepExpressions[step][0].evaluate();
                globals[stepVariable[step]] = result;
                expressionSet.setVariable(stepVariableIndex[step], result);
@@ -249,11 +250,11 @@ void ReferenceCustomDynamics::update(ContextImpl& context, int numberOfAtoms, ve
                }
                if (results == NULL)
                    throw OpenMMException("Illegal per-DOF output variable: "+stepVariable[step]);
-                computePerDof(numberOfAtoms, *results, atomCoordinates, velocities, forces, masses, perDof, stepExpressions[step][0], forceVariableIndex[step]);
+                computePerDof(numberOfAtoms, *results, atomCoordinates, velocities, forces, masses, perDof, stepExpressions[step][0]);
                break;
            }
            case CustomIntegrator::ComputeSum: {
-                computePerDof(numberOfAtoms, sumBuffer, atomCoordinates, velocities, forces, masses, perDof, stepExpressions[step][0], forceVariableIndex[step]);
+                computePerDof(numberOfAtoms, sumBuffer, atomCoordinates, velocities, forces, masses, perDof, stepExpressions[step][0]);
                RealOpenMM sum = 0.0;
                for (int j = 0; j < numberOfAtoms; j++)
                    if (masses[j] != 0.0)
@@ -306,22 +307,22 @@ void ReferenceCustomDynamics::update(ContextImpl& context, int numberOfAtoms, ve

 void ReferenceCustomDynamics::computePerDof(int numberOfAtoms, vector<RealVec>& results, const vector<RealVec>& atomCoordinates,
              const vector<RealVec>& velocities, const vector<RealVec>& forces, const vector<RealOpenMM>& masses,
-              const vector<vector<RealVec> >& perDof, const CompiledExpression& expression, int forceIndex) {
+              const vector<vector<RealVec> >& perDof, const CompiledExpression& expression) {
    // Loop over all degrees of freedom.

    for (int i = 0; i < numberOfAtoms; i++) {
        if (masses[i] != 0.0) {
-            expressionSet.setVariable(mIndex, masses[i]);
+            m = masses[i];
            for (int j = 0; j < 3; j++) {
                // Compute the expression.

-                expressionSet.setVariable(xIndex, atomCoordinates[i][j]);
-                expressionSet.setVariable(vIndex, velocities[i][j]);
-                expressionSet.setVariable(forceIndex, forces[i][j]);
-                expressionSet.setVariable(uniformIndex, SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber());
-                expressionSet.setVariable(gaussianIndex, SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
+                x = atomCoordinates[i][j];
+                v = velocities[i][j];
+                f = forces[i][j];
+                uniform = SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber();
+                gaussian = SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
                for (int k = 0; k < (int) perDof.size(); k++)
-                    expressionSet.setVariable(perDofVariableIndex[k], perDof[k][i][j]);
+                    perDofVariable[k] = perDof[k][i][j];
                results[i][j] = expression.evaluate();
            }
        }
@@ -329,8 +330,8 @@ void ReferenceCustomDynamics::computePerDof(int numberOfAtoms, vector<RealVec>&
 }

 bool ReferenceCustomDynamics::evaluateCondition(int step) {
-    expressionSet.setVariable(uniformIndex, SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber());
-    expressionSet.setVariable(gaussianIndex, SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
+    uniform = SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber();
+    gaussian = SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
    double lhs = stepExpressions[step][0].evaluate();
    double rhs = stepExpressions[step][1].evaluate();
    switch (comparisons[step]) {
@@ -390,7 +391,7 @@ double ReferenceCustomDynamics::computeKineticEnergy(OpenMM::ContextImpl& contex
        energy = context.calcForcesAndEnergy(true, true, -1);
        forcesAreValid = true;
    }
-    computePerDof(numberOfAtoms, sumBuffer, atomCoordinates, velocities, forces, masses, perDof, kineticEnergyExpression, fIndex);
+    computePerDof(numberOfAtoms, sumBuffer, atomCoordinates, velocities, forces, masses, perDof, kineticEnergyExpression);
    RealOpenMM sum = 0.0;
    for (int j = 0; j < numberOfAtoms; j++)
        if (masses[j] != 0.0)

--- a/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp
@@ -26,6 +26,7 @@
 #include <sstream>
 #include <complex>
 #include <algorithm>
+#include <iostream>

 #include "SimTKOpenMMUtilities.h"
 #include "ReferenceLJCoulombIxn.h"
@@ -47,13 +48,13 @@ using namespace OpenMM;

   --------------------------------------------------------------------------------------- */

-ReferenceLJCoulombIxn::ReferenceLJCoulombIxn() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false) {
+ReferenceLJCoulombIxn::ReferenceLJCoulombIxn() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), ljpme(false) {

-   // ---------------------------------------------------------------------------------------
+    // ---------------------------------------------------------------------------------------

-   // static const char* methodName = "\nReferenceLJCoulombIxn::ReferenceLJCoulombIxn";
+    // static const char* methodName = "\nReferenceLJCoulombIxn::ReferenceLJCoulombIxn";

-   // ---------------------------------------------------------------------------------------
+    // ---------------------------------------------------------------------------------------

 }

@@ -65,15 +66,15 @@ ReferenceLJCoulombIxn::ReferenceLJCoulombIxn() : cutoff(false), useSwitch(false)

 ReferenceLJCoulombIxn::~ReferenceLJCoulombIxn() {

-   // ---------------------------------------------------------------------------------------
+    // ---------------------------------------------------------------------------------------

-   // static const char* methodName = "\nReferenceLJCoulombIxn::~ReferenceLJCoulombIxn";
+    // static const char* methodName = "\nReferenceLJCoulombIxn::~ReferenceLJCoulombIxn";

-   // ---------------------------------------------------------------------------------------
+    // ---------------------------------------------------------------------------------------

 }

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

     Set the force to use a cutoff.

@@ -83,14 +84,14 @@ ReferenceLJCoulombIxn::~ReferenceLJCoulombIxn() {

     --------------------------------------------------------------------------------------- */

-  void ReferenceLJCoulombIxn::setUseCutoff(RealOpenMM distance, const OpenMM::NeighborList& neighbors, RealOpenMM solventDielectric) {
+void ReferenceLJCoulombIxn::setUseCutoff(RealOpenMM distance, const OpenMM::NeighborList& neighbors, RealOpenMM solventDielectric) {

    cutoff = true;
    cutoffDistance = distance;
    neighborList = &neighbors;
    krf = pow(cutoffDistance, -3.0)*(solventDielectric-1.0)/(2.0*solventDielectric+1.0);
    crf = (1.0/cutoffDistance)*(3.0*solventDielectric)/(2.0*solventDielectric+1.0);
-  }
+}

 /**---------------------------------------------------------------------------------------

@@ -105,7 +106,7 @@ void ReferenceLJCoulombIxn::setUseSwitchingFunction(RealOpenMM distance) {
    switchingDistance = distance;
 }

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

     Set the force to use periodic boundary conditions.  This requires that a cutoff has
     also been set, and the smallest side of the periodic box is at least twice the cutoff
@@ -115,7 +116,7 @@ void ReferenceLJCoulombIxn::setUseSwitchingFunction(RealOpenMM distance) {

     --------------------------------------------------------------------------------------- */

-  void ReferenceLJCoulombIxn::setPeriodic(OpenMM::RealVec* vectors) {
+void ReferenceLJCoulombIxn::setPeriodic(OpenMM::RealVec* vectors) {

    assert(cutoff);
    assert(vectors[0][0] >= 2.0*cutoffDistance);
@@ -125,9 +126,9 @@ void ReferenceLJCoulombIxn::setUseSwitchingFunction(RealOpenMM distance) {
    periodicBoxVectors[0] = vectors[0];
    periodicBoxVectors[1] = vectors[1];
    periodicBoxVectors[2] = vectors[2];
-  }
+}

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

     Set the force to use Ewald summation.

@@ -138,15 +139,15 @@ void ReferenceLJCoulombIxn::setUseSwitchingFunction(RealOpenMM distance) {

     --------------------------------------------------------------------------------------- */

-  void ReferenceLJCoulombIxn::setUseEwald(RealOpenMM alpha, int kmaxx, int kmaxy, int kmaxz) {
-      alphaEwald = alpha;
-      numRx = kmaxx;
-      numRy = kmaxy;
-      numRz = kmaxz;
-      ewald = true;
-  }
+void ReferenceLJCoulombIxn::setUseEwald(RealOpenMM alpha, int kmaxx, int kmaxy, int kmaxz) {
+    alphaEwald = alpha;
+    numRx = kmaxx;
+    numRy = kmaxy;
+    numRz = kmaxz;
+    ewald = true;
+}

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

     Set the force to use Particle-Mesh Ewald (PME) summation.

@@ -155,13 +156,30 @@ void ReferenceLJCoulombIxn::setUseSwitchingFunction(RealOpenMM distance) {

     --------------------------------------------------------------------------------------- */

-  void ReferenceLJCoulombIxn::setUsePME(RealOpenMM alpha, int meshSize[3]) {
-      alphaEwald = alpha;
-      meshDim[0] = meshSize[0];
-      meshDim[1] = meshSize[1];
-      meshDim[2] = meshSize[2];
-      pme = true;
-  }
+void ReferenceLJCoulombIxn::setUsePME(RealOpenMM alpha, int meshSize[3]) {
+    alphaEwald = alpha;
+    meshDim[0] = meshSize[0];
+    meshDim[1] = meshSize[1];
+    meshDim[2] = meshSize[2];
+    pme = true;
+}
+
+/**---------------------------------------------------------------------------------------
+
+     Set the force to use Particle-Mesh Ewald (PME) summation for dispersion terms.
+
+     @param alpha  the dispersion Ewald separation parameter
+     @param meshSize the dimensions of the dispersion mesh
+
+     --------------------------------------------------------------------------------------- */
+
+void ReferenceLJCoulombIxn::setUseLJPME(RealOpenMM alpha, int meshSize[3]) {
+    alphaDispersionEwald = alpha;
+    dispersionMeshDim[0] = meshSize[0];
+    dispersionMeshDim[1] = meshSize[1];
+    dispersionMeshDim[2] = meshSize[2];
+    ljpme = true;
+}

 /**---------------------------------------------------------------------------------------

@@ -182,9 +200,9 @@ void ReferenceLJCoulombIxn::setUseSwitchingFunction(RealOpenMM distance) {
   --------------------------------------------------------------------------------------- */

 void ReferenceLJCoulombIxn::calculateEwaldIxn(int numberOfAtoms, vector<RealVec>& atomCoordinates,
-                                             RealOpenMM** atomParameters, vector<set<int> >& exclusions,
-                                             RealOpenMM* fixedParameters, vector<RealVec>& forces,
-                                             RealOpenMM* energyByAtom, RealOpenMM* totalEnergy, bool includeDirect, bool includeReciprocal) const {
+                                              RealOpenMM** atomParameters, vector<set<int> >& exclusions,
+                                              RealOpenMM* fixedParameters, vector<RealVec>& forces,
+                                              RealOpenMM* energyByAtom, RealOpenMM* totalEnergy, bool includeDirect, bool includeReciprocal) const {
    typedef std::complex<RealOpenMM> d_complex;

    static const RealOpenMM epsilon     =  1.0;
@@ -201,16 +219,27 @@ void ReferenceLJCoulombIxn::calculateEwaldIxn(int numberOfAtoms, vector<RealVec>
    RealOpenMM totalSelfEwaldEnergy     = 0.0;
    RealOpenMM realSpaceEwaldEnergy     = 0.0;
    RealOpenMM recipEnergy              = 0.0;
+    RealOpenMM recipDispersionEnergy    = 0.0;
    RealOpenMM totalRecipEnergy         = 0.0;
    RealOpenMM vdwEnergy                = 0.0;

-// **************************************************************************************
-// SELF ENERGY
-// **************************************************************************************
+    // A couple of sanity checks for the LJPME configuration
+    if(ljpme && useSwitch)
+        throw OpenMMException("Switching cannot be used with LJPME");
+    if(ljpme && !pme)
+        throw OpenMMException("LJPME has been set, without PME being set");
+
+    // **************************************************************************************
+    // SELF ENERGY
+    // **************************************************************************************

    if (includeReciprocal) {
        for (int atomID = 0; atomID < numberOfAtoms; atomID++) {
            RealOpenMM selfEwaldEnergy       = (RealOpenMM) (ONE_4PI_EPS0*atomParameters[atomID][QIndex]*atomParameters[atomID][QIndex] * alphaEwald/SQRT_PI);
+            if(ljpme) {
+                // Dispersion self term
+                selfEwaldEnergy -= pow(alphaDispersionEwald, 6.0) * 64.0*pow(atomParameters[atomID][SigIndex], 6.0) * pow(atomParameters[atomID][EpsIndex], 2.0) / 12.0;
+            }
            totalSelfEwaldEnergy            -= selfEwaldEnergy;
            if (energyByAtom) {
                energyByAtom[atomID]        -= selfEwaldEnergy;
@@ -222,194 +251,249 @@ void ReferenceLJCoulombIxn::calculateEwaldIxn(int numberOfAtoms, vector<RealVec>
        *totalEnergy += totalSelfEwaldEnergy;
    }

-// **************************************************************************************
-// RECIPROCAL SPACE EWALD ENERGY AND FORCES
-// **************************************************************************************
+    // **************************************************************************************
+    // RECIPROCAL SPACE EWALD ENERGY AND FORCES
+    // **************************************************************************************
    // PME

-  if (pme && includeReciprocal) {
-    pme_t          pmedata; /* abstract handle for PME data */
+    if (pme && includeReciprocal) {
+        pme_t          pmedata; /* abstract handle for PME data */

-    pme_init(&pmedata,alphaEwald,numberOfAtoms,meshDim,5,1);
+        pme_init(&pmedata,alphaEwald,numberOfAtoms,meshDim,5,1);

-    vector<RealOpenMM> charges(numberOfAtoms);
-    for (int i = 0; i < numberOfAtoms; i++)
-        charges[i] = atomParameters[i][QIndex];
-    pme_exec(pmedata,atomCoordinates,forces,charges,periodicBoxVectors,&recipEnergy);
+        vector<RealOpenMM> charges(numberOfAtoms);
+        for (int i = 0; i < numberOfAtoms; i++)
+            charges[i] = atomParameters[i][QIndex];
+        pme_exec(pmedata,atomCoordinates,forces,charges,periodicBoxVectors,&recipEnergy);

-    if (totalEnergy)
-       *totalEnergy += recipEnergy;
+        if (totalEnergy)
+            *totalEnergy += recipEnergy;

-    if (energyByAtom)
-        for (int n = 0; n < numberOfAtoms; n++)
-            energyByAtom[n] += recipEnergy;
+        if (energyByAtom)
+            for (int n = 0; n < numberOfAtoms; n++)
+                energyByAtom[n] += recipEnergy;

        pme_destroy(pmedata);
-  }

+        if (ljpme) {
+            // Dispersion reciprocal space terms
+            pme_init(&pmedata,alphaDispersionEwald,numberOfAtoms,dispersionMeshDim,5,1);
+
+            std::vector<RealVec> dpmeforces;
+            for (int i = 0; i < numberOfAtoms; i++){
+                charges[i] = 8.0*pow(atomParameters[i][SigIndex], 3.0) * atomParameters[i][EpsIndex];
+                dpmeforces.push_back(RealVec());
+            }
+            pme_exec_dpme(pmedata,atomCoordinates,dpmeforces,charges,periodicBoxVectors,&recipDispersionEnergy);
+            for (int i = 0; i < numberOfAtoms; i++){
+                forces[i][0] -= 2.0*dpmeforces[i][0];
+                forces[i][1] -= 2.0*dpmeforces[i][1];
+                forces[i][2] -= 2.0*dpmeforces[i][2];
+            }
+            if (totalEnergy)
+                *totalEnergy += recipDispersionEnergy;
+
+            if (energyByAtom)
+                for (int n = 0; n < numberOfAtoms; n++)
+                    energyByAtom[n] += recipDispersionEnergy;
+            pme_destroy(pmedata);
+        }
+    }
    // Ewald method

-  else if (ewald && includeReciprocal) {
+    else if (ewald && includeReciprocal) {

-    // setup reciprocal box
+        // setup reciprocal box

-         RealOpenMM recipBoxSize[3] = { TWO_PI / periodicBoxVectors[0][0], TWO_PI / periodicBoxVectors[1][1], TWO_PI / periodicBoxVectors[2][2]};
+        RealOpenMM recipBoxSize[3] = { TWO_PI / periodicBoxVectors[0][0], TWO_PI / periodicBoxVectors[1][1], TWO_PI / periodicBoxVectors[2][2]};


-    // setup K-vectors
+        // setup K-vectors

-  #define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
-  vector<d_complex> eir(kmax*numberOfAtoms*3);
-  vector<d_complex> tab_xy(numberOfAtoms);
-  vector<d_complex> tab_qxyz(numberOfAtoms);
+#define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
+        vector<d_complex> eir(kmax*numberOfAtoms*3);
+        vector<d_complex> tab_xy(numberOfAtoms);
+        vector<d_complex> tab_qxyz(numberOfAtoms);

-  if (kmax < 1)
-      throw OpenMMException("kmax for Ewald summation < 1");
+        if (kmax < 1)
+            throw OpenMMException("kmax for Ewald summation < 1");

-  for (int i = 0; (i < numberOfAtoms); i++) {
-    for (int m = 0; (m < 3); m++)
-      EIR(0, i, m) = d_complex(1,0);
+        for (int i = 0; (i < numberOfAtoms); i++) {
+            for (int m = 0; (m < 3); m++)
+                EIR(0, i, m) = d_complex(1,0);

-    for (int m=0; (m<3); m++)
-      EIR(1, i, m) = d_complex(cos(atomCoordinates[i][m]*recipBoxSize[m]),
-                               sin(atomCoordinates[i][m]*recipBoxSize[m]));
+            for (int m=0; (m<3); m++)
+                EIR(1, i, m) = d_complex(cos(atomCoordinates[i][m]*recipBoxSize[m]),
+                                         sin(atomCoordinates[i][m]*recipBoxSize[m]));

-    for (int j=2; (j<kmax); j++)
-      for (int m=0; (m<3); m++)
-        EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
-  }
+            for (int j=2; (j<kmax); j++)
+                for (int m=0; (m<3); m++)
+                    EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
+        }

-    // calculate reciprocal space energy and forces
+        // calculate reciprocal space energy and forces

-    int lowry = 0;
-    int lowrz = 1;
+        int lowry = 0;
+        int lowrz = 1;

-    for (int rx = 0; rx < numRx; rx++) {
+        for (int rx = 0; rx < numRx; rx++) {

-      RealOpenMM kx = rx * recipBoxSize[0];
+            RealOpenMM kx = rx * recipBoxSize[0];

-      for (int ry = lowry; ry < numRy; ry++) {
+            for (int ry = lowry; ry < numRy; ry++) {

-        RealOpenMM ky = ry * recipBoxSize[1];
+                RealOpenMM ky = ry * recipBoxSize[1];

-        if (ry >= 0) {
-          for (int n = 0; n < numberOfAtoms; n++)
-            tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
-        }
+                if (ry >= 0) {
+                    for (int n = 0; n < numberOfAtoms; n++)
+                        tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
+                }

-        else {
-          for (int n = 0; n < numberOfAtoms; n++)
-            tab_xy[n]= EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
-        }
+                else {
+                    for (int n = 0; n < numberOfAtoms; n++)
+                        tab_xy[n]= EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
+                }

-        for (int rz = lowrz; rz < numRz; rz++) {
+                for (int rz = lowrz; rz < numRz; rz++) {

-          if (rz >= 0) {
-           for (int n = 0; n < numberOfAtoms; n++)
-             tab_qxyz[n] = atomParameters[n][QIndex] * (tab_xy[n] * EIR(rz, n, 2));
-          }
+                    if (rz >= 0) {
+                        for (int n = 0; n < numberOfAtoms; n++)
+                            tab_qxyz[n] = atomParameters[n][QIndex] * (tab_xy[n] * EIR(rz, n, 2));
+                    }

-          else {
-            for (int n = 0; n < numberOfAtoms; n++)
-              tab_qxyz[n] = atomParameters[n][QIndex] * (tab_xy[n] * conj(EIR(-rz, n, 2)));
-          }
+                    else {
+                        for (int n = 0; n < numberOfAtoms; n++)
+                            tab_qxyz[n] = atomParameters[n][QIndex] * (tab_xy[n] * conj(EIR(-rz, n, 2)));
+                    }

-          RealOpenMM cs = 0.0f;
-          RealOpenMM ss = 0.0f;
+                    RealOpenMM cs = 0.0f;
+                    RealOpenMM ss = 0.0f;

-          for (int n = 0; n < numberOfAtoms; n++) {
-            cs += tab_qxyz[n].real();
-            ss += tab_qxyz[n].imag();
-          }
+                    for (int n = 0; n < numberOfAtoms; n++) {
+                        cs += tab_qxyz[n].real();
+                        ss += tab_qxyz[n].imag();
+                    }

-          RealOpenMM kz = rz * recipBoxSize[2];
-          RealOpenMM k2 = kx * kx + ky * ky + kz * kz;
-          RealOpenMM ak = exp(k2*factorEwald) / k2;
+                    RealOpenMM kz = rz * recipBoxSize[2];
+                    RealOpenMM k2 = kx * kx + ky * ky + kz * kz;
+                    RealOpenMM ak = exp(k2*factorEwald) / k2;

-          for (int n = 0; n < numberOfAtoms; n++) {
-            RealOpenMM force = ak * (cs * tab_qxyz[n].imag() - ss * tab_qxyz[n].real());
-            forces[n][0] += 2 * recipCoeff * force * kx ;
-            forces[n][1] += 2 * recipCoeff * force * ky ;
-            forces[n][2] += 2 * recipCoeff * force * kz ;
-          }
+                    for (int n = 0; n < numberOfAtoms; n++) {
+                        RealOpenMM force = ak * (cs * tab_qxyz[n].imag() - ss * tab_qxyz[n].real());
+                        forces[n][0] += 2 * recipCoeff * force * kx ;
+                        forces[n][1] += 2 * recipCoeff * force * ky ;
+                        forces[n][2] += 2 * recipCoeff * force * kz ;
+                    }

-          recipEnergy       = recipCoeff * ak * (cs * cs + ss * ss);
-          totalRecipEnergy += recipEnergy;
+                    recipEnergy       = recipCoeff * ak * (cs * cs + ss * ss);
+                    totalRecipEnergy += recipEnergy;

-          if (totalEnergy)
-             *totalEnergy += recipEnergy;
+                    if (totalEnergy)
+                        *totalEnergy += recipEnergy;

-          if (energyByAtom)
-             for (int n = 0; n < numberOfAtoms; n++)
-               energyByAtom[n] += recipEnergy;
+                    if (energyByAtom)
+                        for (int n = 0; n < numberOfAtoms; n++)
+                            energyByAtom[n] += recipEnergy;

-          lowrz = 1 - numRz;
+                    lowrz = 1 - numRz;
+                }
+                lowry = 1 - numRy;
+            }
        }
-        lowry = 1 - numRy;
-      }
    }
-  }

-// **************************************************************************************
-// SHORT-RANGE ENERGY AND FORCES
-// **************************************************************************************
+    // **************************************************************************************
+    // SHORT-RANGE ENERGY AND FORCES
+    // **************************************************************************************

    if (!includeDirect)
        return;
    RealOpenMM totalVdwEnergy            = 0.0f;
    RealOpenMM totalRealSpaceEwaldEnergy = 0.0f;

+
    for (int i = 0; i < (int) neighborList->size(); i++) {
-       OpenMM::AtomPair pair = (*neighborList)[i];
-       int ii = pair.first;
-       int jj = pair.second;
-
-       RealOpenMM deltaR[2][ReferenceForce::LastDeltaRIndex];
-       ReferenceForce::getDeltaRPeriodic(atomCoordinates[jj], atomCoordinates[ii], periodicBoxVectors, deltaR[0]);
-       RealOpenMM r         = deltaR[0][ReferenceForce::RIndex];
-       RealOpenMM inverseR  = one/(deltaR[0][ReferenceForce::RIndex]);
-       RealOpenMM switchValue = 1, switchDeriv = 0;
-       if (useSwitch && r > switchingDistance) {
-           RealOpenMM t = (r-switchingDistance)/(cutoffDistance-switchingDistance);
-           switchValue = 1+t*t*t*(-10+t*(15-t*6));
-           switchDeriv = t*t*(-30+t*(60-t*30))/(cutoffDistance-switchingDistance);
-       }
-       RealOpenMM alphaR    = alphaEwald * r;
-
-
-       RealOpenMM dEdR      = (RealOpenMM) (ONE_4PI_EPS0 * atomParameters[ii][QIndex] * atomParameters[jj][QIndex] * inverseR * inverseR * inverseR);
-                  dEdR      = (RealOpenMM) (dEdR * (erfc(alphaR) + 2 * alphaR * exp (- alphaR * alphaR) / SQRT_PI));
-
-       RealOpenMM sig       = atomParameters[ii][SigIndex] +  atomParameters[jj][SigIndex];
-       RealOpenMM sig2      = inverseR*sig;
-                  sig2     *= sig2;
-       RealOpenMM sig6      = sig2*sig2*sig2;
-       RealOpenMM eps       = atomParameters[ii][EpsIndex]*atomParameters[jj][EpsIndex];
-                  dEdR     += switchValue*eps*(twelve*sig6 - six)*sig6*inverseR*inverseR;
-       vdwEnergy = eps*(sig6-one)*sig6;
-       if (useSwitch) {
-           dEdR -= vdwEnergy*switchDeriv*inverseR;
-           vdwEnergy *= switchValue;
-       }
-
-       // accumulate forces
-
-       for (int kk = 0; kk < 3; kk++) {
-          RealOpenMM force  = dEdR*deltaR[0][kk];
-          forces[ii][kk]   += force;
-          forces[jj][kk]   -= force;
-       }
-
-       // accumulate energies
-
-       realSpaceEwaldEnergy        = (RealOpenMM) (ONE_4PI_EPS0*atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erfc(alphaR));
-
-       totalVdwEnergy             += vdwEnergy;
-       totalRealSpaceEwaldEnergy  += realSpaceEwaldEnergy;
+        OpenMM::AtomPair pair = (*neighborList)[i];
+        int ii = pair.first;
+        int jj = pair.second;
+
+        RealOpenMM deltaR[2][ReferenceForce::LastDeltaRIndex];
+        ReferenceForce::getDeltaRPeriodic(atomCoordinates[jj], atomCoordinates[ii], periodicBoxVectors, deltaR[0]);
+        RealOpenMM r         = deltaR[0][ReferenceForce::RIndex];
+        RealOpenMM inverseR  = one/(deltaR[0][ReferenceForce::RIndex]);
+        RealOpenMM switchValue = 1, switchDeriv = 0;
+        if (useSwitch && r > switchingDistance) {
+            RealOpenMM t = (r-switchingDistance)/(cutoffDistance-switchingDistance);
+            switchValue = 1+t*t*t*(-10+t*(15-t*6));
+            switchDeriv = t*t*(-30+t*(60-t*30))/(cutoffDistance-switchingDistance);
+        }
+        RealOpenMM alphaR    = alphaEwald * r;
+
+
+        RealOpenMM dEdR      = (RealOpenMM) (ONE_4PI_EPS0 * atomParameters[ii][QIndex] * atomParameters[jj][QIndex] * inverseR * inverseR * inverseR);
+        dEdR      = (RealOpenMM) (dEdR * (erfc(alphaR) + 2 * alphaR * exp (- alphaR * alphaR) / SQRT_PI));
+
+        RealOpenMM sig       = atomParameters[ii][SigIndex] +  atomParameters[jj][SigIndex];
+        RealOpenMM sig2      = inverseR*sig;
+        sig2     *= sig2;
+        RealOpenMM sig6      = sig2*sig2*sig2;
+        RealOpenMM eps       = atomParameters[ii][EpsIndex]*atomParameters[jj][EpsIndex];
+        dEdR     += switchValue*eps*(twelve*sig6 - six)*sig6*inverseR*inverseR;
+        vdwEnergy = eps*(sig6-one)*sig6;
+
+        if (ljpme) {
+            RealOpenMM dalphaR   = alphaDispersionEwald * r;
+            RealOpenMM dar2 = dalphaR*dalphaR;
+            RealOpenMM dar4 = dar2*dar2;
+            RealOpenMM dar6 = dar4*dar2;
+            RealOpenMM inverseR2 = inverseR*inverseR;
+            RealOpenMM c6i = 8.0*pow(atomParameters[ii][SigIndex], 3.0) * atomParameters[ii][EpsIndex];
+            RealOpenMM c6j = 8.0*pow(atomParameters[jj][SigIndex], 3.0) * atomParameters[jj][EpsIndex];
+            // For the energies and forces, we first add the regular Lorentz-Berthelot terms.  The C12 term is treated as usual
+            // but we then subtract out (remembering that the C6 term is negative) the multiplicative C6 term that has been
+            // computed in real space.  Finally, we add a potential shift term to account for the difference between the LB
+            // and multiplicative functional forms at the cutoff.
+            RealOpenMM emult = c6i*c6j*inverseR2*inverseR2*inverseR2*(1.0 - EXP(-dar2) * (1.0 + dar2 + 0.5*dar4));
+            dEdR += 6.0*c6i*c6j*inverseR2*inverseR2*inverseR2*inverseR2*(1.0 - EXP(-dar2) * (1.0 + dar2 + 0.5*dar4 + dar6/6.0));
+
+            RealOpenMM inverseCut2 = 1.0/(cutoffDistance*cutoffDistance);
+            RealOpenMM inverseCut6 = inverseCut2*inverseCut2*inverseCut2;
+            sig2 = atomParameters[ii][SigIndex] +  atomParameters[jj][SigIndex];
+            sig2 *= sig2;
+            sig6 = sig2*sig2*sig2;
+            // The additive part of the potential shift
+            RealOpenMM potentialshift = eps*(one-sig6*inverseCut6)*sig6*inverseCut6;
+            dalphaR   = alphaDispersionEwald * cutoffDistance;
+            dar2 = dalphaR*dalphaR;
+            dar4 = dar2*dar2;
+            // The multiplicative part of the potential shift
+            potentialshift -= c6i*c6j*inverseCut6*(1.0 - EXP(-dar2) * (1.0 + dar2 + 0.5*dar4));
+            vdwEnergy += emult + potentialshift;
+        }
+
+        if (useSwitch) {
+            dEdR -= vdwEnergy*switchDeriv*inverseR;
+            vdwEnergy *= switchValue;
+        }
+
+        // accumulate forces
+
+        for (int kk = 0; kk < 3; kk++) {
+            RealOpenMM force  = dEdR*deltaR[0][kk];
+            forces[ii][kk]   += force;
+            forces[jj][kk]   -= force;
+        }
+
+        // accumulate energies
+
+        realSpaceEwaldEnergy        = (RealOpenMM) (ONE_4PI_EPS0*atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erfc(alphaR));
+
+        totalVdwEnergy             += vdwEnergy;
+        totalRealSpaceEwaldEnergy  += realSpaceEwaldEnergy;

        if (energyByAtom) {
-           energyByAtom[ii] += realSpaceEwaldEnergy + vdwEnergy;
-           energyByAtom[jj] += realSpaceEwaldEnergy + vdwEnergy;
+            energyByAtom[ii] += realSpaceEwaldEnergy + vdwEnergy;
+            energyByAtom[jj] += realSpaceEwaldEnergy + vdwEnergy;
        }

    }
@@ -424,39 +508,57 @@ void ReferenceLJCoulombIxn::calculateEwaldIxn(int numberOfAtoms, vector<RealVec>
    for (int i = 0; i < numberOfAtoms; i++)
        for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
            if (*iter > i) {
-               int ii = i;
-               int jj = *iter;
-
-               RealOpenMM deltaR[2][ReferenceForce::LastDeltaRIndex];
-               ReferenceForce::getDeltaR(atomCoordinates[jj], atomCoordinates[ii], deltaR[0]);
-               RealOpenMM r         = deltaR[0][ReferenceForce::RIndex];
-               RealOpenMM inverseR  = one/(deltaR[0][ReferenceForce::RIndex]);
-               RealOpenMM alphaR    = alphaEwald * r;
-               if (erf(alphaR) > 1e-6) {
-                   RealOpenMM dEdR      = (RealOpenMM) (ONE_4PI_EPS0 * atomParameters[ii][QIndex] * atomParameters[jj][QIndex] * inverseR * inverseR * inverseR);
-                              dEdR      = (RealOpenMM) (dEdR * (erf(alphaR) - 2 * alphaR * exp (- alphaR * alphaR) / SQRT_PI));
-
-                   // accumulate forces
-
-                   for (int kk = 0; kk < 3; kk++) {
-                      RealOpenMM force  = dEdR*deltaR[0][kk];
-                      forces[ii][kk]   -= force;
-                      forces[jj][kk]   += force;
-                   }
-
-                   // accumulate energies
-
-                   realSpaceEwaldEnergy = (RealOpenMM) (ONE_4PI_EPS0*atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erf(alphaR));
-               }
-               else {
-                   realSpaceEwaldEnergy = (RealOpenMM) (alphaEwald*TWO_OVER_SQRT_PI*ONE_4PI_EPS0*atomParameters[ii][QIndex]*atomParameters[jj][QIndex]);
-               }
-
-               totalExclusionEnergy += realSpaceEwaldEnergy;
-               if (energyByAtom) {
-                   energyByAtom[ii] -= realSpaceEwaldEnergy;
-                   energyByAtom[jj] -= realSpaceEwaldEnergy;
-               }
+                int ii = i;
+                int jj = *iter;
+
+                RealOpenMM deltaR[2][ReferenceForce::LastDeltaRIndex];
+                ReferenceForce::getDeltaR(atomCoordinates[jj], atomCoordinates[ii], deltaR[0]);
+                RealOpenMM r         = deltaR[0][ReferenceForce::RIndex];
+                RealOpenMM inverseR  = one/(deltaR[0][ReferenceForce::RIndex]);
+                RealOpenMM alphaR    = alphaEwald * r;
+                if (erf(alphaR) > 1e-6) {
+                    RealOpenMM dEdR      = (RealOpenMM) (ONE_4PI_EPS0 * atomParameters[ii][QIndex] * atomParameters[jj][QIndex] * inverseR * inverseR * inverseR);
+                    dEdR      = (RealOpenMM) (dEdR * (erf(alphaR) - 2 * alphaR * exp (- alphaR * alphaR) / SQRT_PI));
+
+                    // accumulate forces
+
+                    for (int kk = 0; kk < 3; kk++) {
+                        RealOpenMM force  = dEdR*deltaR[0][kk];
+                        forces[ii][kk]   -= force;
+                        forces[jj][kk]   += force;
+                    }
+
+                    // accumulate energies
+
+                    realSpaceEwaldEnergy = (RealOpenMM) (ONE_4PI_EPS0*atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erf(alphaR));
+                }
+                else {
+                    realSpaceEwaldEnergy = (RealOpenMM) (alphaEwald*TWO_OVER_SQRT_PI*ONE_4PI_EPS0*atomParameters[ii][QIndex]*atomParameters[jj][QIndex]);
+                }
+
+                if(ljpme){
+                    // Dispersion terms.  Here we just back out the reciprocal space terms, and don't add any extra real space terms.
+                    RealOpenMM dalphaR   = alphaDispersionEwald * r;
+                    RealOpenMM inverseR2 = inverseR*inverseR;
+                    RealOpenMM dar2 = dalphaR*dalphaR;
+                    RealOpenMM dar4 = dar2*dar2;
+                    RealOpenMM dar6 = dar4*dar2;
+                    RealOpenMM c6i = 8.0*pow(atomParameters[ii][SigIndex], 3.0) * atomParameters[ii][EpsIndex];
+                    RealOpenMM c6j = 8.0*pow(atomParameters[jj][SigIndex], 3.0) * atomParameters[jj][EpsIndex];
+                    realSpaceEwaldEnergy -= c6i*c6j*inverseR2*inverseR2*inverseR2*(1.0 - EXP(-dar2) * (1.0 + dar2 + 0.5*dar4));
+                    RealOpenMM dEdR = -6.0*c6i*c6j*inverseR2*inverseR2*inverseR2*inverseR2*(1.0 - EXP(-dar2) * (1.0 + dar2 + 0.5*dar4 + dar6/6.0));
+                    for (int kk = 0; kk < 3; kk++) {
+                        RealOpenMM force  = dEdR*deltaR[0][kk];
+                        forces[ii][kk]   -= force;
+                        forces[jj][kk]   += force;
+                    }
+                }
+
+                totalExclusionEnergy += realSpaceEwaldEnergy;
+                if (energyByAtom) {
+                    energyByAtom[ii] -= realSpaceEwaldEnergy;
+                    energyByAtom[jj] -= realSpaceEwaldEnergy;
+                }
            }
        }

@@ -488,31 +590,31 @@ void ReferenceLJCoulombIxn::calculatePairIxn(int numberOfAtoms, vector<RealVec>&
                                             RealOpenMM* fixedParameters, vector<RealVec>& forces,
                                             RealOpenMM* energyByAtom, RealOpenMM* totalEnergy, bool includeDirect, bool includeReciprocal) const {

-   if (ewald || pme) {
-       calculateEwaldIxn(numberOfAtoms, atomCoordinates, atomParameters, exclusions, fixedParameters, forces, energyByAtom,
-               totalEnergy, includeDirect, includeReciprocal);
-       return;
-   }
-   if (!includeDirect)
-       return;
-   if (cutoff) {
-       for (int i = 0; i < (int) neighborList->size(); i++) {
-           OpenMM::AtomPair pair = (*neighborList)[i];
-           calculateOneIxn(pair.first, pair.second, atomCoordinates, atomParameters, forces, energyByAtom, totalEnergy);
-       }
-   }
-   else {
-       for (int ii = 0; ii < numberOfAtoms; ii++) {
-          // loop over atom pairs
-
-          for (int jj = ii+1; jj < numberOfAtoms; jj++)
-              if (exclusions[jj].find(ii) == exclusions[jj].end())
-                  calculateOneIxn(ii, jj, atomCoordinates, atomParameters, forces, energyByAtom, totalEnergy);
-       }
-   }
+    if (ewald || pme || ljpme) {
+        calculateEwaldIxn(numberOfAtoms, atomCoordinates, atomParameters, exclusions, fixedParameters, forces, energyByAtom,
+                          totalEnergy, includeDirect, includeReciprocal);
+        return;
+    }
+    if (!includeDirect)
+        return;
+    if (cutoff) {
+        for (int i = 0; i < (int) neighborList->size(); i++) {
+            OpenMM::AtomPair pair = (*neighborList)[i];
+            calculateOneIxn(pair.first, pair.second, atomCoordinates, atomParameters, forces, energyByAtom, totalEnergy);
+        }
+    }
+    else {
+        for (int ii = 0; ii < numberOfAtoms; ii++) {
+            // loop over atom pairs
+
+            for (int jj = ii+1; jj < numberOfAtoms; jj++)
+                if (exclusions[jj].find(ii) == exclusions[jj].end())
+                    calculateOneIxn(ii, jj, atomCoordinates, atomParameters, forces, energyByAtom, totalEnergy);
+        }
+    }
 }

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

     Calculate LJ Coulomb pair ixn between two atoms

@@ -527,8 +629,8 @@ void ReferenceLJCoulombIxn::calculatePairIxn(int numberOfAtoms, vector<RealVec>&
     --------------------------------------------------------------------------------------- */

 void ReferenceLJCoulombIxn::calculateOneIxn(int ii, int jj, vector<RealVec>& atomCoordinates,
-                        RealOpenMM** atomParameters, vector<RealVec>& forces,
-                        RealOpenMM* energyByAtom, RealOpenMM* totalEnergy) const {
+                                            RealOpenMM** atomParameters, vector<RealVec>& forces,
+                                            RealOpenMM* energyByAtom, RealOpenMM* totalEnergy) const {

    // ---------------------------------------------------------------------------------------

@@ -572,7 +674,7 @@ void ReferenceLJCoulombIxn::calculateOneIxn(int ii, int jj, vector<RealVec>& ato
    }
    RealOpenMM sig       = atomParameters[ii][SigIndex] +  atomParameters[jj][SigIndex];
    RealOpenMM sig2      = inverseR*sig;
-               sig2     *= sig2;
+    sig2     *= sig2;
    RealOpenMM sig6      = sig2*sig2*sig2;

    RealOpenMM eps       = atomParameters[ii][EpsIndex]*atomParameters[jj][EpsIndex];
@@ -595,18 +697,18 @@ void ReferenceLJCoulombIxn::calculateOneIxn(int ii, int jj, vector<RealVec>& ato
    // accumulate forces

    for (int kk = 0; kk < 3; kk++) {
-       RealOpenMM force  = dEdR*deltaR[0][kk];
-       forces[ii][kk]   += force;
-       forces[jj][kk]   -= force;
+        RealOpenMM force  = dEdR*deltaR[0][kk];
+        forces[ii][kk]   += force;
+        forces[jj][kk]   -= force;
    }

    // accumulate energies

    if (totalEnergy)
-       *totalEnergy += energy;
+        *totalEnergy += energy;
    if (energyByAtom) {
-       energyByAtom[ii] += energy;
-       energyByAtom[jj] += energy;
+        energyByAtom[ii] += energy;
+        energyByAtom[jj] += energy;
    }
-  }
+}

--- a/platforms/reference/src/SimTKReference/ReferencePME.cpp
+++ b/platforms/reference/src/SimTKReference/ReferencePME.cpp
@@ -513,6 +513,106 @@ pme_reciprocal_convolution(pme_t     pme,
 }


+static void
+dpme_reciprocal_convolution(pme_t     pme,
+                           const RealVec periodicBoxVectors[3],
+                           const RealVec recipBoxVectors[3],
+                           RealOpenMM *  energy)
+{   /* Reciprocal-space convolution for the dispersion (C6) PME grid.
+     *
+     * Visits every point of pme->grid (pme_exec_dpme calls this right after
+     * the forward 3D FFT), multiplies the complex grid value in place by a
+     * real factor eterm, and accumulates eterm times the squared magnitude
+     * of the value as it was before scaling.
+     *
+     * pme                 supplies ngrid, grid, bsplines_moduli and ewaldcoeff
+     * periodicBoxVectors  only the diagonal elements [0][0], [1][1], [2][2] are read
+     * recipBoxVectors     elements [0][0], [1][0], [1][1], [2][0], [2][1], [2][2] are read
+     * energy              output; receives the negated sum and is dereferenced
+     *                     unconditionally, so it must not be null
+     */
+    int kx,ky,kz;
+    int nx,ny,nz;
+    RealOpenMM mx,my,mz;
+    RealOpenMM mhx,mhy,mhz,m2;
+    RealOpenMM bx,by,bz;
+    RealOpenMM d1,d2;
+    RealOpenMM eterm,struct2,ets2;
+    RealOpenMM esum;
+    RealOpenMM denom;
+    RealOpenMM boxfactor;
+    RealOpenMM maxkx,maxky,maxkz;
+
+    t_complex *ptr;
+
+    nx = pme->ngrid[0];
+    ny = pme->ngrid[1];
+    nz = pme->ngrid[2];
+
+    boxfactor = (RealOpenMM) M_PI*sqrt(M_PI) / (6.0*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]); /* pi^(3/2) / (6 * product of the diagonal box elements) */
+
+    esum = 0; /* running sum of eterm*struct2 over all grid points */
+
+    maxkx = (RealOpenMM) ((nx+1)/2); /* integer division happens before the cast; indices below this keep a non-negative frequency */
+    maxky = (RealOpenMM) ((ny+1)/2);
+    maxkz = (RealOpenMM) ((nz+1)/2);
+
+    RealOpenMM bfac = M_PI / pme->ewaldcoeff; /* b = bfac*m inside the loop */
+    RealOpenMM fac1 = 2.0*M_PI*M_PI*M_PI*sqrt(M_PI); /* multiplies erfc(b)*m^3 in eterm */
+    RealOpenMM fac2 = pme->ewaldcoeff*pme->ewaldcoeff*pme->ewaldcoeff; /* constant part of the exp(-b*b) coefficient */
+    RealOpenMM fac3 = -2.0*pme->ewaldcoeff*M_PI*M_PI; /* m^2 part of the exp(-b*b) coefficient */
+    RealOpenMM b, m, m3, expfac, expterm, erfcterm;
+
+    for (kx=0;kx<nx;kx++)
+    {
+        /* Calculate frequency. Grid indices in the upper half correspond to negative frequencies! */
+        mx  = (RealOpenMM) ((kx<maxkx) ? kx : (kx-nx));
+        mhx = mx*recipBoxVectors[0][0];
+        bx  = pme->bsplines_moduli[0][kx];
+
+        for (ky=0;ky<ny;ky++)
+        {
+            /* Calculate frequency. Grid indices in the upper half correspond to negative frequencies! */
+            my  = (RealOpenMM) ((ky<maxky) ? ky : (ky-ny));
+            mhy = mx*recipBoxVectors[1][0]+my*recipBoxVectors[1][1];
+            by  = pme->bsplines_moduli[1][ky];
+
+            for (kz=0;kz<nz;kz++)
+            {
+                /*
+                 * Unlike the Coulombic case, there's an m=0 term so all terms are considered here.
+                 * The expression for eterm below contains no division by m or m2, so the
+                 * kx=ky=kz=0 point (m2 == 0) evaluates to fac2*denom.
+                 */
+
+                /* Calculate frequency. Grid indices in the upper half correspond to negative frequencies! */
+                mz        = (RealOpenMM) ((kz<maxkz) ? kz : (kz-nz));
+                mhz       = mx*recipBoxVectors[2][0]+my*recipBoxVectors[2][1]+mz*recipBoxVectors[2][2];
+
+                /* Pointer to the grid cell in question (grid is indexed as kx*ny*nz + ky*nz + kz) */
+                ptr       = pme->grid + kx*ny*nz + ky*nz + kz;
+
+                /* Get grid data for this frequency */
+                d1        = ptr->re;
+                d2        = ptr->im;
+
+                /* Calculate the convolution - see the Essmann/Darden paper for the equation! */
+                m2        = mhx*mhx+mhy*mhy+mhz*mhz;
+                bz        = pme->bsplines_moduli[2][kz];
+                denom     = boxfactor / (bx*by*bz);
+
+                m = sqrt(m2);
+                m3 = m*m2;
+                b = bfac*m;
+                expfac = -b*b;
+                erfcterm = erfc(b);
+                expterm = exp(expfac);
+
+                eterm     = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+
+                /* write back convolution data to grid */
+                ptr->re   = d1*eterm;
+                ptr->im   = d2*eterm;
+
+                struct2   = (d1*d1+d2*d2); /* uses d1/d2 as read before the write-back above */
+
+                /* Long-range PME contribution to the energy for this frequency */
+                ets2      = eterm*struct2;
+                esum     += ets2;
+            }
+        }
+    }
+    // Remember the C6 energy is attractive, hence the negative sign.
+    *energy = (RealOpenMM) (-esum);
+}
+
+
 static void
 pme_grid_interpolate_force(pme_t pme,
                           const RealVec recipBoxVectors[3],
@@ -704,6 +804,49 @@ int pme_exec(pme_t       pme,
 }


+int pme_exec_dpme(pme_t       pme,
+             const vector<RealVec>& atomCoordinates,
+             vector<RealVec>& forces,
+             const vector<RealOpenMM>& c6s,
+             const RealVec periodicBoxVectors[3],
+             RealOpenMM* energy)
+{
+    /* Run the reciprocal-space dispersion PME steps, in the order listed below,
+     * on an already initialized pme handle.
+     *
+     * pme                 PME data handle; its grid, indices and bsplines are updated here
+     * atomCoordinates     particle positions
+     * forces              handed to pme_grid_interpolate_force() together with c6s
+     *                     (NOTE(review): confirm whether that routine accumulates into
+     *                     or overwrites this vector)
+     * c6s                 per-atom values spread on the grid in place of charges
+     * periodicBoxVectors  box vectors; inverted locally to obtain recipBoxVectors
+     * energy              output written by dpme_reciprocal_convolution()
+     *
+     * Always returns 0.
+     */
+
+    RealVec recipBoxVectors[3];
+    invert_box_vectors(periodicBoxVectors, recipBoxVectors);
+
+    /* Before we can do the actual interpolation, we need to recalculate and update
+     * the indices for each particle in the charge grid (initialized in pme_init()),
+     * and what its fractional offset in this grid cell is.
+     */
+
+    /* Update charge grid indices and fractional offsets for each atom.
+     * The indices/fractions are stored internally in the pme datatype
+     */
+    pme_update_grid_index_and_fraction(pme,atomCoordinates,periodicBoxVectors,recipBoxVectors);
+
+    /* Calculate bsplines (and their differentials) from current fractional coordinates, store in pme structure */
+    pme_update_bsplines(pme);
+
+    /* Spread the per-atom c6s values on grid (using newly calculated bsplines in the pme structure) */
+    pme_grid_spread_charge(pme, c6s);
+
+    /* do 3d-fft (in place: pme->grid is both input and output) */
+    fftpack_exec_3d(pme->fftplan,FFTPACK_FORWARD,pme->grid,pme->grid);
+
+    /* solve in k-space; this scales the grid in place and writes *energy */
+    dpme_reciprocal_convolution(pme,periodicBoxVectors,recipBoxVectors,energy);
+
+    /* do 3d-invfft (in place) */
+    fftpack_exec_3d(pme->fftplan,FFTPACK_BACKWARD,pme->grid,pme->grid);
+
+    /* Get the particle forces from the grid and bsplines in the pme structure */
+    pme_grid_interpolate_force(pme,recipBoxVectors,c6s,forces);
+
+    return 0;
+}
+

 int
 pme_destroy(pme_t    pme)

--- a/platforms/reference/src/SimTKReference/ReferenceStochasticDynamics.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceStochasticDynamics.cpp

-/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2016 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -41,20 +41,22 @@ using namespace OpenMM;

   @param numberOfAtoms  number of atoms
   @param deltaT         delta t for dynamics
-   @param tau            viscosity(?)
+   @param friction       friction coefficient (must be >= 0)
   @param temperature    temperature

   --------------------------------------------------------------------------------------- */

 ReferenceStochasticDynamics::ReferenceStochasticDynamics(int numberOfAtoms,
-                                                         RealOpenMM deltaT, RealOpenMM tau,
+                                                         RealOpenMM deltaT, RealOpenMM friction,
                                                         RealOpenMM temperature) : 
-           ReferenceDynamics(numberOfAtoms, deltaT, temperature), _tau(tau) {
-   if (tau <= 0) {
-      std::stringstream message;
-      message << "illegal tau value: " << tau;
-      throw OpenMMException(message.str());
-   }
+           ReferenceDynamics(numberOfAtoms, deltaT, temperature), friction(friction) {
+   // Zero friction is legal (updatePart1 special-cases it), but a negative value would make
+   // vscale exceed 1 and turn noisescale into the square root of a negative number.
+   if (friction < 0) {
+      std::stringstream message;
+      message << "illegal friction value: " << friction;
+      throw OpenMMException(message.str());
+   }
   xPrime.resize(numberOfAtoms);
   inverseMasses.resize(numberOfAtoms);
 }
@@ -77,21 +72,12 @@ ReferenceStochasticDynamics::~ReferenceStochasticDynamics() {

 /**---------------------------------------------------------------------------------------

-   Get tau
-
-   @return tau
+   Get friction coefficient

   --------------------------------------------------------------------------------------- */

-RealOpenMM ReferenceStochasticDynamics::getTau() const {
-
-   // ---------------------------------------------------------------------------------------
-
-   // static const char* methodName  = "\nReferenceStochasticDynamics::getTau";
-
-   // ---------------------------------------------------------------------------------------
-
-   return _tau;
+RealOpenMM ReferenceStochasticDynamics::getFriction() const {
+   return friction;
 }

 /**---------------------------------------------------------------------------------------
@@ -120,11 +106,12 @@ void ReferenceStochasticDynamics::updatePart1(int numberOfAtoms, vector<RealVec>

   // perform first update

-   RealOpenMM tau = getTau();
-   const RealOpenMM vscale = EXP(-getDeltaT()/tau);
-   const RealOpenMM fscale = (1-vscale)*tau;
+   RealOpenMM dt = getDeltaT();
+   RealOpenMM friction = getFriction();
+   const RealOpenMM vscale = EXP(-dt*friction);
+   const RealOpenMM fscale = (friction == 0 ? dt : (1-vscale)/friction);
   const RealOpenMM kT = BOLTZ*getTemperature();
-   const RealOpenMM noisescale = SQRT(2*kT/tau)*SQRT(0.5*(1-vscale*vscale)*tau);
+   const RealOpenMM noisescale = SQRT(kT*(1-vscale*vscale));

   for (int ii = 0; ii < numberOfAtoms; ii++) {
       if (inverseMasses[ii] != 0.0) {

--- a/platforms/reference/src/SimTKReference/ReferenceVariableStochasticDynamics.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceVariableStochasticDynamics.cpp

-/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2016 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -42,21 +42,23 @@ using namespace OpenMM;

   @param numberOfAtoms  number of atoms
   @param deltaT         delta t for dynamics
-   @param tau            viscosity(?)
+   @param friction       friction coefficient (must be >= 0)
   @param temperature    temperature
   @param accuracy       required accuracy

   --------------------------------------------------------------------------------------- */

 ReferenceVariableStochasticDynamics::ReferenceVariableStochasticDynamics(int numberOfAtoms,
-                                                          RealOpenMM tau, RealOpenMM temperature,
+                                                          RealOpenMM friction, RealOpenMM temperature,
                                                          RealOpenMM accuracy) :
-           ReferenceDynamics(numberOfAtoms, 0.0f, temperature), _tau(tau), _accuracy(accuracy) {
-   if (tau <= 0) {
-      std::stringstream message;
-      message << "illegal tau value: " << tau;
-      throw OpenMMException(message.str());
-   }
+           ReferenceDynamics(numberOfAtoms, 0.0f, temperature), friction(friction), _accuracy(accuracy) {
+   // Zero friction is legal (updatePart1 special-cases it), but a negative value would make
+   // vscale exceed 1 and turn noisescale into the square root of a negative number.
+   if (friction < 0) {
+      std::stringstream message;
+      message << "illegal friction value: " << friction;
+      throw OpenMMException(message.str());
+   }
   xPrime.resize(numberOfAtoms);
   inverseMasses.resize(numberOfAtoms);
 }
@@ -101,21 +96,12 @@ void ReferenceVariableStochasticDynamics::setAccuracy(RealOpenMM accuracy) {

 /**---------------------------------------------------------------------------------------

-   Get tau
-
-   @return tau
+   Get friction coefficient

   --------------------------------------------------------------------------------------- */

-RealOpenMM ReferenceVariableStochasticDynamics::getTau() const {
-
-   // ---------------------------------------------------------------------------------------
-
-   // static const char* methodName  = "\nReferenceVariableStochasticDynamics::getTau";
-
-   // ---------------------------------------------------------------------------------------
-
-   return _tau;
+RealOpenMM ReferenceVariableStochasticDynamics::getFriction() const {
+   return friction;
 }

 /**---------------------------------------------------------------------------------------
@@ -178,11 +164,12 @@ void ReferenceVariableStochasticDynamics::updatePart1(int numberOfAtoms, vector<
 
    // perform first update

-   RealOpenMM tau = getTau();
-   const RealOpenMM vscale = EXP(-getDeltaT()/tau);
-   const RealOpenMM fscale = (1-vscale)*tau;
+   RealOpenMM dt = getDeltaT();
+   RealOpenMM friction = getFriction();
+   const RealOpenMM vscale = EXP(-dt*friction);
+   const RealOpenMM fscale = (friction == 0 ? dt : (1-vscale)/friction);
   const RealOpenMM kT = BOLTZ*getTemperature();
-   const RealOpenMM noisescale = SQRT(2*kT/tau)*SQRT(0.5*(1-vscale*vscale)*tau);
+   const RealOpenMM noisescale = SQRT(kT*(1-vscale*vscale));

   for (int ii = 0; ii < numberOfAtoms; ii++) {
       if (masses[ii] != 0) {
@@ -266,11 +253,11 @@ void ReferenceVariableStochasticDynamics::update(const OpenMM::System& system, v

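-   // copy xPrime -> atomCoordinates
+   // recompute velocities from the displacement over this step, then copy xPrime -> atomCoordinates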

+   RealOpenMM invStepSize = 1.0/getDeltaT();
   for (int ii = 0; ii < numberOfAtoms; ii++) {
       if (masses[ii] != 0.0) {
-           atomCoordinates[ii][0] = xPrime[ii][0];
-           atomCoordinates[ii][1] = xPrime[ii][1];
-           atomCoordinates[ii][2] = xPrime[ii][2];
+           velocities[ii] = (xPrime[ii]-atomCoordinates[ii])*invStepSize;
+           atomCoordinates[ii] = xPrime[ii];
       }
   }


--- a/platforms/reference/tests/TestReferenceMonteCarloMembraneBarostat.cpp
+++ b/platforms/reference/tests/TestReferenceMonteCarloMembraneBarostat.cpp
@@ -37,6 +37,7 @@
 #include "openmm/MonteCarloMembraneBarostat.h"
 #include "openmm/Context.h"
 #include "ReferencePlatform.h"
+#include "openmm/HarmonicBondForce.h"
 #include "openmm/NonbondedForce.h"
 #include "openmm/System.h"
 #include "openmm/LangevinIntegrator.h"
@@ -76,8 +77,9 @@ void testIdealGas(MonteCarloMembraneBarostat::XYMode xymode, MonteCarloMembraneB
    }
    MonteCarloMembraneBarostat* barostat = new MonteCarloMembraneBarostat(pressure, tension, temp[0], xymode, zmode, frequency);
    system.addForce(barostat);
-    ASSERT(barostat->usesPeriodicBoundaryConditions());
-    ASSERT(system.usesPeriodicBoundaryConditions());
+    HarmonicBondForce* bonds = new HarmonicBondForce();
+    bonds->setUsesPeriodicBoundaryConditions(true);
+    system.addForce(bonds); // So it won't complain the system is non-periodic.

    // Test it for three different temperatures.

@@ -134,8 +136,6 @@ void testRandomSeed() {
    system.addForce(forceField);
    MonteCarloMembraneBarostat* barostat = new MonteCarloMembraneBarostat(pressure, tension, temp, MonteCarloMembraneBarostat::XYAnisotropic, MonteCarloMembraneBarostat::ZFree, 1);
    system.addForce(barostat);
-    ASSERT(barostat->usesPeriodicBoundaryConditions());
-    ASSERT(system.usesPeriodicBoundaryConditions());
    vector<Vec3> positions(numParticles);
    vector<Vec3> velocities(numParticles);
    for (int i = 0; i < numParticles; ++i) {

--- a/plugins/amoeba/openmmapi/src/AmoebaAngleForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaAngleForceImpl.cpp
@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaAngleForceImpl::getKernelNames() {

 void AmoebaAngleForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaAngleForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }
--- a/plugins/amoeba/openmmapi/src/AmoebaBondForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaBondForceImpl.cpp
@@ -75,4 +75,5 @@ vector<pair<int, int> > AmoebaBondForceImpl::getBondedParticles() const {

 void AmoebaBondForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaBondForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }
--- a/plugins/amoeba/openmmapi/src/AmoebaGeneralizedKirkwoodForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaGeneralizedKirkwoodForceImpl.cpp
@@ -66,4 +66,5 @@ std::vector<std::string> AmoebaGeneralizedKirkwoodForceImpl::getKernelNames() {

 void AmoebaGeneralizedKirkwoodForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaGeneralizedKirkwoodForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }
--- a/plugins/amoeba/openmmapi/src/AmoebaInPlaneAngleForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaInPlaneAngleForceImpl.cpp
@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaInPlaneAngleForceImpl::getKernelNames() {

 void AmoebaInPlaneAngleForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaInPlaneAngleForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }
--- a/plugins/amoeba/openmmapi/src/AmoebaMultipoleForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaMultipoleForceImpl.cpp
@@ -50,7 +50,8 @@ AmoebaMultipoleForceImpl::~AmoebaMultipoleForceImpl() {
 void AmoebaMultipoleForceImpl::initialize(ContextImpl& context) {

    const System& system = context.getSystem();
-    if (owner.getNumMultipoles() != system.getNumParticles())
+    int numParticles = system.getNumParticles();
+    if (owner.getNumMultipoles() != numParticles)
        throw OpenMMException("AmoebaMultipoleForce must have exactly as many particles as the System it belongs to.");

    // check cutoff < 0.5*boxSize
@@ -64,7 +65,7 @@ void AmoebaMultipoleForceImpl::initialize(ContextImpl& context) {
    }

    double quadrupoleValidationTolerance = 1.0e-05;
-    for (int ii = 0; ii < system.getNumParticles(); ii++) {
+    for (int ii = 0; ii < numParticles; ii++) {

        int axisType, multipoleAtomZ, multipoleAtomX, multipoleAtomY;
        double charge, thole, dampingFactor, polarity ;
@@ -121,6 +122,23 @@ void AmoebaMultipoleForceImpl::initialize(ContextImpl& context) {
             buffer << "] (ZThenX, Bisector, Z-Bisect, ThreeFold, NoAxisType) currently handled .";
             throw OpenMMException(buffer.str());
        }
+        if (axisType != AmoebaMultipoleForce::NoAxisType && (multipoleAtomZ < 0 || multipoleAtomZ >= numParticles)) {
+            std::stringstream buffer;
+            buffer << "AmoebaMultipoleForce: invalid z axis particle: " << multipoleAtomZ;
+            throw OpenMMException(buffer.str());
+        }
+        if (axisType != AmoebaMultipoleForce::NoAxisType && axisType != AmoebaMultipoleForce::ZOnly &&
+                (multipoleAtomX < 0 || multipoleAtomX >= numParticles)) {
+            std::stringstream buffer;
+            buffer << "AmoebaMultipoleForce: invalid x axis particle: " << multipoleAtomX;
+            throw OpenMMException(buffer.str());
+        }
+        if ((axisType == AmoebaMultipoleForce::ZBisect || axisType == AmoebaMultipoleForce::ThreeFold) &&
+                (multipoleAtomY < 0 || multipoleAtomY >= numParticles)) {
+            std::stringstream buffer;
+            buffer << "AmoebaMultipoleForce: invalid y axis particle: " << multipoleAtomY;
+            throw OpenMMException(buffer.str());
+        }
    }
    kernel = context.getPlatform().createKernel(CalcAmoebaMultipoleForceKernel::Name(), context);
    kernel.getAs<CalcAmoebaMultipoleForceKernel>().initialize(context.getSystem(), owner);
@@ -206,6 +224,7 @@ void AmoebaMultipoleForceImpl::getSystemMultipoleMoments(ContextImpl& context, s

 void AmoebaMultipoleForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaMultipoleForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }

 void AmoebaMultipoleForceImpl::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {

--- a/plugins/amoeba/openmmapi/src/AmoebaOutOfPlaneBendForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaOutOfPlaneBendForceImpl.cpp
@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaOutOfPlaneBendForceImpl::getKernelNames() {

 void AmoebaOutOfPlaneBendForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaOutOfPlaneBendForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }
--- a/plugins/amoeba/openmmapi/src/AmoebaPiTorsionForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaPiTorsionForceImpl.cpp
@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaPiTorsionForceImpl::getKernelNames() {

 void AmoebaPiTorsionForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaPiTorsionForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }
--- a/plugins/amoeba/openmmapi/src/AmoebaStretchBendForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaStretchBendForceImpl.cpp
@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaStretchBendForceImpl::getKernelNames() {

 void AmoebaStretchBendForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaStretchBendForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }
--- a/plugins/amoeba/platforms/cuda/CMakeLists.txt
+++ b/plugins/amoeba/platforms/cuda/CMakeLists.txt
@@ -21,6 +21,7 @@ SET(OPENMM_SOURCE_SUBDIRS .)
 SET(OPENMMAMOEBACUDA_LIBRARY_NAME OpenMMAmoebaCUDA)

 SET(SHARED_TARGET ${OPENMMAMOEBACUDA_LIBRARY_NAME})
+SET(STATIC_TARGET ${OPENMMAMOEBACUDA_LIBRARY_NAME}_static)


 # These are all the places to search for header files which are
@@ -85,17 +86,42 @@ ADD_CUSTOM_COMMAND(OUTPUT ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H}
    DEPENDS ${CUDA_KERNELS}
 )
 SET_SOURCE_FILES_PROPERTIES(${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H} PROPERTIES GENERATED TRUE)
-ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})

-TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB})
-TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME}CUDA)
-TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${SHARED_AMOEBA_TARGET})
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY")
-IF (APPLE)
-    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
-ELSE (APPLE)
-    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
-ENDIF (APPLE)
+# Build the shared plugin library.
+
+IF (OPENMM_BUILD_SHARED_LIB)
+    ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
+
+    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB})
+    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME}CUDA)
+    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${SHARED_AMOEBA_TARGET})
+    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY")
+    IF (APPLE)
+        SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
+    ELSE (APPLE)
+        SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
+    ENDIF (APPLE)
+
+    # Install inside this guard so a static-only build never references a target that was not created.
+    INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
+ENDIF (OPENMM_BUILD_SHARED_LIB)
+
+# Build the static plugin library.
+
+IF(OPENMM_BUILD_STATIC_LIB)
+    ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
+
+    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB})
+    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME}CUDA)
+    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${STATIC_AMOEBA_TARGET})
+    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_STATIC_LIBRARY")
+    IF (APPLE)
+        SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
+    ELSE (APPLE)
+        SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
+    ENDIF (APPLE)
+
+    INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
+ENDIF(OPENMM_BUILD_STATIC_LIB)

-INSTALL(TARGETS ${SHARED_TARGET} DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/plugins)
 # Ensure that links to the main CUDA library will be resolved.

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
 * Authors: Mark Friedrichs, Peter Eastman                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -33,10 +33,18 @@

 using namespace OpenMM;

+#ifdef OPENMM_BUILDING_STATIC_LIBRARY
+static void registerPlatforms() {
+#else
 extern "C" OPENMM_EXPORT void registerPlatforms() {
+#endif
 }

+#ifdef OPENMM_BUILDING_STATIC_LIBRARY
+static void registerKernelFactories() {
+#else
 extern "C" OPENMM_EXPORT void registerKernelFactories() {
+#endif
    try {
        Platform& platform = Platform::getPlatformByName("CUDA");
        AmoebaCudaKernelFactory* factory = new AmoebaCudaKernelFactory();
@@ -105,4 +113,4 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
        return new CudaCalcAmoebaWcaDispersionForceKernel(name, platform, cu, context.getSystem());

    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
-}
+}
\ No newline at end of file
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -41,7 +41,7 @@
 #include "CudaForceInfo.h"
 #include "CudaKernelSources.h"
 #include "CudaNonbondedUtilities.h"
-#include "jama_svd.h"
+#include "jama_lu.h"

 #include <algorithm>
 #include <cmath>
@@ -52,10 +52,10 @@
 using namespace OpenMM;
 using namespace std;

-#define CHECK_RESULT(result) \
+#define CHECK_RESULT(result, prefix) \
    if (result != CUDA_SUCCESS) { \
        std::stringstream m; \
-        m<<errorMessage<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        m<<prefix<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
        throw OpenMMException(m.str());\
    }

@@ -813,7 +813,7 @@ private:
 };

 CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : 
-        CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false),
+        CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false), hasCreatedEvent(false),
        multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL), sphericalDipoles(NULL), sphericalQuadrupoles(NULL),
        fracDipoles(NULL), fracQuadrupoles(NULL), field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), inducedDipole(NULL),
        diisCoefficients(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), prevDipoles(NULL), prevDipolesPolar(NULL), prevDipolesGk(NULL),
@@ -822,7 +822,7 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri
        inducedDipoleFieldGradientGk(NULL), inducedDipoleFieldGradientGkPolar(NULL), extrapolatedDipoleFieldGradient(NULL), extrapolatedDipoleFieldGradientPolar(NULL),
        extrapolatedDipoleFieldGradientGk(NULL), extrapolatedDipoleFieldGradientGkPolar(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
        pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL),
-        pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeCphi(NULL), pmeAtomGridIndex(NULL), lastPositions(NULL), sort(NULL), gkKernel(NULL) {
+        pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeCphi(NULL), lastPositions(NULL), sort(NULL), gkKernel(NULL) {
 }

 CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
@@ -927,14 +927,14 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
        delete pmePhidp;
    if (pmeCphi != NULL)
        delete pmeCphi;
-    if (pmeAtomGridIndex != NULL)
-        delete pmeAtomGridIndex;
    if (lastPositions != NULL)
        delete lastPositions;
    if (sort != NULL)
        delete sort;
    if (hasInitializedFFT)
        cufftDestroy(fft);
+    if (hasCreatedEvent)
+        cuEventDestroy(syncEvent);
 }

 void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const AmoebaMultipoleForce& force) {
@@ -1021,6 +1021,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        prevErrors = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevErrors");
        diisMatrix = new CudaArray(cu, MaxPrevDIISDipoles*MaxPrevDIISDipoles, elementSize, "diisMatrix");
        diisCoefficients = new CudaArray(cu, MaxPrevDIISDipoles+1, sizeof(float), "diisMatrix");
+        CHECK_RESULT(cuEventCreate(&syncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for AmoebaMultipoleForce");
+        hasCreatedEvent = true;
    }
    else if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
        int numOrders = force.getExtrapolationCoefficients().size();
@@ -1153,7 +1155,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
            NonbondedForce nb;
            nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
            nb.setCutoffDistance(force.getCutoffDistance());
-            NonbondedForceImpl::calcPMEParameters(system, nb, alpha, gridSizeX, gridSizeY, gridSizeZ);
+            NonbondedForceImpl::calcPMEParameters(system, nb, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
            gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
            gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
            gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ);
@@ -1212,6 +1214,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldByDIIS");
        recordDIISDipolesKernel = cu.getKernel(module, "recordInducedDipolesForDIIS");
        buildMatrixKernel = cu.getKernel(module, "computeDIISMatrix");
+        solveMatrixKernel = cu.getKernel(module, "solveDIISMatrix");
        initExtrapolatedKernel = cu.getKernel(module, "initExtrapolatedDipoles");
        iterateExtrapolatedKernel = cu.getKernel(module, "iterateExtrapolatedDipoles");
        computeExtrapolatedKernel = cu.getKernel(module, "computeExtrapolatedDipoles");
@@ -1253,7 +1256,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        else if (polarizationType == AmoebaMultipoleForce::Extrapolated)
            pmeDefines["EXTRAPOLATED_POLARIZATION"] = "";
        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines);
-        pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
        pmeTransformMultipolesKernel = cu.getKernel(module, "transformMultipolesToFractionalCoordinates");
        pmeTransformPotentialKernel = cu.getKernel(module, "transformPotentialToCartesianCoordinates");
        pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles");
@@ -1285,7 +1287,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        pmePhidp = new CudaArray(cu, 20*numMultipoles, elementSize, "pmePhidp");
        pmeCphi = new CudaArray(cu, 10*numMultipoles, elementSize, "pmeCphi");
        pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
-        pmeAtomGridIndex = CudaArray::create<int2>(cu, numMultipoles, "pmeAtomGridIndex");
        sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
        cufftResult result = cufftPlan3d(&fft, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2Z : CUFFT_C2C);
        if (result != CUFFT_SUCCESS)
@@ -1569,16 +1570,11 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        // Reciprocal space calculation.
        
        unsigned int maxTiles = nb.getInteractingTiles().getSize();
-        void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
-            cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
-            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
-        cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms(), cu.ThreadBlockSize, cu.ThreadBlockSize*PmeOrder*PmeOrder*elementSize);
-        sort->sort(*pmeAtomGridIndex);
        void* pmeTransformMultipolesArgs[] = {&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
            &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeTransformMultipolesKernel, pmeTransformMultipolesArgs, cu.getNumAtoms());
        void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(),
-            &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),  cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+            &pmeGrid->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms());
        void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()};
@@ -1590,7 +1586,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
        void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(),
            &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
-        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms());
+        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
        if (cu.getUseDoublePrecision())
            cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
        else
@@ -1598,7 +1594,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        void* pmeFixedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhi->getDevicePointer(), &field->getDevicePointer(),
            &fieldPolar ->getDevicePointer(), &cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(),
            cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
-            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};
+            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms());
        void* pmeTransformFixedPotentialArgs[] = {&pmePhi->getDevicePointer(), &pmeCphi->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeTransformPotentialKernel, pmeTransformFixedPotentialArgs, cu.getNumAtoms());
@@ -1625,7 +1621,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in

        cu.clearBuffer(*pmeGrid);
        void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
-            &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+            &pmeGrid->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
        if (cu.getUseDoublePrecision())
@@ -1634,15 +1630,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
        else
            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
-        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms());
+        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
        if (cu.getUseDoublePrecision())
            cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
        else
            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
        void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
            &pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(),
-            cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2],
-            &pmeAtomGridIndex->getDevicePointer()};
+            cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
        
        // Iterate until the dipoles converge.
@@ -1771,7 +1766,7 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
        cu.executeKernel(computeInducedFieldKernel, &computeInducedFieldArgs[0], numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
        cu.clearBuffer(*pmeGrid);
        void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
-            &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+            &pmeGrid->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
        if (cu.getUseDoublePrecision()) {
@@ -1784,15 +1779,14 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
        void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(),
            &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
-        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms());
+        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
        if (cu.getUseDoublePrecision())
            cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
        else
            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
        void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
            &pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(),
-            cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2],
-            &pmeAtomGridIndex->getDevicePointer()};
+            cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
        if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
            void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
@@ -1831,22 +1825,24 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
    cu.executeKernel(recordDIISDipolesKernel, recordDIISDipolesArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
    float2* errors = (float2*) cu.getPinnedBuffer();
    inducedDipoleErrors->download(errors, false);
+    cuEventRecord(syncEvent, cu.getCurrentStream());
    
    // Build the DIIS matrix.
    
    int numPrev = (iteration+1 < MaxPrevDIISDipoles ? iteration+1 : MaxPrevDIISDipoles);
    void* buildMatrixArgs[] = {&prevErrors->getDevicePointer(), &iteration, &diisMatrix->getDevicePointer()};
    int threadBlocks = min(numPrev, cu.getNumThreadBlocks());
-    cu.executeKernel(buildMatrixKernel, buildMatrixArgs, threadBlocks*128, 128, 128*elementSize);
-    vector<float> matrixf;
-    vector<double> matrix;
-    if (cu.getUseDoublePrecision())
-        diisMatrix->download(matrix);
-    else
-        diisMatrix->download(matrixf);
+    int blockSize = 512;
+    cu.executeKernel(buildMatrixKernel, buildMatrixArgs, threadBlocks*blockSize, blockSize, blockSize*elementSize);
+    
+    // Solve the matrix.
+
+    void* solveMatrixArgs[] = {&iteration, &diisMatrix->getDevicePointer(), &diisCoefficients->getDevicePointer()};
+    cu.executeKernel(solveMatrixKernel, solveMatrixArgs, 32, 32);
    
    // Determine whether the iteration has converged.
    
+    cuEventSynchronize(syncEvent);
    double total1 = 0.0, total2 = 0.0;
    for (int j = 0; j < inducedDipoleErrors->getSize(); j++) {
        total1 += errors[j].x;
@@ -1854,56 +1850,16 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
    }
    if (48.033324*sqrt(max(total1, total2)/cu.getNumAtoms()) < inducedEpsilon)
        return true;
-
-    // Compute the coefficients for selecting the new dipoles.
-
-    float* coefficients = (float*) cu.getPinnedBuffer();
-    if (iteration == 0)
-        coefficients[0] = 1;
-    else {
-        int rank = numPrev+1;
-        Array2D<double> b(rank, rank);
-        b[0][0] = 0;
-        for (int i = 1; i < rank; i++)
-            b[i][0] = b[0][i] = -1;
-        if (cu.getUseDoublePrecision()) {
-            for (int i = 0; i < numPrev; i++)
-                for (int j = 0; j < numPrev; j++)
-                    b[i+1][j+1] = matrix[i*MaxPrevDIISDipoles+j];
-        }
-        else {
-            for (int i = 0; i < numPrev; i++)
-                for (int j = 0; j < numPrev; j++)
-                    b[i+1][j+1] = matrixf[i*MaxPrevDIISDipoles+j];
-        }
-
-        // Solve using SVD.  Since the right hand side is (-1, 0, 0, 0, ...), this is simpler than the general case.
-
-        JAMA::SVD<double> svd(b);
-        Array2D<double> u, v;
-        svd.getU(u);
-        svd.getV(v);
-        Array1D<double> s;
-        svd.getSingularValues(s);
-        int effectiveRank = svd.rank();
-        for (int i = 1; i < rank; i++) {
-            double d = 0;
-            for (int j = 0; j < effectiveRank; j++)
-                d -= u[0][j]*v[i][j]/s[j];
-            coefficients[i-1] = d;
-        }
-    }
-    diisCoefficients->upload(coefficients, false);
    
    // Compute the dipoles.
    
    void* updateInducedFieldArgs[] = {&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
        &prevDipoles->getDevicePointer(), &prevDipolesPolar->getDevicePointer(), &diisCoefficients->getDevicePointer(), &numPrev};
-    cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize);
+    cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, 3*cu.getNumAtoms(), 256);
    if (gkKernel != NULL) {
        void* updateInducedFieldGkArgs[] = {&gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
            &prevDipolesGk->getDevicePointer(), &prevDipolesGkPolar->getDevicePointer(), &diisCoefficients->getDevicePointer(), &numPrev};
-        cu.executeKernel(updateInducedFieldKernel, updateInducedFieldGkArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize);
+        cu.executeKernel(updateInducedFieldKernel, updateInducedFieldGkArgs, 3*cu.getNumAtoms(), 256);
    }
    return false;
 }

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -408,7 +408,7 @@ private:
    int fixedFieldThreads, inducedFieldThreads, electrostaticsThreads;
    int gridSizeX, gridSizeY, gridSizeZ;
    double alpha, inducedEpsilon;
-    bool usePME, hasQuadrupoles, hasInitializedScaleFactors, hasInitializedFFT, multipolesAreValid;
+    bool usePME, hasQuadrupoles, hasInitializedScaleFactors, hasInitializedFFT, multipolesAreValid, hasCreatedEvent;
    AmoebaMultipoleForce::PolarizationType polarizationType;
    CudaContext& cu;
    const System& system;
@@ -465,16 +465,16 @@ private:
    CudaArray* pmePhidp;
    CudaArray* pmeCphi;
    CudaArray* pmeAtomRange;
-    CudaArray* pmeAtomGridIndex;
    CudaArray* lastPositions;
    CudaSort* sort;
    cufftHandle fft;
    CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel;
-    CUfunction pmeGridIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
+    CUfunction pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
    CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
-    CUfunction recordDIISDipolesKernel, buildMatrixKernel;
+    CUfunction recordDIISDipolesKernel, buildMatrixKernel, solveMatrixKernel;
    CUfunction initExtrapolatedKernel, iterateExtrapolatedKernel, computeExtrapolatedKernel, addExtrapolatedGradientKernel;
    CUfunction pmeTransformMultipolesKernel, pmeTransformPotentialKernel;
+    CUevent syncEvent;
    CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel;
    static const int PmeOrder = 5;
    static const int MaxPrevDIISDipoles = 20;