Merge branch 'master' into amber-switching

Conflicts: wrappers/python/simtk/openmm/app/amberprmtopfile.py In fixing the merge conflict, I went ahead and fixed up the switchDistance logic to match what I did in CharmmPsfFile.

Merge branch 'master' into amber-switching
Conflicts: wrappers/python/simtk/openmm/app/amberprmtopfile.py In fixing the merge conflict, I went ahead and fixed up the switchDistance logic to match what I did in CharmmPsfFile.
1c938ceb · Jason Swails · a1113e7b · 167ae8a0 · 1c938ceb · a1113e7b
Commit 1c938ceb authored Nov 20, 2015 by Jason Swails
20 changed files
--- a/openmmapi/src/CustomNonbondedForceImpl.cpp
+++ b/openmmapi/src/CustomNonbondedForceImpl.cpp
@@ -111,6 +111,26 @@ void CustomNonbondedForceImpl::initialize(ContextImpl& context) {
        if (cutoff > 0.5*boxVectors[0][0] || cutoff > 0.5*boxVectors[1][1] || cutoff > 0.5*boxVectors[2][2])
            throw OpenMMException("CustomNonbondedForce: The cutoff distance cannot be greater than half the periodic box size.");
    }
+    // Check that all interaction groups only specify particles that have been defined.
+    for (int group = 0; group < owner.getNumInteractionGroups(); group++) {
+        set<int> set1, set2;
+        owner.getInteractionGroupParameters(group, set1, set2);
+        for (set<int>::iterator it = set1.begin(); it != set1.end(); ++it)
+            if ((*it < 0) || (*it >= owner.getNumParticles())) {
+                stringstream msg;
+                msg << "CustomNonbondedForce: Interaction group " << group << " set1 contains a particle index (" << *it << ") "
+                    << "not present in system (" << owner.getNumParticles() << " particles).";
+                throw OpenMMException(msg.str());
+            }
+        for (set<int>::iterator it = set2.begin(); it != set2.end(); ++it)
+            if ((*it < 0) || (*it >= owner.getNumParticles())) {
+                stringstream msg;
+                msg << "CustomNonbondedForce: Interaction group " << group << " set2 contains a particle index (" << *it << ") "
+                    << "not present in system (" << owner.getNumParticles() << " particles).";
+                throw OpenMMException(msg.str());
+            }
+    }
    kernel.getAs<CalcCustomNonbondedForceKernel>().initialize(context.getSystem(), owner);
 }

--- a/openmmapi/src/GBVIForceImpl.cpp
+++ b/openmmapi/src/GBVIForceImpl.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-#include "openmm/internal/GBVIForceImpl.h"
-#include "openmm/internal/ContextImpl.h"
-#include "openmm/OpenMMException.h"
-#include "openmm/kernels.h"
-#include <vector>
-#include <cmath>
-#include <cstdio>
-#include <sstream>
-using namespace OpenMM;
-using std::vector;
-GBVIForceImpl::GBVIForceImpl(const GBVIForce& owner) : owner(owner) {
-}
-void GBVIForceImpl::initialize(ContextImpl& context) {
-    kernel = context.getPlatform().createKernel(CalcGBVIForceKernel::Name(), context);
-    if (owner.getNumParticles() != context.getSystem().getNumParticles())
-        throw OpenMMException("GBVIForce must have exactly as many particles as the System it belongs to.");
-    const System& system      = context.getSystem();
-    int     numberOfParticles = owner.getNumParticles();
-    int numberOfBonds         = owner.getNumBonds();
-    // load 1-2 atom pairs along w/ bond distance using HarmonicBondForce & constraints
-    // numberOfBonds < 1, indicating they were not set by the user
-    if( numberOfBonds < 1 && numberOfParticles > 1 ){
-        (void) fprintf( stderr, "Warning: no covalent bonds set for GB/VI force!\n" ); 
-//        getBondsFromForces( context );
-//        numberOfBonds = owner.getNumBonds();
-    }
-    std::vector< std::vector<int> > bondIndices;
-    bondIndices.resize( numberOfBonds );
-    std::vector<double> bondLengths;
-    bondLengths.resize( numberOfBonds );
-    for (int i = 0; i < numberOfBonds; i++) {
-        int particle1, particle2;
-        double bondLength;
-        owner.getBondParameters(i, particle1, particle2, bondLength);
-        if (particle1 < 0 || particle1 >= owner.getNumParticles()) {
-            std::stringstream msg;
-            msg << "GBVISoftcoreForce: Illegal particle index: ";
-            msg << particle1;
-            throw OpenMMException(msg.str());
-        }
-        if (particle2 < 0 || particle2 >= owner.getNumParticles()) {
-            std::stringstream msg;
-            msg << "GBVISoftcoreForce: Illegal particle index: ";
-            msg << particle2;
-            throw OpenMMException(msg.str());
-        }
-        if (bondLength < 0 ) { 
-            std::stringstream msg;
-            msg << "GBVISoftcoreForce: negative bondlength: ";
-            msg << bondLength;
-            throw OpenMMException(msg.str());
-        }
-        bondIndices[i].push_back( particle1 );  
-        bondIndices[i].push_back( particle2 );
-        bondLengths[i] = bondLength;
-    }   
-    if (owner.getNonbondedMethod() == GBVIForce::CutoffPeriodic) {
-        Vec3 boxVectors[3];
-        system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
-        double cutoff = owner.getCutoffDistance();
-        if (cutoff > 0.5*boxVectors[0][0] || cutoff > 0.5*boxVectors[1][1] || cutoff > 0.5*boxVectors[2][2])
-            throw OpenMMException("GBVIForce: The cutoff distance cannot be greater than half the periodic box size.");
-    }
-    vector<double> scaledRadii;
-    scaledRadii.resize(numberOfParticles);
-    findScaledRadii( numberOfParticles, bondIndices, bondLengths, scaledRadii);
-    kernel.getAs<CalcGBVIForceKernel>().initialize(context.getSystem(), owner, scaledRadii);
-}
-/*
-int GBVIForceImpl::getBondsFromForces(ContextImpl& context) {
-    // load 1-2 atom pairs along w/ bond distance using HarmonicBondForce & constraints
-    const System& system = context.getSystem();
-    for (int i = 0; i < system.getNumForces(); i++) {
-        if (dynamic_cast<const HarmonicBondForce*>(&system.getForce(i)) != NULL) {
-            const HarmonicBondForce& force = dynamic_cast<const HarmonicBondForce&>(system.getForce(i));
-            for (int j = 0; j < force.getNumBonds(); ++j) {
-                int particle1, particle2;
-                double length, k;
-                force.getBondParameters(j, particle1, particle2, length, k); 
-                owner.addBond( particle1, particle2, length );
-            }
-            break;
-        }
-    }   
-    // Also treat constrained distances as bonds if mass of one particle is < (2 + epsilon) (~2=deuterium) 
-    for (int j = 0; j < system.getNumConstraints(); j++) {
-        int particle1, particle2;
-        double distance;
-        system.getConstraintParameters(j, particle1, particle2, distance);
-        double mass1 = system.getParticleMass( particle1 );
-        double mass2 = system.getParticleMass( particle2 );
-        if( mass1 < 2.1 || mass2 < 2.1 ){
-            owner.addBond( particle1, particle2, distance );
-        }
-    }   
-    return 0;
-}
-*/
-void GBVIForceImpl::findScaledRadii( int numberOfParticles, const std::vector<std::vector<int> >& bondIndices,
-                                     const std::vector<double> & bondLengths, std::vector<double> & scaledRadii) const {
-    // load 1-2 indicies for each atom 
-    std::vector<std::vector<int> > bonded12(numberOfParticles);
-    for (int i = 0; i < (int) bondIndices.size(); ++i) {
-        bonded12[bondIndices[i][0]].push_back(i);
-        bonded12[bondIndices[i][1]].push_back(i);
-    }
-    int errors = 0;
-    // compute scaled radii (Eq. 5 of Labute paper [JCC 29 p. 1693-1698 2008])
-    for (int j = 0; j < (int) bonded12.size(); ++j){
-        double charge;
-        double gamma;
-        double radiusJ;
-        double scaledRadiusJ;
-        owner.getParticleParameters(j, charge, radiusJ, gamma); 
-        if(  bonded12[j].size() == 0 && numberOfParticles > 1 ){
-            (void) fprintf( stderr, "Warning GBVIForceImpl::findScaledRadii atom %d has no covalent bonds; using atomic radius=%.3f.\n", j, radiusJ );
-            scaledRadiusJ = radiusJ;
-//             errors++;
-        } else {
-            double rJ2    = radiusJ*radiusJ;
-            // loop over bonded neighbors of atom j, applying Eq. 5 in Labute
-            scaledRadiusJ = 0.0;
-            for (int i = 0; i < (int) bonded12[j].size(); ++i){
-               int index            = bonded12[j][i];
-               int bondedAtomIndex  = (j == bondIndices[index][0]) ? bondIndices[index][1] : bondIndices[index][0];
-               double radiusI;
-               owner.getParticleParameters(bondedAtomIndex, charge, radiusI, gamma); 
-               double rI2           = radiusI*radiusI;
-               double a_ij          = (radiusI - bondLengths[index]);
-                      a_ij         *= a_ij;
-                      a_ij          = (rJ2 - a_ij)/(2.0*bondLengths[index]);
-               double a_ji          = radiusJ - bondLengths[index];
-                      a_ji         *= a_ji;
-                      a_ji          = (rI2 - a_ji)/(2.0*bondLengths[index]);
-               scaledRadiusJ       += a_ij*a_ij*(3.0*radiusI - a_ij) + a_ji*a_ji*( 3.0*radiusJ - a_ji );
-            }
-            scaledRadiusJ  = (radiusJ*radiusJ*radiusJ) - 0.125*scaledRadiusJ; 
-            if( scaledRadiusJ > 0.0 ){
-                scaledRadiusJ  = 0.95*pow( scaledRadiusJ, (1.0/3.0) );
-            } else {
-                scaledRadiusJ  = 0.0;
-            }
-        }
-        scaledRadii[j] = scaledRadiusJ;
-    }
-    // abort if errors
-    if( errors ){
-        throw OpenMMException("GBVIForceImpl::findScaledRadii errors -- aborting");
-    }
-}
-double GBVIForceImpl::calcForcesAndEnergy(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
-    if ((groups&(1<<owner.getForceGroup())) != 0)
-        return kernel.getAs<CalcGBVIForceKernel>().execute(context, includeForces, includeEnergy);
-    return 0.0;
-}
-std::vector<std::string> GBVIForceImpl::getKernelNames() {
-    std::vector<std::string> names;
-    names.push_back(CalcGBVIForceKernel::Name());
-    return names;
-}
--- a/openmmapi/src/Integrator.cpp
+++ b/openmmapi/src/Integrator.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -47,3 +47,19 @@ Integrator::~Integrator() {
        context->integratorDeleted();
    }
 }
+double Integrator::getStepSize() const {
+    return stepSize;
+}
+void Integrator::setStepSize(double size) {
+    stepSize = size;
+}
+double Integrator::getConstraintTolerance() const {
+    return constraintTol;
+}
+void Integrator::setConstraintTolerance(double tol) {
+    constraintTol = tol;
+}
--- a/platforms/cpu/CMakeLists.txt
+++ b/platforms/cpu/CMakeLists.txt
@@ -12,9 +12,11 @@
 #   libOpenMMCPU_static.a
 #----------------------------------------------------
-IF(BUILD_TESTING)
+SET(OPENMM_BUILD_CPU_TESTS TRUE CACHE BOOL "Whether to build CPU platform test cases")
+MARK_AS_ADVANCED(OPENMM_BUILD_CPU_TESTS)
+IF(BUILD_TESTING AND OPENMM_BUILD_CPU_TESTS)
    SUBDIRS(tests)
-ENDIF(BUILD_TESTING)
+ENDIF(BUILD_TESTING AND OPENMM_BUILD_CPU_TESTS)
 # The source is organized into subdirectories, but we handle them all from
 # this CMakeLists file rather than letting CMake visit them as SUBDIRS.

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -113,12 +113,10 @@ public:
    }
    /**
-     * Find the index of the first particle in voxel (y,z) whose x coordinate in >= the specified value.
+     * Find the index of the first particle in voxel (y,z) whose x coordinate is >= the specified value.
     */
-    int findLowerBound(int y, int z, double x) const {
+    int findLowerBound(int y, int z, double x, int lower, int upper) const {
        const vector<pair<float, int> >& bin = bins[y][z];
-        int lower = 0;
-        int upper = bin.size();
        while (lower < upper) {
            int middle = (lower+upper)/2;
            if (bin[middle].first < x)
@@ -130,12 +128,10 @@ public:
    }
    /**
-     * Find the index of the first particle in voxel (y,z) whose x coordinate in greater than the specified value.
+     * Find the index of the first particle in voxel (y,z) whose x coordinate is greater than the specified value.
     */
-    int findUpperBound(int y, int z, double x) const {
+    int findUpperBound(int y, int z, double x, int lower, int upper) const {
        const vector<pair<float, int> >& bin = bins[y][z];
-        int lower = 0;
-        int upper = bin.size();
        while (lower < upper) {
            int middle = (lower+upper)/2;
            if (bin[middle].first > x)
@@ -211,7 +207,7 @@ public:
            // Loop over voxels along the y axis.
-            int boxz = (int) floor((float) z/nz);
+            float boxz = floor((float) z/nz);
            int starty = centerVoxelIndex.y-dIndexY;
            int endy = centerVoxelIndex.y+dIndexY;
            float yoffset = (float) (usePeriodic ? boxz*periodicBoxVectors[2][1] : 0);
@@ -228,7 +224,7 @@ public:
                voxelIndex.y = y;
                if (usePeriodic)
                    voxelIndex.y = (y < 0 ? y+ny : (y >= ny ? y-ny : y));
-                int boxy = (int) floor((float) y/ny);
+                float boxy = floor((float) y/ny);
                float xoffset = (float) (usePeriodic ? boxy*periodicBoxVectors[1][0]+boxz*periodicBoxVectors[2][0] : 0);
                // Identify the range of atoms within this bin we need to search.  When using periodic boundary
@@ -264,22 +260,25 @@ public:
                int numRanges;
                int rangeStart[2];
                int rangeEnd[2];
-                rangeStart[0] = findLowerBound(voxelIndex.y, voxelIndex.z, minx);
+                int binSize = bins[voxelIndex.y][voxelIndex.z].size();
+                rangeStart[0] = findLowerBound(voxelIndex.y, voxelIndex.z, minx, 0, binSize);
                if (needPeriodic) {
                    numRanges = 2;
-                    rangeEnd[0] = findUpperBound(voxelIndex.y, voxelIndex.z, maxx);
+                    rangeEnd[0] = findUpperBound(voxelIndex.y, voxelIndex.z, maxx, rangeStart[0], binSize);
-                    if (rangeStart[0] > 0) {
+                    if (rangeStart[0] > 0 && rangeEnd[0] < binSize)
+                        numRanges = 1;
+                    else if (rangeStart[0] > 0) {
                        rangeStart[1] = 0;
-                        rangeEnd[1] = min(findUpperBound(voxelIndex.y, voxelIndex.z, maxx-periodicBoxSize[0]), rangeStart[0]);
+                        rangeEnd[1] = min(findUpperBound(voxelIndex.y, voxelIndex.z, maxx-periodicBoxSize[0], 0, rangeStart[0]), rangeStart[0]);
                    }
                    else {
-                        rangeStart[1] = max(findLowerBound(voxelIndex.y, voxelIndex.z, minx+periodicBoxSize[0]), rangeEnd[0]);
+                        rangeStart[1] = max(findLowerBound(voxelIndex.y, voxelIndex.z, minx+periodicBoxSize[0], rangeEnd[0], binSize), rangeEnd[0]);
                        rangeEnd[1] = bins[voxelIndex.y][voxelIndex.z].size();
                    }
                }
                else {
                    numRanges = 1;
-                    rangeEnd[0] = findUpperBound(voxelIndex.y, voxelIndex.z, maxx);
+                    rangeEnd[0] = findUpperBound(voxelIndex.y, voxelIndex.z, maxx, rangeStart[0], binSize);
                }
                bool periodicRectangular = (needPeriodic && !triclinic);

--- a/serialization/include/openmm/serialization/GBVIForceProxy.h
+++ b/serialization/include/openmm/serialization/GBVIForceProxy.h
-#ifndef OPENMM_GBVIFORCE_PROXY_H_
-#define OPENMM_GBVIFORCE_PROXY_H_
 /* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
@@ -9,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -32,22 +29,8 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */
-#include "openmm/internal/windowsExport.h"
+#include "CpuTests.h"
-#include "openmm/serialization/SerializationProxy.h"
+#include "TestCompoundIntegrator.h"
-namespace OpenMM {
-/**
- * This is a proxy for serializing GBVIForce objects.
- */
-class OPENMM_EXPORT GBVIForceProxy : public SerializationProxy {
-public:
-    GBVIForceProxy();
-    void serialize(const void* object, SerializationNode& node) const;
-    void* deserialize(const SerializationNode& node) const;
-};
-} // namespace OpenMM
-#endif /*OPENMM_GBVIFORCE_PROXY_H_*/
+void runPlatformTests() {
+}
--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -30,6 +30,7 @@
 #include <map>
 #include <queue>
 #include <string>
+#include <utility>
 #define __CL_ENABLE_EXCEPTIONS
 #ifdef _MSC_VER
    // Prevent Windows from defining macros that interfere with other code.
@@ -538,6 +539,11 @@ public:
     */
    void invalidateMolecules();
 private:
+    /**
+     * Compute a sorted list of device indices in decreasing order of desirability
+     */
+    std::vector<int> getDevicePrecedence();
    struct Molecule;
    struct MoleculeGroup;
    class VirtualSiteInfo;

--- a/platforms/cuda/include/CudaIntegrationUtilities.h
+++ b/platforms/cuda/include/CudaIntegrationUtilities.h
@@ -62,6 +62,14 @@ public:
    CudaArray& getStepSize() {
        return *stepSize;
    }
+    /**
+     * Set the size to use for the next step.
+     */
+    void setNextStepSize(double size);
+    /**
+     * Get the size that was used for the last step.
+     */
+    double getLastStepSize();
    /**
     * Apply constraints to the atom positions.
     *
@@ -154,6 +162,7 @@ private:
    CudaArray* vsiteLocalCoordsParams;
    int randomPos;
    int lastSeed, numVsites;
+    double2 lastStepSize;
    struct ShakeCluster;
    struct ConstraintOrderer;
 };

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -592,7 +592,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -657,6 +657,7 @@ private:
    CudaArray* pmeBsplineModuliZ;
    CudaArray* pmeAtomRange;
    CudaArray* pmeAtomGridIndex;
+    CudaArray* pmeEnergyBuffer;
    CudaSort* sort;
    Kernel cpuPme;
    PmeIO* pmeio;
@@ -1123,7 +1124,6 @@ public:
    double computeKineticEnergy(ContextImpl& context, const VerletIntegrator& integrator);
 private:
    CudaContext& cu;
-    double prevStepSize;
    CUfunction kernel1, kernel2;
 };
@@ -1354,7 +1354,7 @@ private:
    void recordChangedParameters(ContextImpl& context);
    bool evaluateCondition(int step);
    CudaContext& cu;
-    double prevStepSize, energy;
+    double energy;
    float energyFloat;
    int numGlobalVariables;
    bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints;

--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -121,7 +121,7 @@ public:
    ContextImpl* context;
    std::vector<CudaContext*> contexts;
    std::vector<double> contextEnergy;
-    bool removeCM, peerAccessSupported, useCpuPme;
+    bool hasInitializedContexts, removeCM, peerAccessSupported, useCpuPme;
    int cmMotionFrequency;
    int stepCount, computeForceCount;
    double time;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    CHECK_RESULT(cuDeviceGetCount(&numDevices));
    if (deviceIndex < -1 || deviceIndex >= numDevices)
        throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex));
+    vector<int> devicePrecedence;
    if (deviceIndex == -1) {
-        // Try to figure out which device is the fastest.
+        devicePrecedence = getDevicePrecedence();
+    } else {
-        int bestSpeed = -1;
+        devicePrecedence.push_back(deviceIndex);
-        int bestCompute = -1;
+    }
-        for (int i = 0; i < numDevices; i++) {
-            CHECK_RESULT(cuDeviceGet(&device, i));
+    this->deviceIndex = -1;
-            int major, minor, clock, multiprocessors;
+    for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
-            CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
+        int trialDeviceIndex = devicePrecedence[i];
-            if (major == 1 && minor < 2)
+        CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
-                continue; // 1.0 and 1.1 are not supported
+        defaultOptimizationOptions = "--use_fast_math";
-            CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
+        unsigned int flags = CU_CTX_MAP_HOST;
-            CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
+        if (useBlockingSync)
-            int speed = clock*multiprocessors;
+            flags += CU_CTX_SCHED_BLOCKING_SYNC;
-            if (major > bestCompute || (major == bestCompute && speed > bestSpeed)) {
+        else
-                deviceIndex = i;
+            flags += CU_CTX_SCHED_SPIN;
-                bestSpeed = speed;
-                bestCompute = major;
+        if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
-            }
+            this->deviceIndex = trialDeviceIndex;
+            break;
        }
    }
-    if (deviceIndex == -1)
+    if (this->deviceIndex == -1)
-        throw OpenMMException("No compatible CUDA device is available");
+        if (deviceIndex != -1)
-    CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
+            throw OpenMMException("The requested CUDA device could not be loaded");
-    this->deviceIndex = deviceIndex;
+        else
+            throw OpenMMException("No compatible CUDA device is available");
    int major, minor;
    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
-    // This is a workaround to support GTX 980 with CUDA 6.5.  It reports its compute capability
+#if __CUDA_API_VERSION < 7000
-    // as 5.2, but the compiler doesn't support anything beyond 5.0.  We can remove this once
+        // This is a workaround to support GTX 980 with CUDA 6.5.  It reports
-    // CUDA 7.0 is released.
+        // its compute capability as 5.2, but the compiler doesn't support
-    if (major == 5)
+        // anything beyond 5.0.
-        minor = 0;
+        if (major == 5)
+            minor = 0;
+#endif
    gpuArchitecture = intToString(major)+intToString(minor);
    computeCapability = major+0.1*minor;
-    if ((useDoublePrecision || useMixedPrecision) && computeCapability < 1.3)
-        throw OpenMMException("This device does not support double precision");
-    defaultOptimizationOptions = "--use_fast_math";
-    unsigned int flags = CU_CTX_MAP_HOST;
-    if (useBlockingSync)
-        flags += CU_CTX_SCHED_BLOCKING_SYNC;
-    else
-        flags += CU_CTX_SCHED_SPIN;
-    CHECK_RESULT(cuCtxCreate(&context, flags, device));
    contextIsValid = true;
    CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
    if (contextIndex > 0) {
@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
    compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
    compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
    // Set defines for applying periodic boundary conditions.
    Vec3 boxVectors[3];
    system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
    boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    }
    // Create the work thread used for parallelization when running on multiple devices.
    thread = new WorkThread();
    // Create utilities objects.
    bonded = new CudaBondedUtilities(*this);
    nonbonded = new CudaNonbondedUtilities(*this);
    integration = new CudaIntegrationUtilities(*this, system);
@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri
            if (index != result.npos) {
                if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
                    // We have found a complete symbol, not part of a longer symbol.
                    result.replace(index, size, iter->second);
                    index += iter->second.size();
                }
@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) {
        return -1;
    }
    WaitForSingleObject(pi.hProcess, INFINITE);
-    DWORD exitCode = -1;  
+    DWORD exitCode = -1;
    if(!GetExitCodeProcess(pi.hProcess, &exitCode)) {
        throw(OpenMMException("Could not get nvcc.exe's exit code\n"));
    } else {
-        if(exitCode == 0) 
+        if(exitCode == 0)
            return 0;
        else
            return -1;
@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    if (!defines.empty())
        src << endl;
    src << source << endl;
    // See whether we already have PTX for this kernel cached.
    CSHA1 sha1;
    sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
    sha1.Final();
@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    CUmodule module;
    if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
        return module;
    // Select names for the various temporary files.
    stringstream tempFileName;
    tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
 #ifdef WIN32
@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    int res = 0;
    // If the runtime compiler plugin is available, use it.
    if (hasCompilerKernel) {
        string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this);
        // If possible, write the PTX out to a temporary file so we can cache it for later use.
        bool wroteCache = false;
        try {
            ofstream out(outputFile.c_str());
@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
        }
        if (!wroteCache) {
            // An error occurred.  Possibly we don't have permission to write to the temp directory.  Just try to load the module directly.
            CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module");
            return module;
        }
@@ -883,7 +882,7 @@ private:
 void CudaContext::findMoleculeGroups() {
    // The first time this is called, we need to identify all the molecules in the system.
    if (moleculeGroups.size() == 0) {
        // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() {
                    if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
                        identical = false;
            }
            // See if the constraints are identical.
            for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() {
    }
    if (valid)
        return;
    // The list of which molecules are identical is no longer valid.  We need to restore the
    // atoms to their original order, rebuild the list of identical molecules, and sort them
    // again.
    vector<int4> newCellOffsets(numAtoms);
    if (useDoublePrecision) {
        vector<double4> oldPosq(paddedNumAtoms);
@@ -1196,6 +1195,8 @@ void CudaContext::reorderAtomsImpl() {
            molPos[i].x *= invNumAtoms;
            molPos[i].y *= invNumAtoms;
            molPos[i].z *= invNumAtoms;
+            if (molPos[i].x != molPos[i].x)
+                throw OpenMMException("Particle coordinate is nan");
        }
        if (nonbonded->getUsePeriodic()) {
            // Move each molecule position into the same box.
@@ -1391,3 +1392,41 @@ void CudaContext::WorkThread::flush() {
       pthread_cond_wait(&queueEmptyCondition, &queueLock);
    pthread_mutex_unlock(&queueLock);
 }
+vector<int> CudaContext::getDevicePrecedence() {
+    int numDevices;
+    CUdevice thisDevice;
+    string errorMessage = "Error initializing Context";
+    vector<pair<pair<int, int>, int> > devices;
+    CHECK_RESULT(cuDeviceGetCount(&numDevices));
+    for (int i = 0; i < numDevices; i++) {
+        CHECK_RESULT(cuDeviceGet(&thisDevice, i));
+        int major, minor, clock, multiprocessors, speed;
+        CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, thisDevice));
+        if (major == 1 && minor < 2)
+            continue;
+        if ((useDoublePrecision || useMixedPrecision) && (major+0.1*minor < 1.3))
+            continue;
+        CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, thisDevice));
+        CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, thisDevice));
+        speed = clock*multiprocessors;
+        pair<int, int> deviceProperties = std::make_pair(major, speed);
+        devices.push_back(std::make_pair(deviceProperties, -i));
+    }
+    // sort first by compute capability (higher is better), then speed
+    // (higher is better), and finally device index (lower is better)
+    std::sort(devices.begin(), devices.end());
+    std::reverse(devices.begin(), devices.end());
+    vector<int> precedence;
+    for (int i = 0; i < static_cast<int>(devices.size()); i++) {
+        precedence.push_back(-devices[i].second);
+    }
+    return precedence;
+}
--- a/platforms/cuda/src/CudaExpressionUtilities.cpp
+++ b/platforms/cuda/src/CudaExpressionUtilities.cpp
@@ -109,7 +109,8 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                }
                out << ");\n";
                out << "APPLY_PERIODIC_TO_DELTA(periodicDistance_delta)\n";
-                out << tempType << " periodicDistance_rinv = RSQRT(periodicDistance_delta.x*periodicDistance_delta.x + periodicDistance_delta.y*periodicDistance_delta.y + periodicDistance_delta.z*periodicDistance_delta.z);\n";
+                out << tempType << " periodicDistance_r2 = periodicDistance_delta.x*periodicDistance_delta.x + periodicDistance_delta.y*periodicDistance_delta.y + periodicDistance_delta.z*periodicDistance_delta.z;\n";
+                out << tempType << " periodicDistance_rinv = RSQRT(periodicDistance_r2);\n";
                for (int j = 0; j < nodes.size(); j++) {
                    const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
                    int argIndex = -1;
@@ -123,17 +124,17 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                    if (argIndex == -1)
                        out << nodeNames[j] << " = RECIP(periodicDistance_rinv);\n";
                    else if (argIndex == 0)
-                        out << nodeNames[j] << " = periodicDistance_delta.x*periodicDistance_rinv;\n";
+                        out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? periodicDistance_delta.x*periodicDistance_rinv : 0);\n";
                    else if (argIndex == 1)
-                        out << nodeNames[j] << " = periodicDistance_delta.y*periodicDistance_rinv;\n";
+                        out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? periodicDistance_delta.y*periodicDistance_rinv : 0);\n";
                    else if (argIndex == 2)
-                        out << nodeNames[j] << " = periodicDistance_delta.z*periodicDistance_rinv;\n";
+                        out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? periodicDistance_delta.z*periodicDistance_rinv : 0);\n";
                    else if (argIndex == 3)
-                        out << nodeNames[j] << " = -periodicDistance_delta.x*periodicDistance_rinv;\n";
+                        out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? -periodicDistance_delta.x*periodicDistance_rinv : 0);\n";
                    else if (argIndex == 4)
-                        out << nodeNames[j] << " = -periodicDistance_delta.y*periodicDistance_rinv;\n";
+                        out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? -periodicDistance_delta.y*periodicDistance_rinv : 0);\n";
                    else if (argIndex == 5)
-                        out << nodeNames[j] << " = -periodicDistance_delta.z*periodicDistance_rinv;\n";
+                        out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? -periodicDistance_delta.z*periodicDistance_rinv : 0);\n";
                }
            }
            else {

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -106,21 +106,21 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsAtoms(NULL), vsiteLocalCoordsParams(NULL) {
    // Create workspace arrays.
+    lastStepSize = make_double2(0.0, 0.0);
    if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
        posDelta = CudaArray::create<double4>(context, context.getPaddedNumAtoms(), "posDelta");
        vector<double4> deltas(posDelta->getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
        posDelta->upload(deltas);
        stepSize = CudaArray::create<double2>(context, 1, "stepSize");
-        vector<double2> step(1, make_double2(0.0, 0.0));
+        stepSize->upload(&lastStepSize);
-        stepSize->upload(step);
    }
    else {
        posDelta = CudaArray::create<float4>(context, context.getPaddedNumAtoms(), "posDelta");
        vector<float4> deltas(posDelta->getSize(), make_float4(0.0f, 0.0f, 0.0f, 0.0f));
        posDelta->upload(deltas);
        stepSize = CudaArray::create<float2>(context, 1, "stepSize");
-        vector<float2> step(1, make_float2(0.0f, 0.0f));
+        float2 lastStepSizeFloat = make_float2(0.0f, 0.0f);
-        stepSize->upload(step);
+        stepSize->upload(&lastStepSizeFloat);
    }
    // Record the set of constraints and how many constraints each atom is involved in.
@@ -650,6 +650,29 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
        delete vsiteLocalCoordsParams;
 }
+void CudaIntegrationUtilities::setNextStepSize(double size) {
+    if (size != lastStepSize.x || size != lastStepSize.y) {
+        lastStepSize = make_double2(size, size);
+        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+            stepSize->upload(&lastStepSize);
+        else {
+            float2 lastStepSizeFloat = make_float2((float) size, (float) size);
+            stepSize->upload(&lastStepSizeFloat);
+        }
+    }
+}
+double CudaIntegrationUtilities::getLastStepSize() {
+    if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+        stepSize->download(&lastStepSize);
+    else {
+        float2 lastStepSizeFloat;
+        stepSize->download(&lastStepSizeFloat);
+        lastStepSize = make_double2(lastStepSizeFloat.x, lastStepSizeFloat.y);
+    }
+    return lastStepSize.y;
+}
 void CudaIntegrationUtilities::applyConstraints(double tol) {
    applyConstraints(false, tol);
 }

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1458,16 +1458,24 @@ private:
 class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation {
 public:
-    SyncStreamPostComputation(CudaContext& cu, CUevent event, int forceGroup) : cu(cu), event(event), forceGroup(forceGroup) {
+    SyncStreamPostComputation(CudaContext& cu, CUevent event, CUfunction addEnergyKernel, CudaArray& pmeEnergyBuffer, int forceGroup) : cu(cu), event(event),
+            addEnergyKernel(addEnergyKernel), pmeEnergyBuffer(pmeEnergyBuffer), forceGroup(forceGroup) {
    }
    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
        if ((groups&(1<<forceGroup)) != 0)
            cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
+        if (includeEnergy) {
+            int bufferSize = pmeEnergyBuffer.getSize();
+            void* args[] = {&pmeEnergyBuffer.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &bufferSize};
+            cu.executeKernel(addEnergyKernel, args, bufferSize);
+        }
        return 0.0;
    }
 private:
    CudaContext& cu;
    CUevent event;
+    CUfunction addEnergyKernel;
+    CudaArray& pmeEnergyBuffer;
    int forceGroup;
 };
@@ -1493,6 +1501,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
        delete pmeAtomRange;
    if (pmeAtomGridIndex != NULL)
        delete pmeAtomGridIndex;
+    if (pmeEnergyBuffer != NULL)
+        delete pmeEnergyBuffer;
    if (sort != NULL)
        delete sort;
    if (fft != NULL)
@@ -1681,6 +1691,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
                pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
                pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
+                int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
+                pmeEnergyBuffer = new CudaArray(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
+                cu.clearBuffer(*pmeEnergyBuffer);
                sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
                int cufftVersion;
                cufftGetVersion(&cufftVersion);
@@ -1714,7 +1727,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                    if (recipForceGroup < 0)
                        recipForceGroup = force.getForceGroup();
                    cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
-                    cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, recipForceGroup));
+                    cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), *pmeEnergyBuffer, recipForceGroup));
                }
                hasInitializedFFT = true;
@@ -1889,7 +1902,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
        }
        if (includeEnergy) {
-            void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
+            void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), usePmeStream ? &pmeEnergyBuffer->getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
                    &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
                    cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
            cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms());
@@ -5674,7 +5687,6 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle
    CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, "");
    kernel1 = cu.getKernel(module, "integrateVerletPart1");
    kernel2 = cu.getKernel(module, "integrateVerletPart2");
-    prevStepSize = -1.0;
 }
 void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIntegrator& integrator) {
@@ -5683,19 +5695,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
    int numAtoms = cu.getNumAtoms();
    int paddedNumAtoms = cu.getPaddedNumAtoms();
    double dt = integrator.getStepSize();
-    if (dt != prevStepSize) {
+    cu.getIntegrationUtilities().setNextStepSize(dt);
-        if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
-            vector<double2> stepSizeVec(1);
-            stepSizeVec[0] = make_double2(dt, dt);
-            cu.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
-        }
-        else {
-            vector<float2> stepSizeVec(1);
-            stepSizeVec[0] = make_float2((float) dt, (float) dt);
-            cu.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
-        }
-        prevStepSize = dt;
-    }
    // Call the first integration kernel.
@@ -5752,6 +5752,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
    double temperature = integrator.getTemperature();
    double friction = integrator.getFriction();
    double stepSize = integrator.getStepSize();
+    cu.getIntegrationUtilities().setNextStepSize(stepSize);
    if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
        // Calculate the integration parameters.
@@ -5766,8 +5767,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
            p[1] = fscale;
            p[2] = noisescale;
            params->upload(p);
-            double2 ss = make_double2(0, stepSize);
-            integration.getStepSize().upload(&ss);
        }
        else {
            vector<float> p(params->getSize());
@@ -5775,8 +5774,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
            p[1] = (float) fscale;
            p[2] = (float) noisescale;
            params->upload(p);
-            float2 ss = make_float2(0, (float) stepSize);
-            integration.getStepSize().upload(&ss);
        }
        prevTemp = temperature;
        prevFriction = friction;
@@ -5929,20 +5926,13 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
    // Update the time and step count.
-    double dt, time;
+    double dt = cu.getIntegrationUtilities().getLastStepSize();
+    double time = cu.getTime()+dt;
    if (useDouble) {
-        double2 stepSize;
-        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-        dt = stepSize.y;
-        time = cu.getTime()+dt;
        if (dt == maxStepSize)
            time = maxTime; // Avoid round-off error
    }
    else {
-        float2 stepSize;
-        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-        dt = stepSize.y;
-        time = cu.getTime()+dt;
        if (dt == maxStepSizeFloat)
            time = maxTime; // Avoid round-off error
    }
@@ -6023,20 +6013,13 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
    // Update the time and step count.
-    double dt, time;
+    double dt = cu.getIntegrationUtilities().getLastStepSize();
+    double time = cu.getTime()+dt;
    if (useDouble) {
-        double2 stepSize;
-        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-        dt = stepSize.y;
-        time = cu.getTime()+dt;
        if (dt == maxStepSize)
            time = maxTime; // Avoid round-off error
    }
    else {
-        float2 stepSize;
-        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-        dt = stepSize.y;
-        time = cu.getTime()+dt;
        if (dt == maxStepSizeFloat)
            time = maxTime; // Avoid round-off error
    }
@@ -6139,7 +6122,6 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo
    summedValue = new CudaArray(cu, 1, elementSize, "summedValue");
    perDofValues = new CudaParameterSet(cu, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cu.getUseDoublePrecision() || cu.getUseMixedPrecision());
    cu.addReorderListener(new ReorderListener(cu, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));
-    prevStepSize = -1.0;
    SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
 }
@@ -6553,9 +6535,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
    }
    localValuesAreCurrent = false;
    double stepSize = integrator.getStepSize();
-    if (stepSize != prevStepSize) {
+    recordGlobalValue(stepSize, GlobalTarget(DT, dtVariableIndex));
-        recordGlobalValue(stepSize, GlobalTarget(DT, dtVariableIndex));
-    }
    for (int i = 0; i < (int) parameterNames.size(); i++) {
        double value = context.getParameter(parameterNames[i]);
        if (value != globalValuesDouble[parameterVariableIndex[i]]) {
@@ -6760,17 +6740,10 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
 void CudaIntegrateCustomStepKernel::recordGlobalValue(double value, GlobalTarget target) {
    switch (target.type) {
        case DT:
+            if (value != globalValuesDouble[dtVariableIndex])
+                deviceGlobalsAreCurrent = false;
            globalValuesDouble[dtVariableIndex] = value;
-            deviceGlobalsAreCurrent = false;
+            cu.getIntegrationUtilities().setNextStepSize(value);
-            if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
-                double size[] = {0, value};
-                cu.getIntegrationUtilities().getStepSize().upload(size);
-            }
-            else {
-                float size[] = {0, (float) value};
-                cu.getIntegrationUtilities().getStepSize().upload(size);
-            }
-            prevStepSize = value;
            break;
        case VARIABLE:
        case PARAMETER:

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -179,7 +179,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
 }
 CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
-            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
+            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false) {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
    size_t searchPos = 0, nextPos;
@@ -247,8 +247,11 @@ CudaPlatform::PlatformData::~PlatformData() {
 }
 void CudaPlatform::PlatformData::initializeContexts(const System& system) {
+    if (hasInitializedContexts)
+        return;
    for (int i = 0; i < (int) contexts.size(); i++)
        contexts[i]->initialize();
+    hasInitializedContexts = true;
 }
 void CudaPlatform::PlatformData::syncContexts() {

--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -188,7 +188,7 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
            energy += eterm*(grid.x*grid.x + grid.y*grid.y);
        }
    }
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = 0.5f*energy;
 }
 extern "C" __global__
@@ -286,3 +286,9 @@ void addForces(const real4* __restrict__ forces, unsigned long long* __restrict_
        forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (f.z*0x100000000));
    }
 }
+extern "C" __global__
+void addEnergy(const mixed* __restrict__ pmeEnergyBuffer, mixed* __restrict__ energyBuffer, int bufferSize) {
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < bufferSize; i += blockDim.x*gridDim.x)
+        energyBuffer[i] += pmeEnergyBuffer[i];
+}
--- a/platforms/cuda/tests/TestCudaCompoundIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaCompoundIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "CudaTests.h"
+#include "TestCompoundIntegrator.h"
+void runPlatformTests() {
+}
--- a/platforms/opencl/include/OpenCLIntegrationUtilities.h
+++ b/platforms/opencl/include/OpenCLIntegrationUtilities.h
@@ -62,6 +62,14 @@ public:
    OpenCLArray& getStepSize() {
        return *stepSize;
    }
+    /**
+     * Set the size to use for the next step.
+     */
+    void setNextStepSize(double size);
+    /**
+     * Get the size that was used for the last step.
+     */
+    double getLastStepSize();
    /**
     * Apply constraints to the atom positions.
     *
@@ -153,6 +161,7 @@ private:
    int randomPos;
    int lastSeed, numVsites;
    bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels, ccmaUseDirectBuffer, hasOverlappingVsites;
+    mm_double2 lastStepSize;
    struct ShakeCluster;
    struct ConstraintOrderer;
 };

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -570,7 +570,7 @@ public:
    OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, const System& system) : CalcNonbondedForceKernel(name, platform),
            hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL),
-            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
+            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
    }
    ~OpenCLCalcNonbondedForceKernel();
    /**
@@ -636,12 +636,14 @@ private:
    OpenCLArray* pmeBsplineTheta;
    OpenCLArray* pmeAtomRange;
    OpenCLArray* pmeAtomGridIndex;
+    OpenCLArray* pmeEnergyBuffer;
    OpenCLSort* sort;
    cl::CommandQueue pmeQueue;
    cl::Event pmeSyncEvent;
    OpenCLFFT3D* fft;
    Kernel cpuPme;
    PmeIO* pmeio;
+    SyncQueuePostComputation* syncQueue;
    cl::Kernel ewaldSumsKernel;
    cl::Kernel ewaldForcesKernel;
    cl::Kernel pmeGridIndexKernel;
@@ -1103,7 +1105,6 @@ public:
    double computeKineticEnergy(ContextImpl& context, const VerletIntegrator& integrator);
 private:
    OpenCLContext& cl;
-    double prevStepSize;
    bool hasInitializedKernels;
    cl::Kernel kernel1, kernel2;
 };
@@ -1342,7 +1343,7 @@ private:
    void recordChangedParameters(ContextImpl& context);
    bool evaluateCondition(int step);
    OpenCLContext& cl;
-    double prevStepSize, energy;
+    double energy;
    float energyFloat;
    int numGlobalVariables;
    bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints;

--- a/platforms/opencl/include/OpenCLPlatform.h
+++ b/platforms/opencl/include/OpenCLPlatform.h
@@ -106,7 +106,7 @@ public:
    ContextImpl* context;
    std::vector<OpenCLContext*> contexts;
    std::vector<double> contextEnergy;
-    bool removeCM, useCpuPme;
+    bool hasInitializedContexts, removeCM, useCpuPme;
    int cmMotionFrequency;
    int stepCount, computeForceCount;
    double time;