Conflict resolution in TestSplineFilter.cpp

0b5d58d7 · Charlles Abreu · 9026dbe7 · b0d13582 · 0b5d58d7 · 0b5d58d7
Commit 0b5d58d7 authored May 27, 2020 by Charlles Abreu
20 changed files
--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -87,7 +87,7 @@ OpenCLPlatform::OpenCLPlatform() {
    registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
-    registerKernelFactory(IntegrateVelocityVerletStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateNoseHooverStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinMiddleStepKernel::Name(), factory);
    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
@@ -95,7 +95,6 @@ OpenCLPlatform::OpenCLPlatform() {
    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
    registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
-    registerKernelFactory(NoseHooverChainKernel::Name(), factory);
    registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
    registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
    platformProperties.push_back(OpenCLDeviceIndex());

--- a/platforms/opencl/src/OpenCLSort.cpp
+++ b/platforms/opencl/src/OpenCLSort.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2010-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -64,10 +64,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
    unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
    unsigned int maxShortList = min(8192, max(maxLocalBuffer, (int) OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks()));
-    // On Qualcomm's OpenCL, it's essential to check against CL_KERNEL_WORK_GROUP_SIZE.  Otherwise you get a crash.
-    // But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
-    // maximum, so including the check hurts performance.  For the moment I'm going to just comment it out.
-    // If we officially support Qualcomm in the future, we'll need to do something better.
+    // The following line checks CL_KERNEL_WORK_GROUP_SIZE to make sure we don't create too large a workgroup.
+    // Unfortunately, AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
+    // maximum, so including the check hurts performance.  For the moment I'm just leaving it commented out.
+    // If the workgroup size turns out to be too large, we catch the exception and switch back to the standard
+    // sorting kernels.
    //maxShortList = min(maxShortList, shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    isShortList = (length <= maxShortList);
    string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
@@ -92,12 +93,10 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le

    // Create workspace arrays.

-    if (!isShortList) {
-        dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
-        bucketOffset.initialize<cl_uint>(context, numBuckets, "bucketOffset");
-        bucketOfElement.initialize<cl_uint>(context, length, "bucketOfElement");
-        offsetInBucket.initialize<cl_uint>(context, length, "offsetInBucket");
-    }
+    dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
+    bucketOffset.initialize<cl_uint>(context, numBuckets, "bucketOffset");
+    bucketOfElement.initialize<cl_uint>(context, length, "bucketOfElement");
+    offsetInBucket.initialize<cl_uint>(context, length, "offsetInBucket");
    buckets.initialize(context, length, trait->getDataSize(), "buckets");
 }

@@ -113,68 +112,76 @@ void OpenCLSort::sort(OpenCLArray& data) {
    if (isShortList) {
        // We can use a simpler sort kernel that does the entire operation in one kernel.
        
-        if (useShortList2) {
-            shortList2Kernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
-            shortList2Kernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
-            shortList2Kernel.setArg<cl_int>(2, dataLength);
-            context.executeKernel(shortList2Kernel, dataLength);
-            buckets.copyTo(data);
+        try {
+            if (useShortList2) {
+                shortList2Kernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+                shortList2Kernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
+                shortList2Kernel.setArg<cl_int>(2, dataLength);
+                context.executeKernel(shortList2Kernel, dataLength);
+                buckets.copyTo(data);
+            }
+            else {
+                shortListKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+                shortListKernel.setArg<cl_uint>(1, dataLength);
+                shortListKernel.setArg(2, dataLength*trait->getDataSize(), NULL);
+                context.executeKernel(shortListKernel, sortKernelSize, sortKernelSize);
+            }
+            return;
        }
-        else {
-            shortListKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
-            shortListKernel.setArg<cl_uint>(1, dataLength);
-            shortListKernel.setArg(2, dataLength*trait->getDataSize(), NULL);
-            context.executeKernel(shortListKernel, sortKernelSize, sortKernelSize);
+        catch (exception& ex) {
+            // This can happen if we chose too large a size for the kernel.  Switch
+            // over to the standard sorting method.
+            
+            isShortList = false;
        }
    }
-    else {
-        // Compute the range of data values.
-
-        unsigned int numBuckets = bucketOffset.getSize();
-        computeRangeKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
-        computeRangeKernel.setArg<cl_uint>(1, data.getSize());
-        computeRangeKernel.setArg<cl::Buffer>(2, dataRange.getDeviceBuffer());
-        computeRangeKernel.setArg(3, rangeKernelSize*trait->getKeySize(), NULL);
-        computeRangeKernel.setArg(4, rangeKernelSize*trait->getKeySize(), NULL);
-        computeRangeKernel.setArg<cl_int>(5, numBuckets);
-        computeRangeKernel.setArg<cl::Buffer>(6, bucketOffset.getDeviceBuffer());
-        context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);
-
-        // Assign array elements to buckets.
-
-        assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
-        assignElementsKernel.setArg<cl_int>(1, data.getSize());
-        assignElementsKernel.setArg<cl_int>(2, numBuckets);
-        assignElementsKernel.setArg<cl::Buffer>(3, dataRange.getDeviceBuffer());
-        assignElementsKernel.setArg<cl::Buffer>(4, bucketOffset.getDeviceBuffer());
-        assignElementsKernel.setArg<cl::Buffer>(5, bucketOfElement.getDeviceBuffer());
-        assignElementsKernel.setArg<cl::Buffer>(6, offsetInBucket.getDeviceBuffer());
-        context.executeKernel(assignElementsKernel, data.getSize());
-
-        // Compute the position of each bucket.
-
-        computeBucketPositionsKernel.setArg<cl_int>(0, numBuckets);
-        computeBucketPositionsKernel.setArg<cl::Buffer>(1, bucketOffset.getDeviceBuffer());
-        computeBucketPositionsKernel.setArg(2, positionsKernelSize*sizeof(cl_int), NULL);
-        context.executeKernel(computeBucketPositionsKernel, positionsKernelSize, positionsKernelSize);
-
-        // Copy the data into the buckets.
-
-        copyToBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
-        copyToBucketsKernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
-        copyToBucketsKernel.setArg<cl_int>(2, data.getSize());
-        copyToBucketsKernel.setArg<cl::Buffer>(3, bucketOffset.getDeviceBuffer());
-        copyToBucketsKernel.setArg<cl::Buffer>(4, bucketOfElement.getDeviceBuffer());
-        copyToBucketsKernel.setArg<cl::Buffer>(5, offsetInBucket.getDeviceBuffer());
-        context.executeKernel(copyToBucketsKernel, data.getSize());
-
-        // Sort each bucket.
-
-        sortBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
-        sortBucketsKernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
-        sortBucketsKernel.setArg<cl_int>(2, numBuckets);
-        sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset.getDeviceBuffer());
-        sortBucketsKernel.setArg(4, sortKernelSize*trait->getDataSize(), NULL);
-        context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
-    }
+
+    // Compute the range of data values.
+
+    unsigned int numBuckets = bucketOffset.getSize();
+    computeRangeKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+    computeRangeKernel.setArg<cl_uint>(1, data.getSize());
+    computeRangeKernel.setArg<cl::Buffer>(2, dataRange.getDeviceBuffer());
+    computeRangeKernel.setArg(3, rangeKernelSize*trait->getKeySize(), NULL);
+    computeRangeKernel.setArg(4, rangeKernelSize*trait->getKeySize(), NULL);
+    computeRangeKernel.setArg<cl_int>(5, numBuckets);
+    computeRangeKernel.setArg<cl::Buffer>(6, bucketOffset.getDeviceBuffer());
+    context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);
+
+    // Assign array elements to buckets.
+
+    assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+    assignElementsKernel.setArg<cl_int>(1, data.getSize());
+    assignElementsKernel.setArg<cl_int>(2, numBuckets);
+    assignElementsKernel.setArg<cl::Buffer>(3, dataRange.getDeviceBuffer());
+    assignElementsKernel.setArg<cl::Buffer>(4, bucketOffset.getDeviceBuffer());
+    assignElementsKernel.setArg<cl::Buffer>(5, bucketOfElement.getDeviceBuffer());
+    assignElementsKernel.setArg<cl::Buffer>(6, offsetInBucket.getDeviceBuffer());
+    context.executeKernel(assignElementsKernel, data.getSize());
+
+    // Compute the position of each bucket.
+
+    computeBucketPositionsKernel.setArg<cl_int>(0, numBuckets);
+    computeBucketPositionsKernel.setArg<cl::Buffer>(1, bucketOffset.getDeviceBuffer());
+    computeBucketPositionsKernel.setArg(2, positionsKernelSize*sizeof(cl_int), NULL);
+    context.executeKernel(computeBucketPositionsKernel, positionsKernelSize, positionsKernelSize);
+
+    // Copy the data into the buckets.
+
+    copyToBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+    copyToBucketsKernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
+    copyToBucketsKernel.setArg<cl_int>(2, data.getSize());
+    copyToBucketsKernel.setArg<cl::Buffer>(3, bucketOffset.getDeviceBuffer());
+    copyToBucketsKernel.setArg<cl::Buffer>(4, bucketOfElement.getDeviceBuffer());
+    copyToBucketsKernel.setArg<cl::Buffer>(5, offsetInBucket.getDeviceBuffer());
+    context.executeKernel(copyToBucketsKernel, data.getSize());
+
+    // Sort each bucket.
+
+    sortBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+    sortBucketsKernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
+    sortBucketsKernel.setArg<cl_int>(2, numBuckets);
+    sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset.getDeviceBuffer());
+    sortBucketsKernel.setArg(4, sortKernelSize*trait->getDataSize(), NULL);
+    context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
 }
--- a/platforms/opencl/tests/TestOpenCLNoseHooverThermostat.cpp
+++ b/platforms/opencl/tests/TestOpenCLNoseHooverThermostat.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2019 Stanford University and the Authors.           *
- * Authors: Andreas Krämer and Andrew C. Simmonett                            *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#include "OpenCLTests.h"
-#include "TestNoseHooverThermostat.h"
-
-void runPlatformTests() {
-}
--- a/platforms/reference/include/ReferenceKernels.h
+++ b/platforms/reference/include/ReferenceKernels.h
@@ -61,7 +61,7 @@ class ReferenceStochasticDynamics;
 class ReferenceConstraintAlgorithm;
 class ReferenceNoseHooverChain;
 class ReferenceMonteCarloBarostat;
-class ReferenceVelocityVerletDynamics;
+class ReferenceNoseHooverDynamics;
 class ReferenceVariableStochasticDynamics;
 class ReferenceVariableVerletDynamics;
 class ReferenceVerletDynamics;
@@ -1139,12 +1139,12 @@ private:
 /**
 * This kernel is invoked by NoseHooverIntegrator to take one time step.
 */
-class ReferenceIntegrateVelocityVerletStepKernel : public IntegrateVelocityVerletStepKernel {
+class ReferenceIntegrateNoseHooverStepKernel : public IntegrateNoseHooverStepKernel {
 public:
-    ReferenceIntegrateVelocityVerletStepKernel(std::string name, const Platform& platform, ReferencePlatform::PlatformData& data) : IntegrateVelocityVerletStepKernel(name, platform),
+    ReferenceIntegrateNoseHooverStepKernel(std::string name, const Platform& platform, ReferencePlatform::PlatformData& data) : IntegrateNoseHooverStepKernel(name, platform),
        data(data), dynamics(0) {
    }
-    ~ReferenceIntegrateVelocityVerletStepKernel();
+    ~ReferenceIntegrateNoseHooverStepKernel();
    /**
     * Initialize the kernel.
     * 
@@ -1168,10 +1168,56 @@ public:
     * @param integrator the NoseHooverIntegrator this kernel is being used for
     */
    double computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator);
+    /**
+     * Execute the kernel that propagates the Nose Hoover chain and determines the velocity scale factor.
+     * 
+     * @param context  the context in which to execute this kernel
+     * @param noseHooverChain the object describing the chain to be propagated.
+     * @param kineticEnergy the {center of mass, relative} kineticEnergies of the particles being thermostated by this chain.
+     * @param timeStep the time step used by the integrator.
+     * @return the velocity scale factor to apply to the particles associated with this heat bath.
+     */
+    std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> kineticEnergy, double timeStep);
+    /**
+     * Execute the kernal that computes the total (kinetic + potential) heat bath energy.
+     *
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose energy is to be determined.
+     * @return the total heat bath energy.
+     */
+    double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain);
+    /**
+     * Execute the kernel that computes the kinetic energy for a subset of atoms,
+     * or the relative kinetic energy of Drude particles with respect to their parent atoms
+     *
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose energy is to be determined.
+     * @param downloadValue whether the computed value should be downloaded and returned.
+     */
+    std::pair<double, double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);
+    /**
+     * Execute the kernel that scales the velocities of particles associated with a nose hoover chain
+     *
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose energy is to be determined.
+     * @param scaleFactor the multiplicative factor by which {absolute, relative} velocities are scaled.
+     */
+    void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactor);
+    /**
+     * Write the chain states to a checkpoint.
+     */
+    void createCheckpoint(ContextImpl& context, std::ostream& stream) const;
+    /**
+     * Load the chain states from a checkpoint.
+     */
+    void loadCheckpoint(ContextImpl& context, std::istream& stream);
 private:
    ReferencePlatform::PlatformData& data;
-    ReferenceVelocityVerletDynamics* dynamics;
+    ReferenceNoseHooverChain* chainPropagator;
+    ReferenceNoseHooverDynamics* dynamics;
    std::vector<double> masses;
+    std::vector<std::vector<double> > chainPositions;
+    std::vector<std::vector<double> > chainVelocities;
    double prevStepSize;
 };

@@ -1466,62 +1512,6 @@ private:
    std::vector<double> masses;
 };

-/**
- * This kernel is invoked by NoseHooverChain at the start of each time step to adjust the thermostat
- * and update the associated particle velocities.
- */
-class ReferenceNoseHooverChainKernel : public NoseHooverChainKernel {
-public:
-    ReferenceNoseHooverChainKernel(std::string name, const Platform& platform) : NoseHooverChainKernel(name, platform), chainPropagator(0) {
-    }
-    ~ReferenceNoseHooverChainKernel();
-    /**
-     * Initialize the kernel.
-     */
-    virtual void initialize();
-    /**
-     * Execute the kernel that propagates the Nose Hoover chain and determines the velocity scale factor.
-     * 
-     * @param context  the context in which to execute this kernel
-     * @param noseHooverChain the object describing the chain to be propagated.
-     * @param kineticEnergy the {absolute, relative} kinetic energies of the particles being thermostated by this chain.
-     * @param timeStep the time step used by the integrator.
-     * @return the velocity scale factors to apply to the {absolute, relative} motion of particles associated with this heat bath.
-     */
-    virtual std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergy, double timeStep);
-    /**
-     * Execute the kernal that computes the total (kinetic + potential) heat bath energy.
-     *
-     * @param context the context in which to execute this kernel
-     * @param noseHooverChain the chain whose energy is to be determined.
-     * @return the total heat bath energy.
-     */
-    virtual double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc);
-    /**
-     * Execute the kernel that computes the kinetic energy for a subset of atoms,
-     * or the relative kinetic energy of Drude particles with respect to their parent atoms
-     *
-     * @param context the context in which to execute this kernel
-     * @param noseHooverChain the chain whose energy is to be determined.
-     * @param downloadValue whether the computed value should be downloaded and returned.
-     *
-     */
-     virtual std::pair<double, double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);
-
-    /**
-     * Execute the kernel that scales the velocities of particles associated with a nose hoover chain
-     *
-     * @param context the context in which to execute this kernel
-     * @param noseHooverChain the chain whose energy is to be determined.
-     * @param scaleFactor the multiplicative factor by which {absolute, relative} velocities are scaled.
-     */
-    virtual void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactor);
-
-
-private:
-    ReferenceNoseHooverChain* chainPropagator;
-};
-
 /**
 * This kernel is invoked by MonteCarloBarostat to adjust the periodic box volume
 */

--- a/platforms/reference/include/ReferenceVelocityVerletDynamics.h
+++ b/platforms/reference/include/ReferenceVelocityVerletDynamics.h

-/* Portions copyright (c) 2006-2012 Stanford University and Simbios.
- * Contributors: Pande Group
+/* Portions copyright (c) 2006-2020 Stanford University and Simbios.
+ * Contributors: Andy Simmonett, Peter Eastman, Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
@@ -22,21 +22,24 @@
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

-#ifndef __ReferenceVelocityVerletDynamics_H__
-#define __ReferenceVelocityVerletDynamics_H__
+#ifndef __ReferenceNoseHooverDynamics_H__
+#define __ReferenceNoseHooverDynamics_H__

 #include "ReferenceDynamics.h"
+#include <tuple>

 namespace OpenMM {

 class ContextImpl;

-class ReferenceVelocityVerletDynamics : public ReferenceDynamics {
+class ReferenceNoseHooverDynamics : public ReferenceDynamics {

   private:
      std::vector<OpenMM::Vec3> xPrime;
+      std::vector<OpenMM::Vec3> oldx;
      std::vector<double> inverseMasses;
-      
+      int numberOfAtoms;
+
   public:

      /**---------------------------------------------------------------------------------------
@@ -50,7 +53,7 @@ class ReferenceVelocityVerletDynamics : public ReferenceDynamics {
      
         --------------------------------------------------------------------------------------- */

-       ReferenceVelocityVerletDynamics(int numberOfAtoms, double deltaT);
+       ReferenceNoseHooverDynamics(int numberOfAtoms, double deltaT);

      /**---------------------------------------------------------------------------------------
      
@@ -58,11 +61,11 @@ class ReferenceVelocityVerletDynamics : public ReferenceDynamics {
      
         --------------------------------------------------------------------------------------- */

-       ~ReferenceVelocityVerletDynamics();
+       ~ReferenceNoseHooverDynamics();

      /**---------------------------------------------------------------------------------------
      
-         Update
+         Perform the first half of a step using the leapfrog LF-Middle scheme
      
         @param system              the System to be integrated
         @param atomCoordinates     atom coordinates
@@ -77,12 +80,31 @@ class ReferenceVelocityVerletDynamics : public ReferenceDynamics {
      
         --------------------------------------------------------------------------------------- */
     
-      void update(OpenMM::ContextImpl &context, const OpenMM::System& system, std::vector<OpenMM::Vec3>& atomCoordinates,
-                  std::vector<OpenMM::Vec3>& velocities, std::vector<OpenMM::Vec3>& forces, std::vector<double>& masses, double tolerance, bool &forcesAreValid,
-                  const std::vector<int> & allAtoms, const std::vector<std::tuple<int, int, double>> & allPairs, double maxPairDistance);
+      void step1(OpenMM::ContextImpl &context, const OpenMM::System& system, std::vector<OpenMM::Vec3>& atomCoordinates,
+                 std::vector<OpenMM::Vec3>& velocities, std::vector<OpenMM::Vec3>& forces, std::vector<double>& masses, double tolerance, bool &forcesAreValid,
+                 const std::vector<int> & allAtoms, const std::vector<std::tuple<int, int, double>> & allPairs, double maxPairDistance);
+      /**---------------------------------------------------------------------------------------
+      
+         Perform the second half of a step using the leapfrog LF-Middle scheme
+      
+         @param system              the System to be integrated
+         @param atomCoordinates     atom coordinates
+         @param velocities          velocities
+         @param forces              forces
+         @param masses              atom masses
+         @param tolerance           the constraint tolerance
+         @param forcesAreValid      whether the forces are valid (duh!)
+         @param allAtoms            a list of all atoms not involved in a Drude-like pair
+         @param allPairs            a list of all Drude-like pairs, and their KT values, in the system
+         @param maxPairDistance     the maximum separation allowed for a Drude-like pair
+      
+         --------------------------------------------------------------------------------------- */
+      void step2(OpenMM::ContextImpl &context, const OpenMM::System& system, std::vector<OpenMM::Vec3>& atomCoordinates,
+                 std::vector<OpenMM::Vec3>& velocities, std::vector<OpenMM::Vec3>& forces, std::vector<double>& masses, double tolerance, bool &forcesAreValid,
+                 const std::vector<int> & allAtoms, const std::vector<std::tuple<int, int, double>> & allPairs, double maxPairDistance);
      
 };

 } // namespace OpenMM

-#endif // __ReferenceVelocityVerletDynamics_H__
+#endif // __ReferenceNoseHooverDynamics_H__
--- a/platforms/reference/include/ReferencePlatform.h
+++ b/platforms/reference/include/ReferencePlatform.h
@@ -72,8 +72,6 @@ public:
    Vec3* periodicBoxVectors;
    ReferenceConstraints* constraints;
    std::map<std::string, double>* energyParameterDerivatives;
-    std::vector<std::vector<double>>* noseHooverPositions;
-    std::vector<std::vector<double>>* noseHooverVelocities;
 };
 } // namespace OpenMM


--- a/platforms/reference/src/ReferenceKernelFactory.cpp
+++ b/platforms/reference/src/ReferenceKernelFactory.cpp
@@ -88,10 +88,8 @@ KernelImpl* ReferenceKernelFactory::createKernelImpl(std::string name, const Pla
        return new ReferenceCalcGayBerneForceKernel(name, platform);
    if (name == IntegrateVerletStepKernel::Name())
        return new ReferenceIntegrateVerletStepKernel(name, platform, data);
-    if (name == IntegrateVelocityVerletStepKernel::Name())
-        return new ReferenceIntegrateVelocityVerletStepKernel(name, platform, data);
-    if (name == NoseHooverChainKernel::Name())
-        return new ReferenceNoseHooverChainKernel(name, platform);
+    if (name == IntegrateNoseHooverStepKernel::Name())
+        return new ReferenceIntegrateNoseHooverStepKernel(name, platform, data);
    if (name == IntegrateLangevinStepKernel::Name())
        return new ReferenceIntegrateLangevinStepKernel(name, platform, data);
    if (name == IntegrateLangevinMiddleStepKernel::Name())

--- a/platforms/reference/src/ReferenceKernels.cpp
+++ b/platforms/reference/src/ReferenceKernels.cpp
@@ -57,6 +57,7 @@
 #include "ReferenceLJCoulombIxn.h"
 #include "ReferenceMonteCarloBarostat.h"
 #include "ReferenceNoseHooverChain.h"
+#include "ReferenceNoseHooverDynamics.h"
 #include "ReferenceProperDihedralBond.h"
 #include "ReferenceRbDihedralBond.h"
 #include "ReferenceRMSDForce.h"
@@ -64,7 +65,6 @@
 #include "ReferenceTabulatedFunction.h"
 #include "ReferenceVariableStochasticDynamics.h"
 #include "ReferenceVariableVerletDynamics.h"
-#include "ReferenceVelocityVerletDynamics.h"
 #include "ReferenceVerletDynamics.h"
 #include "ReferenceVirtualSites.h"
 #include "openmm/CMMotionRemover.h"
@@ -127,15 +127,6 @@ static map<string, double>& extractEnergyParameterDerivatives(ContextImpl& conte
    return *data->energyParameterDerivatives;
 }

-static vector<vector<double> >& extractNoseHooverPositions(ContextImpl& context) {
-    ReferencePlatform::PlatformData *data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());  
-    return *((vector<vector<double> >*) data->noseHooverPositions);
-}
-
-static vector<vector<double> >& extractNoseHooverVelocities(ContextImpl& context) {
-    ReferencePlatform::PlatformData *data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());  
-    return *((vector<vector<double> >*) data->noseHooverVelocities);
-}
 /**
 * Make sure an expression doesn't use any undefined variables.
 */
@@ -297,20 +288,6 @@ void ReferenceUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostr
    stream.write((char*) &velData[0], sizeof(Vec3)*velData.size());
    Vec3* vectors = extractBoxVectors(context);
    stream.write((char*) vectors, 3*sizeof(Vec3));
-    auto& allNoseHooverPositions = extractNoseHooverPositions(context);
-    auto& allNoseHooverVelocities = extractNoseHooverVelocities(context);
-    size_t numChains = allNoseHooverPositions.size();
-    assert(numChains == allNoseHooverVelocities.size());
-    stream.write((char*) &numChains, sizeof(size_t));
-    for (size_t i=0; i<numChains; i++){
-        auto & noseHooverPositions = allNoseHooverPositions.at(i);
-        auto & noseHooverVelocities = allNoseHooverVelocities.at(i);
-        size_t numBeads = noseHooverPositions.size();
-        assert(numBeads == noseHooverVelocities.size());
-        stream.write((char*) &numBeads, sizeof(size_t));
-        stream.write((char*) noseHooverPositions.data(), sizeof(double)*numBeads);
-        stream.write((char*) noseHooverVelocities.data(), sizeof(double)*numBeads);
-    }
    SimTKOpenMMUtilities::createCheckpoint(stream);
 }

@@ -326,21 +303,6 @@ void ReferenceUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istrea
    stream.read((char*) &velData[0], sizeof(Vec3)*velData.size());
    Vec3* vectors = extractBoxVectors(context);
    stream.read((char*) vectors, 3*sizeof(Vec3));
-    size_t numChains, numBeads;
-    auto& allNoseHooverPositions = extractNoseHooverPositions(context);
-    auto& allNoseHooverVelocities = extractNoseHooverVelocities(context);
-    stream.read((char*) &numChains, sizeof(size_t));
-    allNoseHooverPositions.clear();
-    allNoseHooverVelocities.clear();
-    for (size_t i=0; i<numChains; i++){
-        stream.read((char*) &numBeads, sizeof(size_t));
-        std::vector<double> noseHooverPositions(numBeads);
-        std::vector<double> noseHooverVelocities(numBeads);
-        stream.read((char*) &noseHooverPositions[0], sizeof(double)*numBeads);
-        stream.read((char*) &noseHooverVelocities[0], sizeof(double)*numBeads);
-        allNoseHooverPositions.push_back(noseHooverPositions);
-        allNoseHooverVelocities.push_back(noseHooverVelocities);
-    }
    SimTKOpenMMUtilities::loadCheckpoint(stream);
 }

@@ -2153,19 +2115,22 @@ double ReferenceIntegrateVerletStepKernel::computeKineticEnergy(ContextImpl& con
    return computeShiftedKineticEnergy(context, masses, 0.5*integrator.getStepSize());
 }

-ReferenceIntegrateVelocityVerletStepKernel::~ReferenceIntegrateVelocityVerletStepKernel() {
+ReferenceIntegrateNoseHooverStepKernel::~ReferenceIntegrateNoseHooverStepKernel() {
+    if (chainPropagator)
+        delete chainPropagator;
    if (dynamics)
        delete dynamics;
 }

-void ReferenceIntegrateVelocityVerletStepKernel::initialize(const System& system, const NoseHooverIntegrator& integrator) {
+void ReferenceIntegrateNoseHooverStepKernel::initialize(const System& system, const NoseHooverIntegrator& integrator) {
    int numParticles = system.getNumParticles();
    masses.resize(numParticles);
    for (int i = 0; i < numParticles; ++i)
        masses[i] = system.getParticleMass(i);
+    this->chainPropagator = new ReferenceNoseHooverChain();
 }

-void ReferenceIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid) {
+void ReferenceIntegrateNoseHooverStepKernel::execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid) {
    double stepSize = integrator.getStepSize();
    vector<Vec3>& posData = extractPositions(context);
    vector<Vec3>& velData = extractVelocities(context);
@@ -2174,20 +2139,234 @@ void ReferenceIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, c
        // Recreate the computation objects with the new parameters.
        if (dynamics)
            delete dynamics;
-        dynamics = new ReferenceVelocityVerletDynamics(context.getSystem().getNumParticles(), stepSize);
+        dynamics = new ReferenceNoseHooverDynamics(context.getSystem().getNumParticles(), stepSize);
        dynamics->setReferenceConstraintAlgorithm(&extractConstraints(context));
        prevStepSize = stepSize;
    }
-    dynamics->update(context, context.getSystem(), posData, velData, forceData, masses, integrator.getConstraintTolerance(), forcesAreValid,
+    dynamics->step1(context, context.getSystem(), posData, velData, forceData, masses, integrator.getConstraintTolerance(), forcesAreValid,
+                     integrator.getAllThermostatedIndividualParticles(), integrator.getAllThermostatedPairs(), integrator.getMaximumPairDistance());
+    int numChains = integrator.getNumThermostats();
+    for(int chain = 0; chain < numChains; ++chain) {
+        const auto &thermostatChain = integrator.getThermostat(chain);
+        std::pair<double, double> KEs = computeMaskedKineticEnergy(context, thermostatChain, true);
+        std::pair<double, double> scaleFactors = propagateChain(context, thermostatChain, KEs, stepSize);
+        scaleVelocities(context, thermostatChain, scaleFactors);
+    }
+    dynamics->step2(context, context.getSystem(), posData, velData, forceData, masses, integrator.getConstraintTolerance(), forcesAreValid,
                     integrator.getAllThermostatedIndividualParticles(), integrator.getAllThermostatedPairs(), integrator.getMaximumPairDistance());
    data.time += stepSize;
    data.stepCount++;
 }

-double ReferenceIntegrateVelocityVerletStepKernel::computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator) {
+double ReferenceIntegrateNoseHooverStepKernel::computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator) {
    return computeShiftedKineticEnergy(context, masses, 0);
 }

+std::pair<double, double> ReferenceIntegrateNoseHooverStepKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc,
+                                                                                     std::pair<double, double> kineticEnergy, double timeStep) {
+    double absKE = kineticEnergy.first;
+    double relKE = kineticEnergy.second;
+    if (absKE < 1e-8) return {1.0, 1.0};  // (catches the problem of zero velocities in the first dynamics step, where we have nothing to scale)
+    // Get the variables describing the NHC
+    int chainLength = nhc.getChainLength();
+    int chainID = nhc.getChainID();
+    int numDOFs = nhc.getNumDegreesOfFreedom();
+    int numMTS = nhc.getNumMultiTimeSteps();
+
+    int nAtoms = nhc.getThermostatedAtoms().size();
+    double absScale = 0;
+    if (nAtoms) {
+        if (chainPositions.size() < 2*chainID+1){
+            chainPositions.resize(2*chainID+1);
+        }
+        if (chainVelocities.size() < 2*chainID+1){
+            chainVelocities.resize(2*chainID+1);
+        }
+        auto& positions = chainPositions.at(2*chainID);
+        auto& velocities = chainVelocities.at(2*chainID);
+        if (positions.size() < chainLength){
+            positions.resize(chainLength, 0);
+        }
+        if (velocities.size() < chainLength){
+            velocities.resize(chainLength, 0);
+        }
+        double temperature = nhc.getTemperature();
+        double collisionFrequency = nhc.getCollisionFrequency();
+        absScale = chainPropagator->propagate(absKE, velocities, positions, numDOFs,
+                                              temperature, collisionFrequency, timeStep,
+                                              numMTS, nhc.getYoshidaSuzukiWeights());
+    }
+    double relScale = 0;
+    int nPairs = nhc.getThermostatedPairs().size();
+    if (nPairs) {
+        if (chainPositions.size() < 2*chainID+2){
+            chainPositions.resize(2*chainID+2);
+        }
+        if (chainVelocities.size() < 2*chainID+2){
+            chainVelocities.resize(2*chainID+2);
+        }
+        auto& positions = chainPositions.at(2*chainID+1);
+        auto& velocities = chainVelocities.at(2*chainID+1);
+        if (positions.size() < chainLength){
+            positions.resize(chainLength, 0);
+        }
+        if (velocities.size() < chainLength){
+            velocities.resize(chainLength, 0);
+        }
+        double temperature = nhc.getRelativeTemperature();
+        double collisionFrequency = nhc.getRelativeCollisionFrequency();
+        relScale = chainPropagator->propagate(relKE, velocities, positions, 3*nPairs,
+                                              temperature, collisionFrequency, timeStep,
+                                              numMTS, nhc.getYoshidaSuzukiWeights());
+    }
+    return {absScale, relScale};
+}
+
+double ReferenceIntegrateNoseHooverStepKernel::computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc) {
+    double potentialEnergy = 0;
+    double kineticEnergy = 0;
+    int chainLength = nhc.getChainLength();
+    int chainID = nhc.getChainID();
+    int nAtoms = nhc.getThermostatedAtoms().size();
+    int nPairs = nhc.getThermostatedPairs().size();
+    if (nAtoms) {
+        double temperature = nhc.getTemperature();
+        double collisionFrequency = nhc.getCollisionFrequency();
+        double kT = temperature * BOLTZ;
+        int numDOFs = nhc.getNumDegreesOfFreedom();
+        for(int i = 0; i < chainLength; ++i) {
+            double prefac = i ? 1 : numDOFs;
+            double mass = prefac * kT / (collisionFrequency * collisionFrequency);
+            double velocity = chainVelocities[2*chainID][i];
+            // The kinetic energy of this bead
+            kineticEnergy += 0.5 * mass * velocity * velocity;
+            // The potential energy of this bead
+            double position = chainPositions[2*chainID][i];
+            potentialEnergy += prefac * kT * position;
+        }
+    }
+    if (nPairs) {
+        double temperature = nhc.getRelativeTemperature();
+        double collisionFrequency = nhc.getRelativeCollisionFrequency();
+        double kT = temperature * BOLTZ;
+        int numDOFs = 3 * nPairs;
+        for(int i = 0; i < chainLength; ++i) {
+            double prefac = i ? 1 : numDOFs;
+            double mass = prefac * kT / (collisionFrequency * collisionFrequency);
+            double velocity = chainVelocities[2*chainID+1][i];
+            // The kinetic energy of this bead
+            kineticEnergy += 0.5 * mass * velocity * velocity;
+            // The potential energy of this bead
+            double position = chainPositions[2*chainID+1][i];
+            potentialEnergy += prefac * kT * position;
+        }
+    }
+    return kineticEnergy + potentialEnergy;
+}
+
+std::pair<double, double> ReferenceIntegrateNoseHooverStepKernel::computeMaskedKineticEnergy(ContextImpl& context,
+                                                                const NoseHooverChain &noseHooverChain, bool downloadValue) {
+    const std::vector<int>& atomsList = noseHooverChain.getThermostatedAtoms();
+    const std::vector<std::pair<int,int>>& pairsList = noseHooverChain.getThermostatedPairs();
+    std::vector<Vec3>& velocities = extractVelocities(context);
+    const System& system = context.getSystem();
+    int numParticles = system.getNumParticles();
+    std::vector<double> masses(numParticles);
+    for (int i = 0; i < numParticles; ++i)
+        masses[i] = system.getParticleMass(i);
+
+    double comKE = 0;
+    double relKE = 0;
+    // kinetic energy of individual atoms
+    for (const auto &m: atomsList){
+        comKE += 0.5 * masses[m] * velocities[m].dot(velocities[m]);
+    }
+    // center of mass kinetic energy of pairs
+    for (const auto &p: pairsList){
+        double m1 = masses[p.first];
+        double m2 = masses[p.second];
+        Vec3 v1 = velocities[p.first];
+        Vec3 v2 = velocities[p.second];
+        double invMass = 1.0 / (m1 + m2);
+        double redMass = m1 * m2 * invMass;
+        double fracM1 = m1 * invMass;
+        double fracM2 = m2 * invMass;
+        Vec3 comVelocity = fracM1 * v1 + fracM2 * v2;
+        Vec3 relVelocity = v2 - v1;
+
+        comKE += 0.5 * (m1 + m2) * comVelocity.dot(comVelocity);
+        relKE += 0.5 * redMass * relVelocity.dot(relVelocity);
+    }
+    // We ignore the downloadValue argument here and always return the correct value
+    return {comKE, relKE};
+}
+
+
+void ReferenceIntegrateNoseHooverStepKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactors) {
+    const auto& atoms = noseHooverChain.getThermostatedAtoms();
+    const auto& pairs = noseHooverChain.getThermostatedPairs();
+    std::vector<Vec3>& velocities = extractVelocities(context);
+    double absScale = scaleFactors.first;
+    double relScale = scaleFactors.second;
+
+    const System& system = context.getSystem();
+    int numParticles = system.getNumParticles();
+    std::vector<double> masses(numParticles);
+    for (int i = 0; i < numParticles; ++i)
+        masses[i] = system.getParticleMass(i);
+    // scale absolute velocities
+    for (const auto &a: atoms){
+        velocities[a] *= absScale;
+    }
+    // scale relative velocities and absolute center of mass velocities for each pair
+    for (const auto &p: pairs){
+        int p1 = p.first;
+        int p2 = p.second;
+        double m1 = masses[p.first];
+        double m2 = masses[p.second];
+        Vec3 v1 = velocities[p.first];
+        Vec3 v2 = velocities[p.second];
+        double invMass = 1.0 / (m1 + m2);
+        double fracM1 = m1 * invMass;
+        double fracM2 = m2 * invMass;
+        Vec3 comVelocity = fracM1 * v1 + fracM2 * v2;
+        Vec3 relVelocity = v2 - v1;
+        velocities[p1] = absScale * comVelocity - relScale * relVelocity * fracM2;
+        velocities[p2] = absScale * comVelocity + relScale * relVelocity * fracM1;
+    }
+}
+
+void ReferenceIntegrateNoseHooverStepKernel::createCheckpoint(ContextImpl& context, ostream& stream) const {
+    size_t numChains = chainPositions.size();
+    assert(numChains == chainVelocities.size());
+    stream.write((char*) &numChains, sizeof(size_t));
+    for (size_t i=0; i<numChains; i++){
+        auto & noseHooverPositions = chainPositions.at(i);
+        auto & noseHooverVelocities = chainVelocities.at(i);
+        size_t numBeads = noseHooverPositions.size();
+        assert(numBeads == noseHooverVelocities.size());
+        stream.write((char*) &numBeads, sizeof(size_t));
+        stream.write((char*) noseHooverPositions.data(), sizeof(double)*numBeads);
+        stream.write((char*) noseHooverVelocities.data(), sizeof(double)*numBeads);
+    }
+}
+
+void ReferenceIntegrateNoseHooverStepKernel::loadCheckpoint(ContextImpl& context, istream& stream) {
+    size_t numChains, numBeads;
+    stream.read((char*) &numChains, sizeof(size_t));
+    chainPositions.clear();
+    chainVelocities.clear();
+    for (size_t i=0; i<numChains; i++){
+        stream.read((char*) &numBeads, sizeof(size_t));
+        std::vector<double> noseHooverPositions(numBeads);
+        std::vector<double> noseHooverVelocities(numBeads);
+        stream.read((char*) &noseHooverPositions[0], sizeof(double)*numBeads);
+        stream.read((char*) &noseHooverVelocities[0], sizeof(double)*numBeads);
+        chainPositions.push_back(noseHooverPositions);
+        chainVelocities.push_back(noseHooverVelocities);
+    }
+}
+
 ReferenceIntegrateLangevinStepKernel::~ReferenceIntegrateLangevinStepKernel() {
    if (dynamics)
        delete dynamics;
@@ -2512,195 +2691,6 @@ void ReferenceApplyAndersenThermostatKernel::execute(ContextImpl& context) {
        context.getIntegrator().getStepSize());
 }

-ReferenceNoseHooverChainKernel::~ReferenceNoseHooverChainKernel() {
-    if (chainPropagator)
-        delete chainPropagator;
-}
-
-void ReferenceNoseHooverChainKernel::initialize() {
-    this->chainPropagator = new ReferenceNoseHooverChain();
-    //SimTKOpenMMUtilities::setRandomNumberSeed((unsigned int) thermostat.getRandomNumberSeed());
-}
-
-std::pair<double, double> ReferenceNoseHooverChainKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergy, double timeStep) {
-    double absKE = kineticEnergy.first;
-    double relKE = kineticEnergy.second;
-    if (absKE < 1e-8) return {1.0, 1.0};  // (catches the problem of zero velocities in the first dynamics step, where we have nothing to scale)
-    // Get the variables describing the NHC
-    int chainLength = nhc.getChainLength();
-    int chainID = nhc.getChainID();
-    int numDOFs = nhc.getNumDegreesOfFreedom();
-    int numMTS = nhc.getNumMultiTimeSteps();
-
-    // Get the state of the NHC from the context
-    auto& allChainPositions = extractNoseHooverPositions(context);
-    auto& allChainVelocities = extractNoseHooverVelocities(context);
-
-    int nAtoms = nhc.getThermostatedAtoms().size();
-    double absScale = 0;
-    if (nAtoms) {
-        if (allChainPositions.size() < 2*chainID+1){
-            allChainPositions.resize(2*chainID+1);
-        }
-        if (allChainVelocities.size() < 2*chainID+1){
-            allChainVelocities.resize(2*chainID+1);
-        }
-        auto& chainPositions = allChainPositions.at(2*chainID);
-        auto& chainVelocities = allChainVelocities.at(2*chainID);
-        if (chainPositions.size() < chainLength){
-            chainPositions.resize(chainLength, 0);
-        }
-        if (chainVelocities.size() < chainLength){
-            chainVelocities.resize(chainLength, 0);
-        }
-        double temperature = nhc.getTemperature();
-        double collisionFrequency = nhc.getCollisionFrequency();
-        absScale = chainPropagator->propagate(absKE, chainVelocities, chainPositions, numDOFs,
-                                              temperature, collisionFrequency, timeStep,
-                                              numMTS, nhc.getYoshidaSuzukiWeights());
-    }
-    double relScale = 0;
-    int nPairs = nhc.getThermostatedPairs().size();
-    if (nPairs) {
-        if (allChainPositions.size() < 2*chainID+2){
-            allChainPositions.resize(2*chainID+2);
-        }
-        if (allChainVelocities.size() < 2*chainID+2){
-            allChainVelocities.resize(2*chainID+2);
-        }
-        auto& chainPositions = allChainPositions.at(2*chainID+1);
-        auto& chainVelocities = allChainVelocities.at(2*chainID+1);
-        if (chainPositions.size() < chainLength){
-            chainPositions.resize(chainLength, 0);
-        }
-        if (chainVelocities.size() < chainLength){
-            chainVelocities.resize(chainLength, 0);
-        }
-        double temperature = nhc.getRelativeTemperature();
-        double collisionFrequency = nhc.getRelativeCollisionFrequency();
-        relScale = chainPropagator->propagate(relKE, chainVelocities, chainPositions, 3*nPairs,
-                                              temperature, collisionFrequency, timeStep,
-                                              numMTS, nhc.getYoshidaSuzukiWeights());
-    }
-    return {absScale, relScale};
-}
-
-double ReferenceNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc) {
-    double potentialEnergy = 0;
-    double kineticEnergy = 0;
-    int chainLength = nhc.getChainLength();
-    int chainID = nhc.getChainID();
-    int nAtoms = nhc.getThermostatedAtoms().size();
-    int nPairs = nhc.getThermostatedPairs().size();
-    auto& nhcPositions = extractNoseHooverPositions(context);
-    auto& nhcVelocities = extractNoseHooverVelocities(context);
-    if (nAtoms) {
-        double temperature = nhc.getTemperature();
-        double collisionFrequency = nhc.getCollisionFrequency();
-        double kT = temperature * BOLTZ;
-        int numDOFs = nhc.getNumDegreesOfFreedom();
-        for(int i = 0; i < chainLength; ++i) {
-            double prefac = i ? 1 : numDOFs;
-            double mass = prefac * kT / (collisionFrequency * collisionFrequency);
-            double velocity = nhcVelocities[2*chainID][i];
-            // The kinetic energy of this bead
-            kineticEnergy += 0.5 * mass * velocity * velocity;
-            // The potential energy of this bead
-            double position = nhcPositions[2*chainID][i];
-            potentialEnergy += prefac * kT * position;
-        }
-    }
-    if (nPairs) {
-        double temperature = nhc.getRelativeTemperature();
-        double collisionFrequency = nhc.getRelativeCollisionFrequency();
-        double kT = temperature * BOLTZ;
-        int numDOFs = 3 * nPairs;
-        for(int i = 0; i < chainLength; ++i) {
-            double prefac = i ? 1 : numDOFs;
-            double mass = prefac * kT / (collisionFrequency * collisionFrequency);
-            double velocity = nhcVelocities[2*chainID+1][i];
-            // The kinetic energy of this bead
-            kineticEnergy += 0.5 * mass * velocity * velocity;
-            // The potential energy of this bead
-            double position = nhcPositions[2*chainID+1][i];
-            potentialEnergy += prefac * kT * position;
-        }
-    }
-    return kineticEnergy + potentialEnergy;
-}
-
-std::pair<double, double> ReferenceNoseHooverChainKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue) {
-    const std::vector<int>& atomsList = noseHooverChain.getThermostatedAtoms();
-    const std::vector<std::pair<int,int>>& pairsList = noseHooverChain.getThermostatedPairs();
-    std::vector<Vec3>& velocities = extractVelocities(context);
-    const System& system = context.getSystem();
-    int numParticles = system.getNumParticles();
-    std::vector<double> masses(numParticles);
-    for (int i = 0; i < numParticles; ++i)
-        masses[i] = system.getParticleMass(i);
-
-    double comKE = 0;
-    double relKE = 0;
-    // kinetic energy of individual atoms
-    for (const auto &m: atomsList){
-        comKE += 0.5 * masses[m] * velocities[m].dot(velocities[m]);
-    }
-    // center of mass kinetic energy of pairs
-    for (const auto &p: pairsList){
-        double m1 = masses[p.first];
-        double m2 = masses[p.second];
-        Vec3 v1 = velocities[p.first];
-        Vec3 v2 = velocities[p.second];
-        double invMass = 1.0 / (m1 + m2);
-        double redMass = m1 * m2 * invMass;
-        double fracM1 = m1 * invMass;
-        double fracM2 = m2 * invMass;
-        Vec3 comVelocity = fracM1 * v1 + fracM2 * v2;
-        Vec3 relVelocity = v2 - v1;
-
-        comKE += 0.5 * (m1 + m2) * comVelocity.dot(comVelocity);
-        relKE += 0.5 * redMass * relVelocity.dot(relVelocity);
-    }
-    // We ignore the downloadValue argument here and always return the correct value
-    return {comKE, relKE};
-}
-
-
-void ReferenceNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactors) {
-    const auto& atoms = noseHooverChain.getThermostatedAtoms();
-    const auto& pairs = noseHooverChain.getThermostatedPairs();
-    std::vector<Vec3>& velocities = extractVelocities(context);
-    double absScale = scaleFactors.first;
-    double relScale = scaleFactors.second;
-
-    const System& system = context.getSystem();
-    int numParticles = system.getNumParticles();
-    std::vector<double> masses(numParticles);
-    for (int i = 0; i < numParticles; ++i)
-        masses[i] = system.getParticleMass(i);
-    // scale absolute velocities
-    for (const auto &a: atoms){
-        velocities[a] *= absScale;
-    }
-    // scale relative velocities and absolute center of mass velocities for each pair
-    for (const auto &p: pairs){
-        int p1 = p.first;
-        int p2 = p.second;
-        double m1 = masses[p.first];
-        double m2 = masses[p.second];
-        Vec3 v1 = velocities[p.first];
-        Vec3 v2 = velocities[p.second];
-        double invMass = 1.0 / (m1 + m2);
-        double fracM1 = m1 * invMass;
-        double fracM2 = m2 * invMass;
-        Vec3 comVelocity = fracM1 * v1 + fracM2 * v2;
-        Vec3 relVelocity = v2 - v1;
-        velocities[p1] = absScale * comVelocity - relScale * relVelocity * fracM2;
-        velocities[p2] = absScale * comVelocity + relScale * relVelocity * fracM1;
-    }
-}
-
-
 ReferenceApplyMonteCarloBarostatKernel::~ReferenceApplyMonteCarloBarostatKernel() {
    if (barostat)
        delete barostat;

--- a/platforms/reference/src/ReferencePlatform.cpp
+++ b/platforms/reference/src/ReferencePlatform.cpp
@@ -66,8 +66,7 @@ ReferencePlatform::ReferencePlatform() {
    registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
-    registerKernelFactory(IntegrateVelocityVerletStepKernel::Name(), factory);
-    registerKernelFactory(NoseHooverChainKernel::Name(), factory);
+    registerKernelFactory(IntegrateNoseHooverStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinMiddleStepKernel::Name(), factory);
    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
@@ -104,8 +103,6 @@ ReferencePlatform::PlatformData::PlatformData(const System& system) : time(0.0),
    periodicBoxVectors = new Vec3[3];
    constraints = new ReferenceConstraints(system);
    energyParameterDerivatives = new map<string, double>();
-    noseHooverPositions = new vector<vector<double> >();
-    noseHooverVelocities = new vector<vector<double> >();
 }

 ReferencePlatform::PlatformData::~PlatformData() {
@@ -116,6 +113,4 @@ ReferencePlatform::PlatformData::~PlatformData() {
    delete[] periodicBoxVectors;
    delete constraints;
    delete energyParameterDerivatives;
-    delete noseHooverPositions;
-    delete noseHooverVelocities;
 }
--- a/platforms/reference/src/SimTKReference/ReferenceCMAPTorsionIxn.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceCMAPTorsionIxn.cpp

-/* Portions copyright (c) 2010-2016 Stanford University and Simbios.
+/* Portions copyright (c) 2010-2020 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -133,8 +133,8 @@ void ReferenceCMAPTorsionIxn::calculateOneIxn(int index, vector<Vec3>& atomCoord

    int size = (int) sqrt(coeff[map].size());
    double delta = 2*M_PI/size;
-    int s = (int) (angleA/delta);
-    int t = (int) (angleB/delta);
+    int s = (int) fmin(angleA/delta, size-1);
+    int t = (int) fmin(angleB/delta, size-1);
    const vector<double>& c = coeff[map][s+size*t];
    double da = angleA/delta-s;
    double db = angleB/delta-t;

--- a/platforms/reference/src/SimTKReference/ReferenceNoseHooverChain.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceNoseHooverChain.cpp
@@ -68,27 +68,27 @@ double ReferenceNoseHooverChain::propagate(double kineticEnergy, vector<double>&
     for (int mts = 0; mts < numMTS; ++mts) {
         for (const auto &ys : YSWeights) {
             double wdt = ys * timeStep / numMTS;
-             chainVelocities.back() += 0.25 * wdt * chainForces.back();
+             chainVelocities.back() += 0.5 * wdt * chainForces.back();
             for (int bead = chainLength - 2; bead >= 0; --bead) {
-                 double aa = exp(-0.125 * wdt * chainVelocities[bead + 1]);
-                 chainVelocities[bead] = aa * (chainVelocities[bead] * aa + 0.25 * wdt * chainForces[bead]);
+                 double aa = exp(-0.25 * wdt * chainVelocities[bead + 1]);
+                 chainVelocities[bead] = aa * (chainVelocities[bead] * aa + 0.5 * wdt * chainForces[bead]);
             }
             // update particle velocities
-             double aa = exp(-0.5 * wdt * chainVelocities[0]);
+             double aa = exp(-wdt * chainVelocities[0]);
             scale *= aa;
             // update the thermostat positions
             for (int bead = 0; bead < chainLength; ++bead) {
-                 chainPositions[bead] += 0.5 * chainVelocities[bead] * wdt;
+                 chainPositions[bead] += chainVelocities[bead] * wdt;
             }
             // update the forces
             chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0];
             // update thermostat velocities
             for (int bead = 0; bead < chainLength - 1; ++bead) {
-                 double aa = exp(-0.125 * wdt * chainVelocities[bead + 1]);
-                 chainVelocities[bead] = aa * (aa * chainVelocities[bead] + 0.25 * wdt * chainForces[bead]);
+                 double aa = exp(-0.25 * wdt * chainVelocities[bead + 1]);
+                 chainVelocities[bead] = aa * (aa * chainVelocities[bead] + 0.5 * wdt * chainForces[bead]);
                 chainForces[bead + 1] = (chainMasses[bead] * chainVelocities[bead] * chainVelocities[bead] - kT) / chainMasses[bead + 1];
             }
-             chainVelocities[chainLength-1] += 0.25 * wdt * chainForces.back();
+             chainVelocities[chainLength-1] += 0.5 * wdt * chainForces.back();
         }  // YS loop
     } // MTS loop
     return scale;

--- a/platforms/reference/src/SimTKReference/ReferenceVelocityVerletDynamics.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceVelocityVerletDynamics.cpp

-/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
- * Contributors: Peter Eastman, Pande Group
+/* Portions copyright (c) 2006-2020 Stanford University and Simbios.
+ * Contributors: Andy Simmonett, Peter Eastman, Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
@@ -29,7 +29,7 @@
 #include "openmm/OpenMMException.h"
 #include "SimTKOpenMMUtilities.h"
 #include "openmm/internal/ContextImpl.h"
-#include "ReferenceVelocityVerletDynamics.h"
+#include "ReferenceNoseHooverDynamics.h"
 #include "ReferenceVirtualSites.h"

 #include <cstdio>
@@ -37,49 +37,18 @@
 using std::vector;
 using namespace OpenMM;

-/**---------------------------------------------------------------------------------------
-
-   ReferenceVelocityVerletDynamics constructor
-
-   @param numberOfAtoms  number of atoms
-   @param deltaT         delta t for dynamics
-   @param friction       friction coefficient
-   @param temperature    temperature
-
-   --------------------------------------------------------------------------------------- */
-
-ReferenceVelocityVerletDynamics::ReferenceVelocityVerletDynamics(int numberOfAtoms, double deltaT) :
-           ReferenceDynamics(numberOfAtoms, deltaT, 0.0) {
+ReferenceNoseHooverDynamics::ReferenceNoseHooverDynamics(int numberOfAtomsIn, double deltaT) :
+           ReferenceDynamics(numberOfAtomsIn, deltaT, 0.0) {
+   numberOfAtoms = numberOfAtomsIn;
   xPrime.resize(numberOfAtoms);
   inverseMasses.resize(numberOfAtoms);
+   oldx.resize(numberOfAtoms);
 }

-/**---------------------------------------------------------------------------------------
-
-   ReferenceVelocityVerletDynamics destructor
-
-   --------------------------------------------------------------------------------------- */
-
-ReferenceVelocityVerletDynamics::~ReferenceVelocityVerletDynamics() {
+ReferenceNoseHooverDynamics::~ReferenceNoseHooverDynamics() {
 }

-/**---------------------------------------------------------------------------------------
-
-   Update -- driver routine for performing Velocity Verlet dynamics update of coordinates
-   and velocities
-
-   @param system              the System to be integrated
-   @param atomCoordinates     atom coordinates
-   @param velocities          velocities
-   @param forces              forces
-   @param masses              atom masses
-   @param atomList            list of all atoms not involved in a Drude-like pair
-   @param pairList            list of all Drude-like pairs
-   @param maxPairDistance     the maximum separation of any Drude-like pairs
-
-   --------------------------------------------------------------------------------------- */
-
-void ReferenceVelocityVerletDynamics::update(OpenMM::ContextImpl &context, const OpenMM::System& system, vector<Vec3>& atomCoordinates,
+void ReferenceNoseHooverDynamics::step1(OpenMM::ContextImpl &context, const OpenMM::System& system, vector<Vec3>& atomCoordinates,
                                          vector<Vec3>& velocities,
                                          vector<Vec3>& forces, vector<double>& masses, double tolerance, bool &forcesAreValid,
                                          const std::vector<int> & atomList, const std::vector<std::tuple<int, int, double>> &pairList,
@@ -88,10 +57,8 @@ void ReferenceVelocityVerletDynamics::update(OpenMM::ContextImpl &context, const
    // first-time-through initialization
    if (!forcesAreValid) context.calcForcesAndEnergy(true, false);

-    int numberOfAtoms = system.getNumParticles();
    if (getTimeStep() == 0) {
       // invert masses
-
       for (int ii = 0; ii < numberOfAtoms; ii++) {
          if (masses[ii] == 0.0)
              inverseMasses[ii] = 0.0;
@@ -100,22 +67,20 @@ void ReferenceVelocityVerletDynamics::update(OpenMM::ContextImpl &context, const
       }
    }

-    //// Perform the integration.

+    const double halfdt = 0.5*getDeltaT();
    // Regular atoms
    for (const auto &atom : atomList) {
        if (masses[atom] != 0.0) {
-            velocities[atom] += 0.5 * inverseMasses[atom]*forces[atom]*getDeltaT();
-            xPrime[atom] = atomCoordinates[atom];
-            atomCoordinates[atom] += velocities[atom]*getDeltaT();
+            velocities[atom] += inverseMasses[atom]*forces[atom]*getDeltaT();
        }
    }
    // Connected particles
    for (const auto &pair : pairList) {
        const auto &atom1 = std::get<0>(pair);
        const auto &atom2 = std::get<1>(pair);
-        double m1 = system.getParticleMass(atom1);
-        double m2 = system.getParticleMass(atom2);
+        double m1 = masses[atom1];
+        double m2 = masses[atom2];
        double mass1fract = m1 / (m1 + m2);
        double mass2fract = m2 / (m1 + m2);
        double invRedMass = (m1 * m2 != 0.0) ? (m1 + m2)/(m1 * m2) : 0.0;
@@ -124,26 +89,52 @@ void ReferenceVelocityVerletDynamics::update(OpenMM::ContextImpl &context, const
        Vec3 relVel = velocities[atom2] - velocities[atom1];
        Vec3 comForce = forces[atom1] + forces[atom2];
        Vec3 relForce = mass1fract*forces[atom2] - mass2fract*forces[atom1];
-        comVel += 0.5 * comForce * getDeltaT() * invTotMass;
-        relVel += 0.5 * relForce * getDeltaT() * invRedMass; 
+        comVel += comForce * getDeltaT() * invTotMass;
+        relVel += relForce * getDeltaT() * invRedMass; 
        if (m1 != 0.0) {
            velocities[atom1] = comVel - relVel*mass2fract;
-            xPrime[atom1] = atomCoordinates[atom1];
-            atomCoordinates[atom1] += velocities[atom1]*getDeltaT();
        }
        if (m2 != 0.0) {
            velocities[atom2] = comVel + relVel*mass1fract;
-            xPrime[atom2] = atomCoordinates[atom2];
-            atomCoordinates[atom2] += velocities[atom2]*getDeltaT();
        }
    }

-    // 
+    ReferenceConstraintAlgorithm* referenceConstraintAlgorithm = getReferenceConstraintAlgorithm();
+    if (referenceConstraintAlgorithm) {
+        referenceConstraintAlgorithm->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
+    }
+
+    for (int atom = 0; atom < numberOfAtoms; ++atom) {
+        if (masses[atom] != 0.0) {
+            xPrime[atom] = atomCoordinates[atom] + velocities[atom]*halfdt;
+        }
+    }
+}
+
+
+void ReferenceNoseHooverDynamics::step2(OpenMM::ContextImpl &context, const OpenMM::System& system, vector<Vec3>& atomCoordinates,
+                                          vector<Vec3>& velocities,
+                                          vector<Vec3>& forces, vector<double>& masses, double tolerance, bool &forcesAreValid,
+                                          const std::vector<int> & atomList, const std::vector<std::tuple<int, int, double>> &pairList,
+                                          double maxPairDistance) {
+    const double halfdt = 0.5*getDeltaT();
+    for (int atom = 0; atom < numberOfAtoms; ++atom) {
+        if (masses[atom] != 0.0) {
+            xPrime[atom] += velocities[atom]*halfdt;
+            oldx[atom] = xPrime[atom];
+        }
+    }

    ReferenceConstraintAlgorithm* referenceConstraintAlgorithm = getReferenceConstraintAlgorithm();
    if (referenceConstraintAlgorithm)
-       referenceConstraintAlgorithm->apply(xPrime, atomCoordinates, inverseMasses, tolerance);
+        referenceConstraintAlgorithm->apply(atomCoordinates, xPrime, inverseMasses, tolerance);

+    for (int i = 0; i < numberOfAtoms; i++) {
+        if (inverseMasses[i] != 0.0) {
+            velocities[i] += (xPrime[i]-oldx[i])/getDeltaT();
+            atomCoordinates[i] = xPrime[i];
+        }
+    }

    // Apply hard wall constraints.
    if (maxPairDistance > 0) {
@@ -157,9 +148,6 @@ void ReferenceVelocityVerletDynamics::update(OpenMM::ContextImpl &context, const
            if (rInv*maxPairDistance < 1.0) {
                // The constraint has been violated, so make the inter-particle distance "bounce"
                // off the hard wall.
-                //if (rInv*maxPairDistance < 0.5)
-                //    throw OpenMMException("Drude particle moved too far beyond hard wall constraint");
-                //    TODO: Review this - I commented it out to make the NoseHooverThermostat test work
                Vec3 bondDir = delta*rInv;
                Vec3 vel1 = velocities[atom1];
                Vec3 vel2 = velocities[atom2];
@@ -213,50 +201,7 @@ void ReferenceVelocityVerletDynamics::update(OpenMM::ContextImpl &context, const
        }
    } /* end of hard wall constraint part */

-
-    ReferenceVirtualSites::computePositions(system, atomCoordinates);
-    context.calcForcesAndEnergy(true, false);
-    forcesAreValid = true;
-
-    for (int i = 0; i < numberOfAtoms; ++i) {
-        if (masses[i] != 0.0)
-            for (int j = 0; j < 3; ++j) {
-                xPrime[i][j] += velocities[i][j]*getDeltaT();
-        }
-   } 
-
-    // Update the positions and velocities.
-    // Regular atoms
-    for (const auto &atom : atomList) {
-        if (masses[atom] != 0.0) {
-            velocities[atom] += 0.5 * inverseMasses[atom]*forces[atom]*getDeltaT() + (atomCoordinates[atom] - xPrime[atom])/getDeltaT();
-        }
-    }
-    // Connected particles
-    for (const auto &pair : pairList) {
-        const auto &atom1 = std::get<0>(pair);
-        const auto &atom2 = std::get<1>(pair);
-        double m1 = system.getParticleMass(atom1);
-        double m2 = system.getParticleMass(atom2);
-        double mass1fract = m1 / (m1 + m2);
-        double mass2fract = m2 / (m1 + m2);
-        double invRedMass = (m1 * m2 != 0.0) ? (m1 + m2)/(m1 * m2) : 0.0;
-        double invTotMass = (m1 + m2 != 0.0) ? 1.0 /(m1 + m2) : 0.0;
-        Vec3 comVel = velocities[atom1]*mass1fract + velocities[atom2]*mass2fract;
-        Vec3 relVel = velocities[atom2] - velocities[atom1];
-        Vec3 comForce = forces[atom1] + forces[atom2];
-        Vec3 relForce = mass1fract*forces[atom2] - mass2fract*forces[atom1];
-        comVel += 0.5 * comForce * getDeltaT() * invTotMass;
-        relVel += 0.5 * relForce * getDeltaT() * invRedMass; 
-        if (m1 != 0.0) {
-            velocities[atom1] = comVel - relVel*mass2fract + (atomCoordinates[atom1] - xPrime[atom1])/getDeltaT();
-        }
-        if (m2 != 0.0) {
-            velocities[atom2] = comVel + relVel*mass1fract + (atomCoordinates[atom2] - xPrime[atom2])/getDeltaT();
-        }
-    }
-    if (referenceConstraintAlgorithm)
-       referenceConstraintAlgorithm->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
+    ReferenceVirtualSites::computePositions(context.getSystem(), atomCoordinates);

    incrementTimeStep();
 }
--- a/platforms/reference/tests/TestReferenceNoseHooverThermostat.cpp
+++ b/platforms/reference/tests/TestReferenceNoseHooverThermostat.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2019 Stanford University and the Authors.           *
- * Authors: Andreas Krämer and Andrew C. Simmonett                            *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#include "ReferenceTests.h"
-#include "TestNoseHooverThermostat.h"
-
-void runPlatformTests() {
-}
--- a/plugins/cpupme/src/CpuPmeKernels.cpp
+++ b/plugins/cpupme/src/CpuPmeKernels.cpp
@@ -66,96 +66,102 @@ static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gri
    float posInBox[4] = {0,0,0,0};
    memset(grid, 0, sizeof(float)*gridx*gridy*gridz);

-    int i = threadIndex;
+    const int groupSize = max(1, numParticles / (10 * numThreads));
+    int start = groupSize * threadIndex;
    while (true) {
        if (!deterministic)
-            i = atomicCounter++;
-        if (i >= numParticles)
+            start = atomicCounter.fetch_add(groupSize);
+
+        if (start >= numParticles)
            break;

-        // Find the position relative to the nearest grid point.
-
-        fvec4 pos(&posq[4*i]);
-        (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
-        fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
-        t = (t-floor(t))*gridSize;
-        ivec4 ti = t;
-        fvec4 dr = t-ti;
-        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
-        
-        // Compute the B-spline coefficients.
-
-        fvec4 data[PME_ORDER];
-        data[PME_ORDER-1] = 0.0f;
-        data[1] = dr;
-        data[0] = one-dr;
-        for (int j = 3; j < PME_ORDER; j++) {
-            fvec4 div(1.0f/(j-1));
-            data[j-1] = div*dr*data[j-2];
-            for (int k = 1; k < j-1; k++)
-                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
-            data[0] = div*(one-dr)*data[0];
-        }
-        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
-        for (int j = 1; j < (PME_ORDER-1); j++)
-            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
-        data[0] = scale*(one-dr)*data[0];
-        
-        // Spread the charges.
-        
-        int gridIndexX = gridIndex[0];
-        int gridIndexY = gridIndex[1];
-        int gridIndexZ = gridIndex[2];
-        if (gridIndexX < 0)
-            return; // This happens when a simulation blows up and coordinates become NaN.
-        int zindex[PME_ORDER];
-        for (int j = 0; j < PME_ORDER; j++) {
-            zindex[j] = gridIndexZ+j;
-            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
-        }
-        float charge = epsilonFactor*posq[4*i+3];
-        fvec4 zdata0to3(data[0][2], data[1][2], data[2][2], data[3][2]);
-        float zdata4 = data[4][2];
-        if (gridIndexZ+4 < gridz) {
-            for (int ix = 0; ix < PME_ORDER; ix++) {
-                int xbase = gridIndexX+ix;
-                xbase -= (xbase >= gridx ? gridx : 0);
-                xbase = xbase*gridy*gridz;
-                float xdata = charge*data[ix][0];
-                for (int iy = 0; iy < PME_ORDER; iy++) {
-                    int ybase = gridIndexY+iy;
-                    ybase -= (ybase >= gridy ? gridy : 0);
-                    ybase = xbase + ybase*gridz;
-                    float multiplier = xdata*data[iy][1];
-                    fvec4 add0to3 = zdata0to3*multiplier;
-                    (fvec4(&grid[ybase+gridIndexZ])+add0to3).store(&grid[ybase+gridIndexZ]);
-                    grid[ybase+zindex[4]] += multiplier*zdata4;
+        int end = min(start + groupSize, numParticles);
+        for (int i = start; i < end; ++i) {
+            // Find the position relative to the nearest grid point.
+
+            fvec4 pos(&posq[4*i]);
+            (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
+            fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
+            t = (t-floor(t))*gridSize;
+            ivec4 ti = t;
+            fvec4 dr = t-ti;
+            ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
+
+            // Compute the B-spline coefficients.
+
+            fvec4 data[PME_ORDER];
+            data[PME_ORDER-1] = 0.0f;
+            data[1] = dr;
+            data[0] = one-dr;
+            for (int j = 3; j < PME_ORDER; j++) {
+                fvec4 div(1.0f/(j-1));
+                data[j-1] = div*dr*data[j-2];
+                for (int k = 1; k < j-1; k++)
+                    data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
+                data[0] = div*(one-dr)*data[0];
+            }
+            data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
+            for (int j = 1; j < (PME_ORDER-1); j++)
+                data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
+            data[0] = scale*(one-dr)*data[0];
+
+            // Spread the charges.
+
+            int gridIndexX = gridIndex[0];
+            int gridIndexY = gridIndex[1];
+            int gridIndexZ = gridIndex[2];
+            if (gridIndexX < 0)
+                return; // This happens when a simulation blows up and coordinates become NaN.
+            int zindex[PME_ORDER];
+            for (int j = 0; j < PME_ORDER; j++) {
+                zindex[j] = gridIndexZ+j;
+                zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
+            }
+            float charge = epsilonFactor*posq[4*i+3];
+            fvec4 zdata0to3(data[0][2], data[1][2], data[2][2], data[3][2]);
+            float zdata4 = data[4][2];
+            if (gridIndexZ+4 < gridz) {
+                for (int ix = 0; ix < PME_ORDER; ix++) {
+                    int xbase = gridIndexX+ix;
+                    xbase -= (xbase >= gridx ? gridx : 0);
+                    xbase = xbase*gridy*gridz;
+                    float xdata = charge*data[ix][0];
+                    for (int iy = 0; iy < PME_ORDER; iy++) {
+                        int ybase = gridIndexY+iy;
+                        ybase -= (ybase >= gridy ? gridy : 0);
+                        ybase = xbase + ybase*gridz;
+                        float multiplier = xdata*data[iy][1];
+                        fvec4 add0to3 = zdata0to3*multiplier;
+                        (fvec4(&grid[ybase+gridIndexZ])+add0to3).store(&grid[ybase+gridIndexZ]);
+                        grid[ybase+zindex[4]] += multiplier*zdata4;
+                    }
                }
            }
-        }
-        else {
-            for (int ix = 0; ix < PME_ORDER; ix++) {
-                int xbase = gridIndexX+ix;
-                xbase -= (xbase >= gridx ? gridx : 0);
-                xbase = xbase*gridy*gridz;
-                float xdata = charge*data[ix][0];
-                for (int iy = 0; iy < PME_ORDER; iy++) {
-                    int ybase = gridIndexY+iy;
-                    ybase -= (ybase >= gridy ? gridy : 0);
-                    ybase = xbase + ybase*gridz;
-                    float multiplier = xdata*data[iy][1];
-                    fvec4 add0to3 = zdata0to3*multiplier;
-                    add0to3.store(temp);
-                    grid[ybase+zindex[0]] += temp[0];
-                    grid[ybase+zindex[1]] += temp[1];
-                    grid[ybase+zindex[2]] += temp[2];
-                    grid[ybase+zindex[3]] += temp[3];
-                    grid[ybase+zindex[4]] += multiplier*zdata4;
+            else {
+                for (int ix = 0; ix < PME_ORDER; ix++) {
+                    int xbase = gridIndexX+ix;
+                    xbase -= (xbase >= gridx ? gridx : 0);
+                    xbase = xbase*gridy*gridz;
+                    float xdata = charge*data[ix][0];
+                    for (int iy = 0; iy < PME_ORDER; iy++) {
+                        int ybase = gridIndexY+iy;
+                        ybase -= (ybase >= gridy ? gridy : 0);
+                        ybase = xbase + ybase*gridz;
+                        float multiplier = xdata*data[iy][1];
+                        fvec4 add0to3 = zdata0to3*multiplier;
+                        add0to3.store(temp);
+                        grid[ybase+zindex[0]] += temp[0];
+                        grid[ybase+zindex[1]] += temp[1];
+                        grid[ybase+zindex[2]] += temp[2];
+                        grid[ybase+zindex[3]] += temp[3];
+                        grid[ybase+zindex[4]] += multiplier*zdata4;
+                    }
                }
            }
        }
+
        if (deterministic)
-            i += numThreads;
+            start += groupSize * numThreads;
    }
 }

@@ -310,7 +316,7 @@ static void reciprocalConvolution(int start, int end, fftwf_complex* grid, vecto
    }
 }

-static void interpolateForces(float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, atomic<int>& atomicCounter, const float epsilonFactor) {
+static void interpolateForces(float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, atomic<int>& atomicCounter, const float epsilonFactor, int numThreads) {
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
@@ -320,88 +326,94 @@ static void interpolateForces(float* posq, float* force, float* grid, int gridx,
    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
+
+    const int groupSize = max(1, numParticles / (10 * numThreads));
    while (true) {
-        int i = atomicCounter++;
-        if (i >= numParticles)
+        int start = atomicCounter.fetch_add(groupSize);
+        if (start >= numParticles)
            break;

-        // Find the position relative to the nearest grid point.
-        
-        fvec4 pos(&posq[4*i]);
-        float posInBox[4];
-        (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
-        fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
-        t = (t-floor(t))*gridSize;
-        ivec4 ti = t;
-        fvec4 dr = t-ti;
-        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
-        
-        // Compute the B-spline coefficients.
-        
-        fvec4 data[PME_ORDER];
-        fvec4 ddata[PME_ORDER];
-        data[PME_ORDER-1] = 0.0f;
-        data[1] = dr;
-        data[0] = one-dr;
-        for (int j = 3; j < PME_ORDER; j++) {
-            fvec4 div(1.0f/(j-1));
-            data[j-1] = div*dr*data[j-2];
-            for (int k = 1; k < j-1; k++)
-                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
-            data[0] = div*(one-dr)*data[0];
-        }
-        ddata[0] = -data[0];
-        for (int j = 1; j < PME_ORDER; j++)
-            ddata[j] = data[j-1]-data[j];
-        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
-        for (int j = 1; j < (PME_ORDER-1); j++)
-            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
-        data[0] = scale*(one-dr)*data[0];
-                
-        // Compute the force on this atom.
-        
-        int gridIndexX = gridIndex[0];
-        int gridIndexY = gridIndex[1];
-        int gridIndexZ = gridIndex[2];
-        if (gridIndexX < 0)
-            return; // This happens when a simulation blows up and coordinates become NaN.
-        int zindex[PME_ORDER];
-        for (int j = 0; j < PME_ORDER; j++) {
-            zindex[j] = gridIndexZ+j;
-            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
-        }
-        fvec4 zdata[PME_ORDER];
-        for (int j = 0; j < PME_ORDER; j++)
-            zdata[j] = fvec4(data[j][2], data[j][2], ddata[j][2], 0);
-        fvec4 f = 0.0f;
-        for (int ix = 0; ix < PME_ORDER; ix++) {
-            int xbase = gridIndexX+ix;
-            xbase -= (xbase >= gridx ? gridx : 0);
-            xbase = xbase*gridy*gridz;
-            float dx = data[ix][0];
-            float ddx = ddata[ix][0];
-            fvec4 xdata(ddx, dx, dx, 0);
-
-            for (int iy = 0; iy < PME_ORDER; iy++) {
-                int ybase = gridIndexY+iy;
-                ybase -= (ybase >= gridy ? gridy : 0);
-                ybase = xbase + ybase*gridz;
-                float dy = data[iy][1];
-                float ddy = ddata[iy][1];
-                fvec4 xydata = xdata*fvec4(dy, ddy, dy, 0);
-
-                for (int iz = 0; iz < PME_ORDER; iz++) {
-                    fvec4 gridValue(grid[ybase+zindex[iz]]);
-                    f = f+xydata*zdata[iz]*gridValue;
+        int end = min(start + groupSize, numParticles);
+
+        for (int i = start; i < end; i++) {
+            // Find the position relative to the nearest grid point.
+
+            fvec4 pos(&posq[4*i]);
+            float posInBox[4];
+            (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
+            fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
+            t = (t-floor(t))*gridSize;
+            ivec4 ti = t;
+            fvec4 dr = t-ti;
+            ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
+
+            // Compute the B-spline coefficients.
+
+            fvec4 data[PME_ORDER];
+            fvec4 ddata[PME_ORDER];
+            data[PME_ORDER-1] = 0.0f;
+            data[1] = dr;
+            data[0] = one-dr;
+            for (int j = 3; j < PME_ORDER; j++) {
+                fvec4 div(1.0f/(j-1));
+                data[j-1] = div*dr*data[j-2];
+                for (int k = 1; k < j-1; k++)
+                    data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
+                data[0] = div*(one-dr)*data[0];
+            }
+            ddata[0] = -data[0];
+            for (int j = 1; j < PME_ORDER; j++)
+                ddata[j] = data[j-1]-data[j];
+            data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
+            for (int j = 1; j < (PME_ORDER-1); j++)
+                data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
+            data[0] = scale*(one-dr)*data[0];
+
+            // Compute the force on this atom.
+
+            int gridIndexX = gridIndex[0];
+            int gridIndexY = gridIndex[1];
+            int gridIndexZ = gridIndex[2];
+            if (gridIndexX < 0)
+                return; // This happens when a simulation blows up and coordinates become NaN.
+            int zindex[PME_ORDER];
+            for (int j = 0; j < PME_ORDER; j++) {
+                zindex[j] = gridIndexZ+j;
+                zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
+            }
+            fvec4 zdata[PME_ORDER];
+            for (int j = 0; j < PME_ORDER; j++)
+                zdata[j] = fvec4(data[j][2], data[j][2], ddata[j][2], 0);
+            fvec4 f = 0.0f;
+            for (int ix = 0; ix < PME_ORDER; ix++) {
+                int xbase = gridIndexX+ix;
+                xbase -= (xbase >= gridx ? gridx : 0);
+                xbase = xbase*gridy*gridz;
+                float dx = data[ix][0];
+                float ddx = ddata[ix][0];
+                fvec4 xdata(ddx, dx, dx, 0);
+
+                for (int iy = 0; iy < PME_ORDER; iy++) {
+                    int ybase = gridIndexY+iy;
+                    ybase -= (ybase >= gridy ? gridy : 0);
+                    ybase = xbase + ybase*gridz;
+                    float dy = data[iy][1];
+                    float ddy = ddata[iy][1];
+                    fvec4 xydata = xdata*fvec4(dy, ddy, dy, 0);
+
+                    for (int iz = 0; iz < PME_ORDER; iz++) {
+                        fvec4 gridValue(grid[ybase+zindex[iz]]);
+                        f = f+xydata*zdata[iz]*gridValue;
+                    }
                }
            }
+            f *= -epsilonFactor*posq[4*i+3];
+            float fc[4];
+            f.store(fc);
+            force[4*i+0] = fc[0]*gridx*(float)recipBoxVectors[0][0];
+            force[4*i+1] = fc[0]*gridx*(float)recipBoxVectors[1][0]+fc[1]*gridy*(float)recipBoxVectors[1][1];
+            force[4*i+2] = fc[0]*gridx*(float)recipBoxVectors[2][0]+fc[1]*gridy*(float)recipBoxVectors[2][1]+fc[2]*gridz*(float)recipBoxVectors[2][2];
        }
-        f *= -epsilonFactor*posq[4*i+3];
-        float fc[4];
-        f.store(fc);
-        force[4*i+0] = fc[0]*gridx*(float)recipBoxVectors[0][0];
-        force[4*i+1] = fc[0]*gridx*(float)recipBoxVectors[1][0]+fc[1]*gridy*(float)recipBoxVectors[1][1];
-        force[4*i+2] = fc[0]*gridx*(float)recipBoxVectors[2][0]+fc[1]*gridy*(float)recipBoxVectors[2][1]+fc[2]*gridz*(float)recipBoxVectors[2][2];
    }
 }

@@ -606,7 +618,7 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i
    }
    reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
    threads.syncThreads();
-    interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
+    interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor, numThreads);
 }

 void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
@@ -900,7 +912,7 @@ void CpuCalcDispersionPmeReciprocalForceKernel::runWorkerThread(ThreadPool& thre
    complexStart = (index*complexSize)/numThreads;
    reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
    threads.syncThreads();
-    interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
+    interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor, numThreads);
 }

 void CpuCalcDispersionPmeReciprocalForceKernel::beginComputation(CalcPmeReciprocalForceKernel::IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {

--- a/plugins/drude/openmmapi/include/openmm/DrudeNoseHooverIntegrator.h
+++ b/plugins/drude/openmmapi/include/openmm/DrudeNoseHooverIntegrator.h
@@ -66,7 +66,7 @@ public:
     */
    DrudeNoseHooverIntegrator(double temperature, double collisionFrequency, 
                              double drudeTemperature, double drudeCollisionFrequency, double stepSize, 
-                              int chainLength = 3, int numMTS = 3, int numYoshidaSuzuki = 3);
+                              int chainLength = 3, int numMTS = 3, int numYoshidaSuzuki = 7);

    virtual ~DrudeNoseHooverIntegrator();
    /**

--- a/plugins/drude/openmmapi/src/DrudeNoseHooverIntegrator.cpp
+++ b/plugins/drude/openmmapi/src/DrudeNoseHooverIntegrator.cpp
@@ -82,10 +82,8 @@ void DrudeNoseHooverIntegrator::initialize(ContextImpl& contextRef) {

    context = &contextRef;
    owner = &contextRef.getOwner();
-    vvKernel = context->getPlatform().createKernel(IntegrateVelocityVerletStepKernel::Name(), contextRef);
-    vvKernel.getAs<IntegrateVelocityVerletStepKernel>().initialize(contextRef.getSystem(), *this);
-    nhcKernel = context->getPlatform().createKernel(NoseHooverChainKernel::Name(), contextRef);
-    nhcKernel.getAs<NoseHooverChainKernel>().initialize();
+    kernel = context->getPlatform().createKernel(IntegrateNoseHooverStepKernel::Name(), contextRef);
+    kernel.getAs<IntegrateNoseHooverStepKernel>().initialize(contextRef.getSystem(), *this);
    forcesAreValid = false;

    // check for drude particles and build the Nose-Hoover Chains
@@ -138,14 +136,14 @@ double DrudeNoseHooverIntegrator::computeDrudeKineticEnergy() {
    double kE = 0.0;
    for (const auto &nhc: noseHooverChains){
        if (nhc.getThermostatedPairs().size() != 0) {
-            kE += nhcKernel.getAs<NoseHooverChainKernel>().computeMaskedKineticEnergy(*context, nhc, true).second;
+            kE += kernel.getAs<IntegrateNoseHooverStepKernel>().computeMaskedKineticEnergy(*context, nhc, true).second;
        }
    }
    return kE;
 }

 double DrudeNoseHooverIntegrator::computeTotalKineticEnergy() {
-    return vvKernel.getAs<IntegrateVelocityVerletStepKernel>().computeKineticEnergy(*context, *this);
+    return kernel.getAs<IntegrateNoseHooverStepKernel>().computeKineticEnergy(*context, *this);
 }

 std::vector<Vec3> DrudeNoseHooverIntegrator::getVelocitiesForTemperature(const System &system, double temperature,

--- a/plugins/drude/serialization/src/DrudeNoseHooverIntegratorProxy.cpp
+++ b/plugins/drude/serialization/src/DrudeNoseHooverIntegratorProxy.cpp
@@ -49,7 +49,7 @@ void DrudeNoseHooverIntegratorProxy::serialize(const void* object, Serialization
    node.setDoubleProperty("stepSize", integrator.getStepSize());
    node.setDoubleProperty("constraintTolerance", integrator.getConstraintTolerance());
    node.setDoubleProperty("maximumPairDistance", integrator.getMaximumPairDistance());
-    assert(not integrator.hasSubsystemThermostats());
+    assert(!integrator.hasSubsystemThermostats());
    node.setDoubleProperty("temperature", integrator.getTemperature());
    node.setDoubleProperty("relativeTemperature", integrator.getRelativeTemperature());
    node.setDoubleProperty("collisionFrequency", integrator.getCollisionFrequency());

--- a/plugins/drude/tests/TestDrudeNoseHoover.h
+++ b/plugins/drude/tests/TestDrudeNoseHoover.h
@@ -141,6 +141,15 @@ void testWaterBox() {
    // Equilibrate
    integ.step(1500);

+    double TOL = 1.5;
+    try {
+        if (platform.getPropertyValue(context, "Precision") != "double") {
+            TOL = 2.0;
+        }
+    } catch(OpenMMException) {
+        // The defaults above are for double precision, which is assumed in this case
+    }
+
    // Compute the internal and center of mass temperatures.
    double totalKE = 0;
    const int numSteps = 500;
@@ -162,7 +171,7 @@ void testWaterBox() {
        double conserved = PE + fullKE + heatBathEnergy;
        meanConserved = (i*meanConserved + conserved)/(i+1);
        totalKE += KE;
-        ASSERT(fabs(meanConserved - conserved) < 0.3);
+        ASSERT(fabs(meanConserved - conserved) < TOL);
    }
    totalKE /= numSteps;
    ASSERT_USUALLY_EQUAL_TOL(temperature, meanTemp,  0.03);

--- a/tests/TestCompoundIntegrator.h
+++ b/tests/TestCompoundIntegrator.h
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2015-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -33,6 +33,7 @@
 #include "openmm/BrownianIntegrator.h"
 #include "openmm/CompoundIntegrator.h"
 #include "openmm/Context.h"
+#include "openmm/CustomIntegrator.h"
 #include "openmm/HarmonicBondForce.h"
 #include "openmm/LangevinIntegrator.h"
 #include "openmm/System.h"
@@ -208,6 +209,36 @@ void testDifferentStepSizes() {
    }
 }

+void testCheckpoint() {
+    // Test that member integrators get loaded correctly from checkpoints.
+    System system;
+    system.addParticle(1.0);
+    CustomIntegrator* custom = new CustomIntegrator(0.001);
+    custom->addGlobalVariable("a", 1.0);
+    custom->addPerDofVariable("b", 2.0);
+    CompoundIntegrator integrator;
+    integrator.addIntegrator(custom);
+    integrator.addIntegrator(new VerletIntegrator(0.005));
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(1, Vec3());
+    context.setPositions(positions);
+    custom->setGlobalVariable(0, 5.0);
+    vector<Vec3> b1(1, Vec3(1, 2, 3));
+    custom->setPerDofVariable(0, b1);
+    stringstream checkpoint; 
+    context.createCheckpoint(checkpoint);
+    custom->setGlobalVariable(0, 10.0);
+    vector<Vec3> b2(1, Vec3(4, 5, 6));
+    custom->setPerDofVariable(0, b2);
+    integrator.setCurrentIntegrator(1);
+    context.loadCheckpoint(checkpoint);
+    ASSERT_EQUAL(0, integrator.getCurrentIntegrator());
+    ASSERT_EQUAL(5.0, custom->getGlobalVariable(0));
+    vector<Vec3> b3;
+    custom->getPerDofVariable(0, b3);
+    ASSERT_EQUAL_VEC(b1[0], b3[0], 1e-6);
+}
+
 void runPlatformTests();

 int main(int argc, char* argv[]) {
@@ -216,6 +247,7 @@ int main(int argc, char* argv[]) {
        testChangingIntegrator();
        testChangingParameters();
        testDifferentStepSizes();
+        testCheckpoint();
        runPlatformTests();
    }
    catch(const exception& e) {

--- a/tests/TestCustomIntegrator.h
+++ b/tests/TestCustomIntegrator.h
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -48,6 +48,7 @@
 #include <algorithm>
 #include <cmath>
 #include <iostream>
+#include <sstream>
 #include <vector>

 using namespace OpenMM;
@@ -1156,6 +1157,31 @@ void testInitialTemperature() {
    ASSERT_USUALLY_EQUAL_TOL(targetTemperature, temperature, 0.01);
 }

+void testCheckpoint() {
+    // Test that integrator variables get loaded correctly from checkpoints.
+    System system;
+    system.addParticle(1.0);
+    CustomIntegrator integrator(0.001);
+    integrator.addGlobalVariable("a", 1.0);
+    integrator.addPerDofVariable("b", 2.0);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(1, Vec3());
+    context.setPositions(positions);
+    integrator.setGlobalVariable(0, 5.0);
+    vector<Vec3> b1(1, Vec3(1, 2, 3));
+    integrator.setPerDofVariable(0, b1);
+    stringstream checkpoint; 
+    context.createCheckpoint(checkpoint);
+    integrator.setGlobalVariable(0, 10.0);
+    vector<Vec3> b2(1, Vec3(4, 5, 6));
+    integrator.setPerDofVariable(0, b2);
+    context.loadCheckpoint(checkpoint);
+    ASSERT_EQUAL(5.0, integrator.getGlobalVariable(0));
+    vector<Vec3> b3;
+    integrator.getPerDofVariable(0, b3);
+    ASSERT_EQUAL_VEC(b1[0], b3[0], 1e-6);
+}
+
 void runPlatformTests();

 int main(int argc, char* argv[]) {
@@ -1184,6 +1210,7 @@ int main(int argc, char* argv[]) {
        testVectorFunctions();
        testRecordEnergy();
        testInitialTemperature();
+        testCheckpoint();
        runPlatformTests();
    }
    catch(const exception& e) {