Implement NHC for OpenCL platform

98f73120 · Andy Simmonett · 80e1b66f · 98f73120 · 98f73120 · 98f73120
Unverified Commit 98f73120 authored Oct 25, 2019 by Andy Simmonett
13 changed files
--- a/platforms/opencl/include/OpenCLIntegrationUtilities.h
+++ b/platforms/opencl/include/OpenCLIntegrationUtilities.h
@@ -31,6 +31,7 @@
 #include "OpenCLContext.h"
 #include "windowsExportOpenCL.h"
 #include <iosfwd>
+#include <map>
 namespace OpenMM {
@@ -119,6 +120,12 @@ public:
     * @param timeShift   the amount by which to shift the velocities in time
     */
    double computeKineticEnergy(double timeShift);
+    /**
+     * Get the data structure that holds the state of all Nose-Hoover chains
+     * 
+     * @return  vector of chain states
+     */
+    std::map<int, OpenCLArray>& getNoseHooverChainState();
 private:
    void applyConstraints(bool constrainVelocities, double tol);
    OpenCLContext& context;
@@ -166,6 +173,7 @@ private:
    mm_double2 lastStepSize;
    struct ShakeCluster;
    struct ConstraintOrderer;
+    std::map<int, OpenCLArray> noseHooverChainState;
 };
 } // namespace OpenMM

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -36,6 +36,7 @@
 #include "openmm/kernels.h"
 #include "openmm/internal/CompiledExpressionSet.h"
 #include "openmm/internal/CustomIntegratorUtilities.h"
+#include "openmm/internal/NoseHooverChain.h"
 #include "lepton/CompiledExpression.h"
 #include "lepton/ExpressionProgram.h"
 #include "openmm/System.h"
@@ -1346,6 +1347,43 @@ private:
    cl::Kernel kernel1, kernel2;
 };
+/*
+ * This kernel is invoked by NoseHooverIntegrator to take one time step.
+ */
+class OpenCLIntegrateVelocityVerletStepKernel : public IntegrateVelocityVerletStepKernel {
+public:
+    OpenCLIntegrateVelocityVerletStepKernel(std::string name, const Platform& platform, OpenCLContext& cl) :
+                                  IntegrateVelocityVerletStepKernel(name, platform), cl(cl) { }
+    ~OpenCLIntegrateVelocityVerletStepKernel() {}
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param integrator the NoseHooverIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const NoseHooverIntegrator& integrator);
+    /**
+     * Execute the kernel.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the VerletIntegrator this kernel is being used for
+     * @param forcesAreValid a reference to the parent integrator's boolean for keeping
+     *                       track of the validity of the current forces.
+     */
+    void execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid);
+    /**
+     * Compute the kinetic energy.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the NoseHooverIntegrator this kernel is being used for
+     */
+    double computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator);
+private:
+    OpenCLContext& cl;
+    cl::Kernel kernel1, kernel2, kernel3;
+};
 /**
 * This kernel is invoked by LangevinIntegrator to take one time step.
 */
@@ -1667,6 +1705,72 @@ private:
    cl::Kernel kernel;
 };
+/**
+ * This kernel is invoked by NoseHooverChain at the start of each time step to adjust the thermostat
+ * and update the associated particle velocities.
+ */
+class OpenCLNoseHooverChainKernel : public NoseHooverChainKernel {
+public:
+    OpenCLNoseHooverChainKernel(std::string name, const Platform& platform, OpenCLContext& cl) : NoseHooverChainKernel(name, platform), cl(cl) {
+    }
+    ~OpenCLNoseHooverChainKernel() {}
+    /**
+     * Initialize the kernel.
+     */
+    virtual void initialize();
+    /**
+     * Execute the kernel that propagates the Nose Hoover chain and determines the velocity scale factor.
+     * 
+     * @param context  the context in which to execute this kernel
+     * @param noseHooverChain the object describing the chain to be propagated.
+     * @param kineticEnergies the {absolute, relative} kineticEnergy of the particles being thermostated by this chain.
+     * @param timeStep the time step used by the integrator.
+     * @return the {absolute, relative} velocity scale factor to apply to the particles associated with this heat bath.
+     */
+    virtual std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep);
+    /**
+     * Execute the kernal that computes the total (kinetic + potential) heat bath energy.
+     *
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose energy is to be determined.
+     * @return the total heat bath energy.
+     */
+    virtual double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc);
+    /**
+     * Execute the kernel that computes the kinetic energy for a subset of atoms,
+     * or the relative kinetic energy of Drude particles with respect to their parent atoms
+     *
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose energy is to be determined.
+     * @param downloadValue whether the computed value should be downloaded and returned.
+     *
+     */
+     virtual std::pair<double,double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);
+    /**
+     * Execute the kernel that scales the velocities of particles associated with a nose hoover chain
+     *
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose energy is to be determined.
+     * @param scaleFactors the {absolute, relative} multiplicative factor by which velocities are scaled.
+     */
+    virtual void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactors);
+private:
+    int sumWorkGroupSize;
+    OpenCLContext& cl;
+    OpenCLArray scaleFactorBuffer, kineticEnergyBuffer, chainMasses, chainForces, heatBathEnergy;
+    std::map<int, OpenCLArray> atomlists, pairlists;
+    std::map<int, cl::Kernel> propagateKernels;
+    cl::Kernel reduceEnergyKernel;
+    cl::Kernel computeHeatBathEnergyKernel;
+    cl::Kernel computeAtomsKineticEnergyKernel;
+    cl::Kernel computePairsKineticEnergyKernel;
+    cl::Kernel scaleAtomsVelocitiesKernel;
+    cl::Kernel scalePairsVelocitiesKernel;
+};
 /**
 * This kernel is invoked by MonteCarloBarostat to adjust the periodic box volume
 */

--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
@@ -848,6 +848,24 @@ int OpenCLIntegrationUtilities::prepareRandomNumbers(int numValues) {
 }
 void OpenCLIntegrationUtilities::createCheckpoint(ostream& stream) {
+    size_t numChains = noseHooverChainState.size();
+    bool useDouble = context.getUseDoublePrecision() || context.getUseMixedPrecision();
+    stream.write((char*) &numChains, sizeof(size_t));
+    for (auto &chainState: noseHooverChainState){
+        int chainID = chainState.first;
+        size_t chainLength = chainState.second.getSize();
+        stream.write((char*) &chainID, sizeof(int));
+        stream.write((char*) &chainLength, sizeof(size_t));
+        if (useDouble) {
+            vector<mm_double2> stateVec;
+            chainState.second.download(stateVec);
+            stream.write((char*) stateVec.data(), sizeof(mm_double2)*chainLength);
+        } else {
+            vector<mm_float2> stateVec;
+            chainState.second.download(stateVec);
+            stream.write((char*) stateVec.data(), sizeof(mm_float2)*chainLength);
+        }
+    }
    if (!random.isInitialized()) 
        return;
    stream.write((char*) &randomPos, sizeof(int));
@@ -860,6 +878,28 @@ void OpenCLIntegrationUtilities::createCheckpoint(ostream& stream) {
 }
 void OpenCLIntegrationUtilities::loadCheckpoint(istream& stream) {
+    size_t numChains, chainLength;
+    bool useDouble = context.getUseDoublePrecision() || context.getUseMixedPrecision();
+    stream.read((char*) &numChains, sizeof(size_t));
+    noseHooverChainState.clear();
+    for (size_t i=0; i<numChains; i++){
+        int chainID;
+        stream.read((char*) &chainID, sizeof(int));
+        stream.read((char*) &chainLength, sizeof(size_t));
+        if (useDouble) {
+            noseHooverChainState[chainID] = OpenCLArray();
+            noseHooverChainState[chainID].initialize<mm_double2>(context, chainLength, "chainState" + std::to_string(chainID));
+            std::vector<mm_double2> stateVec(chainLength);
+            stream.read((char*) &stateVec[0], sizeof(mm_double2)*chainLength);
+            noseHooverChainState[chainID].upload(stateVec);
+        } else {
+            noseHooverChainState[chainID] = OpenCLArray();
+            noseHooverChainState[chainID].initialize<mm_float2>(context, chainLength, "chainState" + std::to_string(chainID));
+            std::vector<mm_float2> stateVec(chainLength);
+            stream.read((char*) &stateVec[0], sizeof(mm_float2)*chainLength);
+            noseHooverChainState[chainID].upload(stateVec);
+        }
+    }
    if (!random.isInitialized()) 
        return; 
    stream.read((char*) &randomPos, sizeof(int));
@@ -918,3 +958,7 @@ double OpenCLIntegrationUtilities::computeKineticEnergy(double timeShift) {
        posDelta.copyTo(context.getVelm());
    return 0.5*energy;
 }
+std::map<int, OpenCLArray>& OpenCLIntegrationUtilities::getNoseHooverChainState(){
+    return noseHooverChainState;
+};
--- a/platforms/opencl/src/OpenCLKernelFactory.cpp
+++ b/platforms/opencl/src/OpenCLKernelFactory.cpp
@@ -128,6 +128,10 @@ KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platfo
        return new OpenCLIntegrateCustomStepKernel(name, platform, cl);
    if (name == ApplyAndersenThermostatKernel::Name())
        return new OpenCLApplyAndersenThermostatKernel(name, platform, cl);
+    if (name == NoseHooverChainKernel::Name())
+        return new OpenCLNoseHooverChainKernel(name, platform, cl);
+    if (name == IntegrateVelocityVerletStepKernel::Name())
+        return new OpenCLIntegrateVelocityVerletStepKernel(name, platform, cl);
    if (name == ApplyMonteCarloBarostatKernel::Name())
        return new OpenCLApplyMonteCarloBarostatKernel(name, platform, cl);
    if (name == RemoveCMMotionKernel::Name())

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -53,6 +53,7 @@
 #include "SimTKOpenMMUtilities.h"
 #include "jama_eig.h"
 #include <algorithm>
+#include <assert.h>
 #include <cmath>
 #include <iterator>
 #include <set>
@@ -413,7 +414,7 @@ void OpenCLUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, co
 }
 void OpenCLUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
-    int version = 2;
+    int version = 3;
    stream.write((char*) &version, sizeof(int));
    int precision = (cl.getUseDoublePrecision() ? 2 : cl.getUseMixedPrecision() ? 1 : 0);
    stream.write((char*) &precision, sizeof(int));
@@ -444,7 +445,7 @@ void OpenCLUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream
 void OpenCLUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& stream) {
    int version;
    stream.read((char*) &version, sizeof(int));
-    if (version != 2)
+    if (version != 3)
        throw OpenMMException("Checkpoint was created with a different version of OpenMM");
    int precision;
    stream.read((char*) &precision, sizeof(int));
@@ -7391,6 +7392,79 @@ double OpenCLIntegrateVerletStepKernel::computeKineticEnergy(ContextImpl& contex
    return cl.getIntegrationUtilities().computeKineticEnergy(0.5*integrator.getStepSize());
 }
+void OpenCLIntegrateVelocityVerletStepKernel::initialize(const System& system, const NoseHooverIntegrator& integrator) {
+    cl.getPlatformData().initializeContexts(system);
+    map<string, string> defines;
+    cl::Program program = cl.createProgram(OpenCLKernelSources::velocityVerlet, defines, "");
+    kernel1 = cl::Kernel(program, "integrateVelocityVerletPart1");
+    kernel2 = cl::Kernel(program, "integrateVelocityVerletPart2");
+    kernel3 = cl::Kernel(program, "integrateVelocityVerletPart3");
+}
+void OpenCLIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid) {
+    OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
+    int numAtoms = cl.getNumAtoms();
+    int paddedNumAtoms = cl.getPaddedNumAtoms();
+    double dt = integrator.getStepSize();
+    cl.getIntegrationUtilities().setNextStepSize(dt);
+    if( !forcesAreValid ) context.calcForcesAndEnergy(true, false);
+    //// Call the first integration kernel.
+    kernel1.setArg<cl_int>(0, numAtoms);
+    kernel1.setArg<cl_int>(1, paddedNumAtoms);
+    kernel1.setArg<cl::Buffer>(2, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
+    kernel1.setArg<cl::Buffer>(3, cl.getPosq().getDeviceBuffer());
+    setPosqCorrectionArg(cl, kernel1, 4);
+    kernel1.setArg<cl::Buffer>(5, cl.getVelm().getDeviceBuffer());
+    kernel1.setArg<cl::Buffer>(6, cl.getForce().getDeviceBuffer());
+    kernel1.setArg<cl::Buffer>(7, integration.getPosDelta().getDeviceBuffer());
+    cl.executeKernel(kernel1, numAtoms);
+    //// Apply constraints.
+    integration.applyConstraints(integrator.getConstraintTolerance());
+    //// Call the second integration kernel.
+    kernel2.setArg<cl_int>(0, numAtoms);
+    kernel2.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
+    kernel2.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
+    setPosqCorrectionArg(cl, kernel2, 3);
+    kernel2.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
+    kernel2.setArg<cl::Buffer>(5, integration.getPosDelta().getDeviceBuffer());
+    cl.executeKernel(kernel2, numAtoms);
+    integration.computeVirtualSites();
+    //// Update forces
+    context.calcForcesAndEnergy(true, false);
+    forcesAreValid = true;
+    //// Call the third integration kernel.
+    kernel3.setArg<cl_int>(0, numAtoms);
+    kernel3.setArg<cl_int>(1, paddedNumAtoms);
+    kernel3.setArg<cl::Buffer>(2, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
+    kernel3.setArg<cl::Buffer>(3, cl.getPosq().getDeviceBuffer());
+    setPosqCorrectionArg(cl, kernel3, 4);
+    kernel3.setArg<cl::Buffer>(5, cl.getVelm().getDeviceBuffer());
+    kernel3.setArg<cl::Buffer>(6, cl.getForce().getDeviceBuffer());
+    kernel3.setArg<cl::Buffer>(7, integration.getPosDelta().getDeviceBuffer());
+    cl.executeKernel(kernel3, numAtoms);
+    integration.applyVelocityConstraints(integrator.getConstraintTolerance());
+    //// Update the time and step count.
+    cl.setTime(cl.getTime()+dt);
+    cl.setStepCount(cl.getStepCount()+1);
+    cl.reorderAtoms();
+}
+double OpenCLIntegrateVelocityVerletStepKernel::computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator) {
+    return cl.getIntegrationUtilities().computeKineticEnergy(0);
+}
 void OpenCLIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
    cl.getPlatformData().initializeContexts(system);
    cl.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
@@ -8682,6 +8756,348 @@ void OpenCLApplyAndersenThermostatKernel::execute(ContextImpl& context) {
    kernel.setArg<cl_uint>(5, cl.getIntegrationUtilities().prepareRandomNumbers(cl.getPaddedNumAtoms()));
    cl.executeKernel(kernel, cl.getNumAtoms());
 }
+void OpenCLNoseHooverChainKernel::initialize() {
+    bool useDouble = cl.getUseDoublePrecision() || cl.getUseMixedPrecision();
+    map<string, string> defines;
+    defines["BEGIN_YS_LOOP"] = "const real arr[1] = {1.0}; for(int i=0;i<1;++i) { const real ys = arr[i];";
+    defines["END_YS_LOOP"] = "}";
+    cl::Program program = cl.createProgram(OpenCLKernelSources::noseHooverChain, defines);
+    propagateKernels[1] = cl::Kernel(program, "propagateNoseHooverChain");
+    defines["BEGIN_YS_LOOP"] = "const real arr[3] = {0.828981543588751, -0.657963087177502, 0.828981543588751}; for(int i=0;i<3;++i) { const real ys = arr[i];";
+    program = cl.createProgram(OpenCLKernelSources::noseHooverChain, defines);
+    propagateKernels[3] = cl::Kernel(program, "propagateNoseHooverChain");
+    defines["BEGIN_YS_LOOP"] = "const real arr[5] = {0.2967324292201065, 0.2967324292201065, -0.186929716880426, 0.2967324292201065, 0.2967324292201065}; for(int i=0;i<5;++i) { const real ys = arr[i];";
+    program = cl.createProgram(OpenCLKernelSources::noseHooverChain, defines);
+    propagateKernels[5] = cl::Kernel(program, "propagateNoseHooverChain");
+    program = cl.createProgram(OpenCLKernelSources::noseHooverChain, defines);
+    reduceEnergyKernel = cl::Kernel(program, "reduceEnergyPair");
+    computeHeatBathEnergyKernel = cl::Kernel(program, "computeHeatBathEnergy");
+    computeAtomsKineticEnergyKernel = cl::Kernel(program, "computeAtomsKineticEnergy");
+    computePairsKineticEnergyKernel = cl::Kernel(program, "computePairsKineticEnergy");
+    scaleAtomsVelocitiesKernel = cl::Kernel(program, "scaleAtomsVelocities");
+    scalePairsVelocitiesKernel = cl::Kernel(program, "scalePairsVelocities");
+}
+std::pair<double, double> OpenCLNoseHooverChainKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep) {
+    bool useDouble = cl.getUseDoublePrecision() || cl.getUseMixedPrecision();
+    int chainID = nhc.getDefaultChainID();
+    int nAtoms = nhc.getThermostatedAtoms().size();
+    int nPairs = nhc.getThermostatedPairs().size();
+    int chainLength = nhc.getDefaultChainLength();
+    int numYS = nhc.getDefaultNumYoshidaSuzukiTimeSteps();
+    int numMTS = nhc.getDefaultNumMultiTimeSteps();
+    int numDOFs = nhc.getDefaultNumDegreesOfFreedom();
+    double temperature = nhc.getDefaultTemperature();
+    double frequency = nhc.getDefaultCollisionFrequency();
+    double relativeTemperature = nhc.getDefaultRelativeTemperature();
+    double relativeFrequency = nhc.getDefaultRelativeCollisionFrequency();
+    if (numYS != 1 && numYS != 3 && numYS != 5) {
+        throw OpenMMException("Number of Yoshida Suzuki time steps has to be 1, 3, or 5.");
+    }
+    auto & chainState = cl.getIntegrationUtilities().getNoseHooverChainState();
+    if (!scaleFactorBuffer.isInitialized() ||scaleFactorBuffer.getSize() == 0) {
+        if(useDouble){
+            std::vector<mm_double2> zeros{{0,0}};
+            scaleFactorBuffer.initialize<mm_double2>(cl, 1, "scaleFactorBuffer");
+            scaleFactorBuffer.upload(zeros);
+        } else {
+            std::vector<mm_float2> zeros{{0,0}};
+            scaleFactorBuffer.initialize<mm_float2>(cl, 1, "scaleFactorBuffer");
+            scaleFactorBuffer.upload(zeros);
+        }
+    }
+    if (!chainForces.isInitialized() || !chainMasses.isInitialized() ){
+        if(useDouble){
+            std::vector<cl_double> zeros(chainLength,0);
+            chainMasses.initialize<cl_double>(cl, chainLength, "chainMasses");
+            chainForces.initialize<cl_double>(cl, chainLength, "chainForces");
+            chainMasses.upload(zeros);
+            chainForces.upload(zeros);
+        } else {
+            std::vector<cl_float> zeros(chainLength,0);
+            chainMasses.initialize<cl_float>(cl, chainLength, "chainMasses");
+            chainForces.initialize<cl_float>(cl, chainLength, "chainForces");
+            chainMasses.upload(zeros);
+            chainForces.upload(zeros);
+        }
+    }
+    if (chainForces.getSize() < chainLength) chainMasses.resize(chainLength);
+    if (chainMasses.getSize() < chainLength) chainMasses.resize(chainLength);
+    float timeStepFloat = (float) timeStep;
+    // N.B. We ignore the incoming kineticEnergy and grab it from the device buffer instead
+    if (nAtoms) {
+        if (!chainState.count(2*chainID))  chainState[2*chainID] = OpenCLArray();
+        if (chainState.at(2*chainID).getSize() != chainLength) {
+            // We need to upload the OpenCL array
+            if(useDouble){
+                chainState.at(2*chainID).initialize<mm_double2>(cl, chainLength, "chainState" + std::to_string(2*chainID));
+                std::vector<mm_double2> zeros(chainLength, mm_double2(0.0, 0.0));
+                chainState.at(2*chainID).upload(zeros.data());
+            } else {
+                chainState.at(2*chainID).initialize<mm_float2>(cl, chainLength, "chainState" + std::to_string(2*chainID));
+                std::vector<mm_float2> zeros(chainLength, mm_float2(0.0f, 0.0f));
+                chainState.at(2*chainID).upload(zeros.data());
+            }
+        }
+        int chainType = 0;
+        double kT = BOLTZ * temperature;
+        float kTfloat = (float) kT;
+        float frequencyFloat = (float) frequency;
+        propagateKernels[numYS].setArg<cl::Buffer>(0, chainState[2*chainID].getDeviceBuffer());
+        propagateKernels[numYS].setArg<cl::Buffer>(1, kineticEnergyBuffer.getDeviceBuffer());
+        propagateKernels[numYS].setArg<cl::Buffer>(2, scaleFactorBuffer.getDeviceBuffer());
+        propagateKernels[numYS].setArg<cl::Buffer>(3, chainMasses.getDeviceBuffer());
+        propagateKernels[numYS].setArg<cl::Buffer>(4, chainForces.getDeviceBuffer());
+        propagateKernels[numYS].setArg<cl_int>(5, chainType);
+        propagateKernels[numYS].setArg<cl_int>(6, chainLength);
+        propagateKernels[numYS].setArg<cl_int>(7, numMTS);
+        propagateKernels[numYS].setArg<cl_int>(8, numDOFs);
+        propagateKernels[numYS].setArg<cl_float>(9, timeStepFloat);
+        if (useDouble) 
+            propagateKernels[numYS].setArg<cl_double>(10, kT);
+        else
+            propagateKernels[numYS].setArg<cl_float>(10, kTfloat);
+        propagateKernels[numYS].setArg<cl_float>(11, frequencyFloat);
+        cl.executeKernel(propagateKernels[numYS], 1, 1);
+    }
+    if (nPairs) {
+        if (!chainState.count(2*chainID+1)) chainState[2*chainID+1] = OpenCLArray();
+        if (chainState.at(2*chainID+1).getSize() != chainLength) {
+            // We need to upload the OpenCL array
+            if(useDouble){
+                chainState.at(2*chainID+1).initialize<mm_double2>(cl, chainLength, "chainState" + std::to_string(2*chainID+1));
+                std::vector<mm_double2> zeros(chainLength, mm_double2(0.0, 0.0));
+                chainState.at(2*chainID+1).upload(zeros.data());
+            } else {
+                chainState.at(2*chainID+1).initialize<mm_float2>(cl, chainLength, "chainState" + std::to_string(2*chainID+1));
+                std::vector<mm_float2> zeros(chainLength, mm_float2(0.0f, 0.0f));
+                chainState.at(2*chainID+1).upload(zeros.data());
+            }
+        }
+        int chainType = 1;
+        double kT = BOLTZ * relativeTemperature;
+        int ndf = 3*nPairs;
+        float kTfloat = (float) kT;
+        float frequencyFloat = (float) relativeFrequency;
+        propagateKernels[numYS].setArg<cl::Buffer>(0, chainState[2*chainID+1].getDeviceBuffer());
+        propagateKernels[numYS].setArg<cl::Buffer>(1, kineticEnergyBuffer.getDeviceBuffer());
+        propagateKernels[numYS].setArg<cl::Buffer>(2, scaleFactorBuffer.getDeviceBuffer());
+        propagateKernels[numYS].setArg<cl::Buffer>(3, chainMasses.getDeviceBuffer());
+        propagateKernels[numYS].setArg<cl::Buffer>(4, chainForces.getDeviceBuffer());
+        propagateKernels[numYS].setArg<cl_int>(5, chainType);
+        propagateKernels[numYS].setArg<cl_int>(6, chainLength);
+        propagateKernels[numYS].setArg<cl_int>(7, numMTS);
+        propagateKernels[numYS].setArg<cl_int>(8, ndf);
+        propagateKernels[numYS].setArg<cl_float>(9, timeStepFloat);
+        if (useDouble) 
+            propagateKernels[numYS].setArg<cl_double>(10, kT);
+        else
+            propagateKernels[numYS].setArg<cl_float>(10, kTfloat);
+        propagateKernels[numYS].setArg<cl_float>(11, frequencyFloat);
+        cl.executeKernel(propagateKernels[numYS], 1, 1);
+    }
+    return {0, 0};
+}
+double OpenCLNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc) {
+    bool useDouble = cl.getUseDoublePrecision() || cl.getUseMixedPrecision();
+    int chainID = nhc.getDefaultChainID();
+    int chainLength = nhc.getDefaultChainLength();
+    auto & chainState = cl.getIntegrationUtilities().getNoseHooverChainState();
+    bool absChainIsValid = chainState.count(2*chainID) != 0 &&
+                           chainState[2*chainID].isInitialized() &&
+                           chainState[2*chainID].getSize() == chainLength;
+    bool relChainIsValid = chainState.count(2*chainID+1) != 0 &&
+                           chainState[2*chainID+1].isInitialized() &&
+                           chainState[2*chainID+1].getSize() == chainLength;
+    if (!absChainIsValid && !relChainIsValid) return 0.0;
+    if (!heatBathEnergy.isInitialized() || heatBathEnergy.getSize() == 0) {
+        if(useDouble){
+            std::vector<cl_double> one(1);
+            heatBathEnergy.initialize<cl_double>(cl, 1, "heatBathEnergy");
+            heatBathEnergy.upload(one);
+        } else {
+            std::vector<cl_float> one(1);
+            heatBathEnergy.initialize<cl_float>(cl, 1, "heatBathEnergy");
+            heatBathEnergy.upload(one);
+        }
+    }
+    cl.clearBuffer(heatBathEnergy);
+    if (absChainIsValid) {
+        int numDOFs = nhc.getDefaultNumDegreesOfFreedom();
+        double temperature = nhc.getDefaultTemperature();
+        double frequency = nhc.getDefaultCollisionFrequency();
+        double kT = BOLTZ * temperature;
+        float kTfloat = (float) kT;
+        float frequencyFloat = (float) frequency;
+        computeHeatBathEnergyKernel.setArg<cl::Buffer>(0, heatBathEnergy.getDeviceBuffer());
+        computeHeatBathEnergyKernel.setArg<cl_int>(1, chainLength);
+        computeHeatBathEnergyKernel.setArg<cl_int>(2, numDOFs); 
+        if (useDouble)
+            computeHeatBathEnergyKernel.setArg<cl_double>(3, kT);
+        else
+            computeHeatBathEnergyKernel.setArg<cl_float>(3, kTfloat);
+        computeHeatBathEnergyKernel.setArg<cl_float>(4, frequencyFloat);
+        computeHeatBathEnergyKernel.setArg<cl::Buffer>(5, chainState[2*chainID].getDeviceBuffer());
+        cl.executeKernel(computeHeatBathEnergyKernel, 1, 1);
+    }
+    if (relChainIsValid) {
+        int numDOFs = 3 * nhc.getThermostatedPairs().size();
+        double temperature = nhc.getDefaultRelativeTemperature();
+        double frequency = nhc.getDefaultRelativeCollisionFrequency();
+        double kT = BOLTZ * temperature;
+        float kTfloat = (float) kT;
+        float frequencyFloat = (float) frequency;
+        computeHeatBathEnergyKernel.setArg<cl::Buffer>(0, heatBathEnergy.getDeviceBuffer());
+        computeHeatBathEnergyKernel.setArg<cl_int>(1, chainLength);
+        computeHeatBathEnergyKernel.setArg<cl_int>(2, numDOFs); 
+        if (useDouble)
+            computeHeatBathEnergyKernel.setArg<cl_double>(3, kT);
+        else
+            computeHeatBathEnergyKernel.setArg<cl_float>(3, kTfloat);
+        computeHeatBathEnergyKernel.setArg<cl_float>(4, frequencyFloat);
+        computeHeatBathEnergyKernel.setArg<cl::Buffer>(5, chainState[2*chainID+1].getDeviceBuffer());
+        cl.executeKernel(computeHeatBathEnergyKernel, 1, 1);
+    }
+    void * pinnedBuffer = cl.getPinnedBuffer();
+    heatBathEnergy.download(pinnedBuffer);
+    if (useDouble){
+        return *((double*) pinnedBuffer);
+    } else {
+        return *((float*) pinnedBuffer);
+    }
+}
+std::pair<double, double> OpenCLNoseHooverChainKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &nhc, bool downloadValue) {
+    bool useDouble = cl.getUseDoublePrecision() || cl.getUseMixedPrecision();
+    int chainID = nhc.getDefaultChainID();
+    const auto & nhcAtoms = nhc.getThermostatedAtoms();
+    const auto & nhcPairs = nhc.getThermostatedPairs();
+    auto nAtoms = nhcAtoms.size();
+    auto nPairs = nhcPairs.size();
+    if (nAtoms) {
+        if (!atomlists.count(chainID)) { 
+            // We need to upload the OpenCL array
+            atomlists[chainID] = OpenCLArray();
+            atomlists[chainID].initialize<int>(cl, nAtoms, "atomlist" + std::to_string(chainID));
+            atomlists[chainID].upload(nhcAtoms);
+        }
+        if (atomlists[chainID].getSize() != nAtoms) {
+            throw OpenMMException("Number of atoms changed. Cannot be handled by the same Nose-Hoover thermostat.");
+        }
+    }
+    if (nPairs) {
+        if (!pairlists.count(chainID)) { 
+            // We need to upload the OpenCL array
+            pairlists[chainID] = OpenCLArray();
+            pairlists[chainID].initialize<mm_int2>(cl, nPairs, "pairlist" + std::to_string(chainID));
+            std::vector<mm_int2> int2vec;
+            for(const auto &p : nhcPairs) int2vec.push_back(mm_int2(p.first, p.second));
+            pairlists[chainID].upload(int2vec);
+        }
+        if (pairlists[chainID].getSize() != nPairs) {
+            throw OpenMMException("Number of thermostated pairs changed. Cannot be handled by the same Nose-Hoover thermostat.");
+        }
+    }
+    if (!kineticEnergyBuffer.isInitialized() || kineticEnergyBuffer.getSize() == 0) {
+        if(useDouble){
+            std::vector<mm_double2> zeros{{0,0}};
+            kineticEnergyBuffer.initialize<mm_double2>(cl, 1, "kineticEnergyBuffer");
+            kineticEnergyBuffer.upload(zeros);
+        } else {
+            std::vector<mm_float2> zeros{{0,0}};
+            kineticEnergyBuffer.initialize<mm_float2>(cl, 1, "kineticEnergyBuffer");
+            kineticEnergyBuffer.upload(zeros);
+        }
+    }
+    cl.clearBuffer(cl.getEnergyBuffer());
+    if (nAtoms) {
+        computeAtomsKineticEnergyKernel.setArg<cl::Buffer>(0, cl.getEnergyBuffer().getDeviceBuffer());
+        computeAtomsKineticEnergyKernel.setArg<cl_int>(1, nAtoms);
+        computeAtomsKineticEnergyKernel.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
+        computeAtomsKineticEnergyKernel.setArg<cl::Buffer>(3, atomlists[chainID].getDeviceBuffer());
+        cl.executeKernel(computeAtomsKineticEnergyKernel, nAtoms);
+    }
+    if (nPairs) {
+        computePairsKineticEnergyKernel.setArg<cl::Buffer>(0, cl.getEnergyBuffer().getDeviceBuffer());
+        computePairsKineticEnergyKernel.setArg<cl_int>(1, nPairs);
+        computePairsKineticEnergyKernel.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
+        computePairsKineticEnergyKernel.setArg<cl::Buffer>(3, pairlists[chainID].getDeviceBuffer());
+        cl.executeKernel(computePairsKineticEnergyKernel, nPairs);
+    }
+    int bufferSize = cl.getEnergyBuffer().getSize() / 2; // Halve it to account for the fact that we're storing mixed2 instead of mixed in there
+    int workGroupSize  = cl.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
+    if (workGroupSize > 512)
+        workGroupSize = 512;
+    reduceEnergyKernel.setArg<cl::Buffer>(0, cl.getEnergyBuffer().getDeviceBuffer());
+    reduceEnergyKernel.setArg<cl::Buffer>(1, kineticEnergyBuffer.getDeviceBuffer());
+    reduceEnergyKernel.setArg<cl_int>(2, bufferSize);
+    reduceEnergyKernel.setArg<cl_int>(3, workGroupSize);
+    reduceEnergyKernel.setArg(4, 2*workGroupSize*cl.getEnergyBuffer().getElementSize(), NULL);
+    cl.executeKernel(reduceEnergyKernel, workGroupSize, workGroupSize);
+    std::pair<double, double> KEs = {0, 0};
+    if (downloadValue) {
+        if (useDouble) {
+            mm_double2 tmp;
+            kineticEnergyBuffer.download(&tmp);
+            KEs.first = tmp.x;
+            KEs.second = tmp.y;
+        } else {
+            mm_float2 tmp;
+            kineticEnergyBuffer.download(&tmp);
+            KEs.first = tmp.x;
+            KEs.second = tmp.y;
+        }
+    }
+    return KEs;
+}
+void OpenCLNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> scaleFactor) {
+    // For now we assume that the atoms and pairs info is valid, because compute{Atoms|Pairs}KineticEnergy must have been
+    // called before this kernel.  If that ever ceases to be true, some sanity checks are needed here.
+    int chainID = nhc.getDefaultChainID();
+    auto nAtoms = nhc.getThermostatedAtoms().size();
+    auto nPairs = nhc.getThermostatedPairs().size();
+    if(nAtoms) {
+        scaleAtomsVelocitiesKernel.setArg<cl::Buffer>(0, scaleFactorBuffer.getDeviceBuffer());
+        scaleAtomsVelocitiesKernel.setArg<cl_int>(1, nAtoms);
+        scaleAtomsVelocitiesKernel.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
+        scaleAtomsVelocitiesKernel.setArg<cl::Buffer>(3, atomlists[chainID].getDeviceBuffer());
+        cl.executeKernel(scaleAtomsVelocitiesKernel, nAtoms);
+    }
+    if(nPairs) {
+        scalePairsVelocitiesKernel.setArg<cl::Buffer>(0, scaleFactorBuffer.getDeviceBuffer());
+        scalePairsVelocitiesKernel.setArg<cl_int>(1, nPairs);
+        scalePairsVelocitiesKernel.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
+        scalePairsVelocitiesKernel.setArg<cl::Buffer>(3, pairlists[chainID].getDeviceBuffer());
+        cl.executeKernel(scalePairsVelocitiesKernel, nPairs);
+    }
+}
 void OpenCLApplyMonteCarloBarostatKernel::initialize(const System& system, const Force& thermostat) {
    savedPositions.initialize(cl, cl.getPaddedNumAtoms(), cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4), "savedPositions");

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -87,12 +87,14 @@ OpenCLPlatform::OpenCLPlatform() {
    registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateVelocityVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
    registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
+    registerKernelFactory(NoseHooverChainKernel::Name(), factory);
    registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
    registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
    platformProperties.push_back(OpenCLDeviceIndex());

--- a/platforms/opencl/src/kernels/diffcucl.txt
+++ b/platforms/opencl/src/kernels/diffcucl.txt
+2c2
+<  * Perform the first step of Verlet integration.
+---
+>  * Perform the first step of verlet integration.
+5,11c5,10
+< extern "C" __global__ void integrateVerletPart1(int numAtoms, int paddedNumAtoms, const mixed2* __restrict__ dt, const real4* __restrict__ posq,
+<         const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) {
+<     const mixed2 stepSize = dt[0];
+<     const mixed dtPos = stepSize.y;
+<     const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
+<     const mixed scale = dtVel/(mixed) 0x100000000;
+<     for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+---
+> __kernel void integrateVerletPart1(int numAtoms, __global const mixed2* restrict dt, __global const real4* restrict posq, __global const real4* restrict posqCorrection, __global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta) {
+>     mixed2 stepSize = dt[0];
+>     mixed dtPos = stepSize.y;
+>     mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
+>     int index = get_global_id(0);
+>     while (index < numAtoms) {
+17c16
+<             mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+---
+>             mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+21,26c20,23
+<             velocity.x += scale*force[index]*velocity.w;
+<             velocity.y += scale*force[index+paddedNumAtoms]*velocity.w;
+<             velocity.z += scale*force[index+paddedNumAtoms*2]*velocity.w;
+<             pos.x = velocity.x*dtPos;
+<             pos.y = velocity.y*dtPos;
+<             pos.z = velocity.z*dtPos;
+---
+>             velocity.x += force[index].x*dtVel*velocity.w;
+>             velocity.y += force[index].y*dtVel*velocity.w;
+>             velocity.z += force[index].z*dtVel*velocity.w;
+>             pos.xyz = velocity.xyz*dtPos;
+29a27
+>         index += get_global_size(0);
+34c32
+<  * Perform the second step of Verlet integration.
+---
+>  * Perform the second step of verlet integration.
+37,38c35
+< extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
+<         real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
+---
+> __kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, __global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict velm, __global const mixed4* restrict posDelta) {
+40c37
+< #if __CUDA_ARCH__ >= 130
+---
+> #ifdef SUPPORTS_DOUBLE_PRECISION
+46,47c43
+<     int index = blockIdx.x*blockDim.x+threadIdx.x;
+<     if (index == 0)
+---
+>     if (get_global_id(0) == 0)
+49c45,47
+<     for (; index < numAtoms; index += blockDim.x*gridDim.x) {
+---
+>     barrier(CLK_LOCAL_MEM_FENCE);
+>     int index = get_global_id(0);
+>     while (index < numAtoms) {
+55c53
+<             mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+---
+>             mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+60,64c58,60
+<             pos.x += delta.x;
+<             pos.y += delta.y;
+<             pos.z += delta.z;
+< #if __CUDA_ARCH__ >= 130
+<             velocity = make_mixed4((mixed) (delta.x*oneOverDt), (mixed) (delta.y*oneOverDt), (mixed) (delta.z*oneOverDt), velocity.w);
+---
+>             pos.xyz += delta.xyz;
+> #ifdef SUPPORTS_DOUBLE_PRECISION
+>             velocity.xyz = convert_mixed4(convert_double4(delta)*oneOverDt).xyz;
+66c62
+<             velocity = make_mixed4((mixed) (delta.x*oneOverDt+delta.x*correction), (mixed) (delta.y*oneOverDt+delta.y*correction), (mixed) (delta.z*oneOverDt+delta.z*correction), velocity.w);
+---
+>             velocity.xyz = delta.xyz*oneOverDt + delta.xyz*correction;
+69,70c65,66
+<             posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
+<             posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+---
+>             posq[index] = convert_real4(pos);
+>             posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+75a72
+>         index += get_global_size(0);
+83c80
+< extern "C" __global__ void selectVerletStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) {
+---
+> __kernel void selectVerletStepSize(int numAtoms, mixed maxStepSize, mixed errorTol, __global mixed2* restrict dt, __global const mixed4* restrict velm, __global const real4* restrict force, __local mixed* restrict error) {
+86,90c83,86
+<     extern __shared__ mixed error[];
+<     mixed err = 0.0f;
+<     const mixed scale = RECIP((mixed) 0x100000000);
+<     for (int index = threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+<         mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
+---
+>     mixed err = 0;
+>     int index = get_local_id(0);
+>     while (index < numAtoms) {
+>         real4 f = force[index];
+92a89
+>         index += get_global_size(0);
+94,95c91,92
+<     error[threadIdx.x] = err;
+<     __syncthreads();
+---
+>     error[get_local_id(0)] = err;
+>     barrier(CLK_LOCAL_MEM_FENCE);
+99,102c96,99
+<     for (unsigned int offset = 1; offset < blockDim.x; offset *= 2) {
+<         if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0)
+<             error[threadIdx.x] += error[threadIdx.x+offset];
+<         __syncthreads();
+---
+>     for (unsigned int offset = 1; offset < get_local_size(0); offset *= 2) {
+>         if (get_local_id(0)+offset < get_local_size(0) && (get_local_id(0)&(2*offset-1)) == 0)
+>             error[get_local_id(0)] += error[get_local_id(0)+offset];
+>         barrier(CLK_LOCAL_MEM_FENCE);
+104,106c101,103
+<     if (threadIdx.x == 0) {
+<         mixed totalError = SQRT(error[0]/(numAtoms*3));
+<         mixed newStepSize = SQRT(errorTol/totalError);
+---
+>     if (get_local_id(0) == 0) {
+>         mixed totalError = sqrt(error[0]/(numAtoms*3));
+>         mixed newStepSize = sqrt(errorTol/totalError);
--- a/platforms/opencl/src/kernels/noseHooverChain.cl
+++ b/platforms/opencl/src/kernels/noseHooverChain.cl
+//#include <initializer_list>
+__kernel void propagateNoseHooverChain(__global mixed2* restrict chainData, __global const mixed2 * restrict energySum, __global mixed2* restrict scaleFactor,
+                                                    __global mixed* restrict chainMasses, __global mixed* restrict chainForces, 
+                                                    int chainType, int chainLength, int numMTS, int numDOFs, float timeStep,
+                                                    mixed kT, float frequency){
+    const mixed kineticEnergy = chainType == 0 ? energySum[0].x : energySum[0].y;
+    mixed scale = 1;
+    if(kineticEnergy < 1e-8) return;
+    for (int bead = 0; bead < chainLength; ++bead) chainMasses[bead] = kT / (frequency * frequency);
+    chainMasses[0] *= numDOFs;
+    mixed KE2 = 2.0f * kineticEnergy;
+    mixed timeOverMTS = timeStep / numMTS;
+    chainForces[0] = (KE2 - numDOFs * kT) / chainMasses[0];
+    for (int bead = 0; bead < chainLength - 1; ++bead) {
+        chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
+    }
+    for (int mts = 0; mts < numMTS; ++mts) {
+        BEGIN_YS_LOOP
+            mixed wdt = ys * timeOverMTS;
+            chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
+            for (int bead = chainLength - 2; bead >= 0; --bead) {
+                mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y);
+                chainData[bead].y = aa * (chainData[bead].y * aa + 0.25f * wdt * chainForces[bead]);
+            }
+            // update particle velocities
+            mixed aa = EXP(-0.5f * wdt * chainData[0].y);
+            scale *= aa;
+            // update the thermostat positions
+            for (int bead = 0; bead < chainLength; ++bead) {
+                chainData[bead].x += 0.5f * chainData[bead].y * wdt;
+            }
+            // update the forces
+            chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0];
+            // update thermostat velocities
+            for (int bead = 0; bead < chainLength - 1; ++bead) {
+                mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y);
+                chainData[bead].y = aa * (aa * chainData[bead].y + 0.25f * wdt * chainForces[bead]);
+                chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
+            }
+            chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
+        END_YS_LOOP
+    } // MTS loop
+    if (chainType == 0) {
+        scaleFactor[0].x = scale;
+    } else {
+        scaleFactor[0].y = scale;
+    }
+}
+/**
+ * Compute total (potential + kinetic) energy of the Nose-Hoover beads
+ */
+__kernel void computeHeatBathEnergy(__global mixed* restrict heatBathEnergy, int chainLength, int numDOFs,
+                                                 mixed kT, float frequency, __global const mixed2* restrict chainData){
+    // Note that this is always incremented; make sure it's zeroed properly before the first call
+    for(int i = 0; i < chainLength; ++i) {
+        mixed prefac = i ? 1 : numDOFs;
+        mixed mass = prefac * kT / (frequency * frequency);
+        mixed velocity = chainData[i].y; 
+        // The kinetic energy of this bead
+        heatBathEnergy[0] += 0.5f * mass * velocity * velocity;
+        // The potential energy of this bead
+        mixed position = chainData[i].x;
+        heatBathEnergy[0] += prefac * kT * position;
+    }
+}
+__kernel void computeAtomsKineticEnergy(__global mixed2 * restrict energyBuffer, int numAtoms,
+                                        __global const mixed4* restrict velm, __global const int *restrict atoms){
+    mixed2 energy = (mixed2) (0,0);
+    //energy = 1; return;
+    int index = get_global_id(0);
+    while (index < numAtoms){
+        int atom = atoms[index];
+        mixed4 v = velm[atom];
+        mixed mass = v.w == 0 ? 0 : 1 / v.w;
+        energy.x += 0.5f * mass * (v.x*v.x + v.y*v.y + v.z*v.z);
+        index += get_global_size(0);
+    }
+    energyBuffer[get_global_id(0)] = energy;
+}
+__kernel void computePairsKineticEnergy(__global mixed2 * restrict energyBuffer, int numPairs,
+                                        __global const mixed4* restrict velm, __global const int2 *restrict pairs){
+    mixed2 energy = (mixed2) (0,0);
+    int index = get_global_id(0);
+    while (index < numPairs){
+        int2 pair = pairs[index];
+        int atom1 = pair.x;
+        int atom2 = pair.y;
+        mixed4 v1 = velm[atom1];
+        mixed4 v2 = velm[atom2];
+        mixed m1 = v1.w == 0 ? 0 : 1 / v1.w;
+        mixed m2 = v2.w == 0 ? 0 : 1 / v2.w;
+        mixed4 cv;
+        cv.x = (m1*v1.x + m2*v2.x) / (m1 + m2);
+        cv.y = (m1*v1.y + m2*v2.y) / (m1 + m2);
+        cv.z = (m1*v1.z + m2*v2.z) / (m1 + m2);
+        mixed4 rv;
+        rv.x = v2.x - v1.x;
+        rv.y = v2.y - v1.y;
+        rv.z = v2.z - v1.z;
+        energy.x += 0.5f * (m1 + m2) * (cv.x*cv.x + cv.y*cv.y + cv.z*cv.z);
+        energy.y += 0.5f * (m1 * m2 / (m1 + m2)) * (rv.x*rv.x + rv.y*rv.y + rv.z*rv.z);
+        index += get_global_size(0);
+    }
+    // The atoms version of this has been called already, so accumulate instead of assigning here
+    energyBuffer[get_global_id(0)].xy += energy.xy;
+}
+__kernel void scaleAtomsVelocities(__global mixed2* restrict scaleFactor, int numAtoms,
+                                   __global mixed4* restrict velm, __global const int *restrict atoms){
+    const mixed scale = scaleFactor[0].x;
+    int index = get_global_id(0);
+    while (index < numAtoms){
+        int atom = atoms[index];
+        velm[atom].x *= scale;
+        velm[atom].y *= scale;
+        velm[atom].z *= scale;
+        index += get_global_size(0);
+    }
+}
+__kernel void scalePairsVelocities(__global mixed2 * restrict scaleFactor, int numPairs,
+                                   __global mixed4* restrict velm, __global const int2 *restrict pairs){
+    int index = get_global_id(0);
+    while (index < numPairs){
+        int atom1 = pairs[index].x;
+        int atom2 = pairs[index].y;
+        mixed m1 = velm[atom1].w == 0 ? 0 : 1 / velm[atom1].w;
+        mixed m2 = velm[atom2].w == 0 ? 0 : 1 / velm[atom2].w;
+        mixed4 cv;
+        cv.xyz = (m1*velm[atom1].xyz + m2*velm[atom2].xyz) / (m1 + m2);
+        mixed4 rv;
+        rv.xyz = velm[atom2].xyz - velm[atom1].xyz;
+        velm[atom1].x = scaleFactor[0].x * cv.x - scaleFactor[0].y * rv.x * m2 / (m1 + m2);
+        velm[atom1].y = scaleFactor[0].x * cv.y - scaleFactor[0].y * rv.y * m2 / (m1 + m2);
+        velm[atom1].z = scaleFactor[0].x * cv.z - scaleFactor[0].y * rv.z * m2 / (m1 + m2);
+        velm[atom2].x = scaleFactor[0].x * cv.x + scaleFactor[0].y * rv.x * m1 / (m1 + m2);
+        velm[atom2].y = scaleFactor[0].x * cv.y + scaleFactor[0].y * rv.y * m1 / (m1 + m2);
+        velm[atom2].z = scaleFactor[0].x * cv.z + scaleFactor[0].y * rv.z * m1 / (m1 + m2);
+        index += get_global_size(0);
+    }
+}
+/**
+ * Sum the energy buffer containing a pair of energies stored as mixed2.  This is copied from utilities.cu with small modifications
+ */
+__kernel void reduceEnergyPair(__global const mixed2* restrict energyBuffer, __global mixed2* restrict result, int bufferSize, int workGroupSize, __local mixed2* restrict tempBuffer) {
+    const unsigned int thread = get_local_id(0);
+    mixed2 sum = (mixed2) (0,0);
+    for (unsigned int index = thread; index < bufferSize; index += get_local_size(0)) {
+        sum.xy += energyBuffer[index].xy;
+    }
+    tempBuffer[thread].xy = sum.xy;
+    for (int i = 1; i < workGroupSize; i *= 2) {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if (thread%(i*2) == 0 && thread+i < workGroupSize) {
+            tempBuffer[thread].xy += tempBuffer[thread+i].xy;
+        }
+    }
+    if (thread == 0) {
+        *result = tempBuffer[0];
+    }
+}
--- a/platforms/opencl/src/kernels/velocityVerlet.cl
+++ b/platforms/opencl/src/kernels/velocityVerlet.cl
+/**
+ * Perform the first step of Velocity Verlet integration.
+ */
+__kernel void integrateVelocityVerletPart1(int numAtoms, int paddedNumAtoms, __global const mixed2* restrict dt, __global const real4* restrict posq,
+        __global const real4* restrict posqCorrection, __global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta) {
+    const mixed2 stepSize = dt[0];
+    const mixed dtPos = stepSize.y;
+    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
+    int index = get_global_id(0);
+    while (index < numAtoms) { 
+        mixed4 velocity = velm[index];
+        if (velocity.w != 0.0) {
+#ifdef USE_MIXED_PRECISION
+            real4 pos1 = posq[index];
+            real4 pos2 = posqCorrection[index];
+            mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
+            real4 pos = posq[index];
+#endif
+            velocity.x += 0.5f * dtVel*force[index].x*velocity.w;
+            velocity.y += 0.5f * dtVel*force[index].y*velocity.w;
+            velocity.z += 0.5f * dtVel*force[index].z*velocity.w;
+            pos.x = velocity.x*dtPos;
+            pos.y = velocity.y*dtPos;
+            pos.z = velocity.z*dtPos;
+            posDelta[index] = pos;
+            velm[index] = velocity;
+        }
+        index += get_global_size(0);
+    }
+}
+/**
+ * Perform the second step of Velocity Verlet integration.
+ */
+__kernel void integrateVelocityVerletPart2(int numAtoms, __global mixed2* restrict dt, __global real4* restrict posq,
+        __global real4* restrict posqCorrection, __global mixed4* restrict velm, __global const mixed4* restrict posDelta) {
+    mixed2 stepSize = dt[0];
+    int index = get_global_id(0);
+    if (index == 0)
+        dt[0].x = stepSize.y;
+    while(index < numAtoms) {
+        mixed4 velocity = velm[index];
+        if (velocity.w != 0.0) {
+#ifdef USE_MIXED_PRECISION
+            real4 pos1 = posq[index];
+            real4 pos2 = posqCorrection[index];
+            mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
+            real4 pos = posq[index];
+#endif
+            mixed4 delta = posDelta[index];
+            pos.xyz += delta.xyz;
+#ifdef USE_MIXED_PRECISION
+            posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
+            posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
+            posq[index] = pos;
+#endif
+        }
+        index += get_global_size(0);
+    }
+}
+/**
+ * Perform the third step of Velocity Verlet integration.
+ */
+__kernel void integrateVelocityVerletPart3(int numAtoms, int paddedNumAtoms, __global mixed2* restrict dt, __global real4* restrict posq,
+        __global real4* restrict posqCorrection, __global mixed4* restrict velm, __global const real4* restrict force, __global const mixed4* restrict posDelta) {
+    mixed2 stepSize = dt[0];
+#ifndef SUPPORTS_DOUBLE_PRECISION
+    double oneOverDt = 1.0/stepSize.y;
+#else
+    float oneOverDt = 1.0f/stepSize.y;
+    float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
+#endif
+    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
+    int index = get_global_id(0);
+    if (index == 0)
+        dt[0].x = stepSize.y;
+    while(index < numAtoms) {
+        mixed4 velocity = velm[index];
+        if (velocity.w != 0.0) {
+            mixed4 deltaXconstrained = posDelta[index];
+            velocity.x += 0.5f * dtVel*force[index].x*velocity.w + (deltaXconstrained.x - velocity.x*stepSize.y)*oneOverDt;
+            velocity.y += 0.5f * dtVel*force[index].y*velocity.w + (deltaXconstrained.y - velocity.y*stepSize.y)*oneOverDt;
+            velocity.z += 0.5f * dtVel*force[index].z*velocity.w + (deltaXconstrained.z - velocity.z*stepSize.y)*oneOverDt;
+#ifdef SUPPORTS_DOUBLE_PRECISION
+            velocity.x += (deltaXconstrained.x - velocity.x*stepSize.y)*correction;
+            velocity.y += (deltaXconstrained.y - velocity.y*stepSize.y)*correction;
+            velocity.z += (deltaXconstrained.z - velocity.z*stepSize.y)*correction;
+#endif
+            velm[index] = velocity;
+        }
+        index += get_global_size(0);
+    }
+}
--- a/platforms/opencl/tests/TestOpenCLNoseHooverIntegrator.cpp
+++ b/platforms/opencl/tests/TestOpenCLNoseHooverIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Andreas Krämer and Andrew C. Simmmonett                           *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "OpenCLTests.h"
+#include "TestNoseHooverIntegrator.h"
+void runPlatformTests() {
+}
--- a/platforms/opencl/tests/TestOpenCLNoseHooverThermostat.cpp
+++ b/platforms/opencl/tests/TestOpenCLNoseHooverThermostat.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Andreas Krämer and Andrew C. Simmonett                            *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "OpenCLTests.h"
+#include "TestNoseHooverThermostat.h"
+void runPlatformTests() {
+}
--- a/plugins/drude/platforms/opencl/tests/CMakeLists.txt
+++ b/plugins/drude/platforms/opencl/tests/CMakeLists.txt
@@ -5,6 +5,7 @@
 ENABLE_TESTING()
 INCLUDE_DIRECTORIES(${OPENCL_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${OPENMM_DIR}/plugins/drude/tests)
 # Automatically create tests using files named "Test*.cpp"
 FILE(GLOB TEST_PROGS "*Test*.cpp")

--- a/plugins/drude/platforms/opencl/tests/TestOpenCLDrudeNoseHoover.cpp
+++ b/plugins/drude/platforms/opencl/tests/TestOpenCLDrudeNoseHoover.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+//#include "ReferenceTests.h"
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/Platform.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "openmm/DrudeForce.h"
+#include "OpenCLPlatform.h"
+#include "SimTKOpenMMUtilities.h"
+#include <iostream>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+extern "C" OPENMM_EXPORT void registerDrudeOpenCLKernelFactories();
+Platform& initializePlatform(int argc, char* argv[]) {
+    registerDrudeOpenCLKernelFactories();
+    if (argc > 1) Platform::getPlatformByName("OpenCL").setPropertyDefaultValue("Precision", std::string(argv[1]));   
+    return Platform::getPlatformByName("OpenCL");
+}
+#include "TestDrudeNoseHoover.h"
+void runPlatformTests() { }