Initial Nose Hoover common implementation

fd7c5465 · Andy Simmonett · fd263401 · fd7c5465 · fd7c5465 · fd7c5465
Unverified Commit fd7c5465 authored Jan 16, 2020 by Andy Simmonett
6 changed files
--- a/platforms/common/include/openmm/common/CommonKernels.h
+++ b/platforms/common/include/openmm/common/CommonKernels.h
@@ -945,6 +945,116 @@ private:
    ComputeKernel kernel1, kernel2, kernel3, kernel4;
 };

+/*
+ * This kernel is invoked by NoseHooverIntegrator to take one time step.
+ */
+class CommonIntegrateVelocityVerletStepKernel : public IntegrateVelocityVerletStepKernel {
+public:
+    CommonIntegrateVelocityVerletStepKernel(std::string name, const Platform& platform, ComputeContext& cc) :
+                                  IntegrateVelocityVerletStepKernel(name, platform), cc(cc), hasInitializedKernels(false) { }
+    ~CommonIntegrateVelocityVerletStepKernel() {}
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param integrator the NoseHooverIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const NoseHooverIntegrator& integrator);
+    /**
+     * Execute the kernel.
+     *
+     * @param context    the context in which to execute this kernel
+     * @param integrator the VerletIntegrator this kernel is being used for
+     * @param forcesAreValid a reference to the parent integrator's boolean for keeping
+     *                       track of the validity of the current forces.
+     */
+    void execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid);
+    /**
+     * Compute the kinetic energy.
+     *
+     * @param context    the context in which to execute this kernel
+     * @param integrator the NoseHooverIntegrator this kernel is being used for
+     */
+    double computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator);
+private:
+    ComputeContext& cc;
+    float prevMaxPairDistance;
+    ComputeArray maxPairDistanceBuffer, pairListBuffer, atomListBuffer, pairTemperatureBuffer;
+    ComputeKernel kernel1, kernel2, kernel3, kernelHardWall;
+    bool hasInitializedKernels;
+};
+
+/**
+ * This kernel is invoked by NoseHooverChain at the start of each time step to adjust the thermostat
+ * and update the associated particle velocities.
+ */
+class CommonNoseHooverChainKernel : public NoseHooverChainKernel {
+public:
+    CommonNoseHooverChainKernel(std::string name, const Platform& platform, ComputeContext& cc) :
+                 NoseHooverChainKernel(name, platform), cc(cc), hasInitializedPropagateKernel(false),
+                 hasInitializedKineticEnergyKernel(false), hasInitializedHeatBathEnergyKernel(false),
+                 hasInitializedScaleVelocitiesKernel(false) {}
+    ~CommonNoseHooverChainKernel() {}
+    /**
+     * Initialize the kernel.
+     */
+    void initialize();
+    /**
+     * Execute the kernel that propagates the Nose Hoover chain and determines the velocity scale factor.
+     *
+     * @param context  the context in which to execute this kernel
+     * @param noseHooverChain the object describing the chain to be propagated.
+     * @param kineticEnergies the {absolute, relative} kineticEnergy of the particles being thermostated by this chain.
+     * @param timeStep the time step used by the integrator.
+     * @return the {absolute, relative} velocity scale factor to apply to the particles associated with this heat bath.
+     */
+    std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep);
+    /**
+     * Execute the kernal that computes the total (kinetic + potential) heat bath energy.
+     *
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose energy is to be determined.
+     * @return the total heat bath energy.
+     */
+    double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc);
+    /**
+     * Execute the kernel that computes the kinetic energy for a subset of atoms,
+     * or the relative kinetic energy of Drude particles with respect to their parent atoms
+     *
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose energy is to be determined.
+     * @param downloadValue whether the computed value should be downloaded and returned.
+     *
+     */
+    std::pair<double,double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);
+
+    /**
+     * Execute the kernel that scales the velocities of particles associated with a nose hoover chain
+     *
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose energy is to be determined.
+     * @param scaleFactors the {absolute, relative} multiplicative factor by which velocities are scaled.
+     */
+    void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactors);
+
+private:
+    int sumWorkGroupSize;
+    ComputeContext& cc;
+    ComputeArray energyBuffer, scaleFactorBuffer, kineticEnergyBuffer, chainMasses, chainForces, heatBathEnergy;
+    std::map<int, ComputeArray> atomlists, pairlists;
+    std::map<int, ComputeKernel> propagateKernels;
+    bool hasInitializedPropagateKernel;
+    bool hasInitializedKineticEnergyKernel;
+    bool hasInitializedHeatBathEnergyKernel;
+    bool hasInitializedScaleVelocitiesKernel;
+    ComputeKernel reduceEnergyKernel;
+    ComputeKernel computeHeatBathEnergyKernel;
+    ComputeKernel computeAtomsKineticEnergyKernel;
+    ComputeKernel computePairsKineticEnergyKernel;
+    ComputeKernel scaleAtomsVelocitiesKernel;
+    ComputeKernel scalePairsVelocitiesKernel;
+};
+
 /**
 * This kernel is invoked by BrownianIntegrator to take one time step.
 */

--- a/platforms/common/src/CommonKernels.cpp
+++ b/platforms/common/src/CommonKernels.cpp
@@ -5660,6 +5660,522 @@ double CommonIntegrateBAOABStepKernel::computeKineticEnergy(ContextImpl& context
    return cc.getIntegrationUtilities().computeKineticEnergy(0.0);
 }

+void CommonIntegrateVelocityVerletStepKernel::initialize(const System& system, const NoseHooverIntegrator& integrator) {
+    cc.initializeContexts();
+    map<string, string> defines;
+    defines["BOLTZ"] = cc.doubleToString(BOLTZ);
+    ComputeProgram program = cc.compileProgram(CommonKernelSources::velocityVerlet, defines);
+    kernel1 = program->createKernel("integrateVelocityVerletPart1");
+    kernel2 = program->createKernel("integrateVelocityVerletPart2");
+    kernel3 = program->createKernel("integrateVelocityVerletPart3");
+    kernelHardWall = program->createKernel("integrateVelocityVerletHardWall");
+    prevMaxPairDistance = -1.0f;
+    maxPairDistanceBuffer.initialize<float>(cc, 1, "maxPairDistanceBuffer");
+}
+
+void CommonIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid) {
+    IntegrationUtilities& integration = cc.getIntegrationUtilities();
+    int paddedNumAtoms = cc.getPaddedNumAtoms();
+    double dt = integrator.getStepSize();
+    cc.getIntegrationUtilities().setNextStepSize(dt);
+
+    // If the atom reordering has occured, the forces from the previous step are permuted and thus invalid.
+    // They need to be either sorted or recomputed; here we choose the latter.
+    if (!forcesAreValid || cc.getAtomsWereReordered()) context.calcForcesAndEnergy(true, false);
+
+    const auto& atomList = integrator.getAllThermostatedIndividualParticles();
+    const auto& pairList = integrator.getAllThermostatedPairs();
+    int numAtoms = atomList.size();
+    int numPairs = pairList.size();
+    int numParticles = numAtoms + 2*numPairs;
+    float maxPairDistance = integrator.getMaximumPairDistance();
+    // Make sure atom and pair metadata is uploaded and has the correct dimensions
+    if (prevMaxPairDistance != maxPairDistance) {
+        std::vector<float> tmp(1, maxPairDistance);
+        maxPairDistanceBuffer.upload(tmp);
+        prevMaxPairDistance = maxPairDistance;
+    }
+    if (numAtoms !=0 && (!atomListBuffer.isInitialized() || atomListBuffer.getSize() != numAtoms)) {
+        if (atomListBuffer.isInitialized())
+            atomListBuffer.resize(atomList.size());
+        else
+            atomListBuffer.initialize<int>(cc, atomList.size(), "atomListBuffer");
+        atomListBuffer.upload(atomList);
+    }
+    if (numPairs !=0 && (!pairListBuffer.isInitialized() || pairListBuffer.getSize() != numPairs)) {
+        if (pairListBuffer.isInitialized()) {
+            pairListBuffer.resize(pairList.size());
+            pairTemperatureBuffer.resize(pairList.size());
+        }
+        else {
+            pairListBuffer.initialize<mm_int2>(cc, pairList.size(), "pairListBuffer");
+            pairTemperatureBuffer.initialize<float>(cc, pairList.size(), "pairTemperatureBuffer");
+        }
+        std::vector<mm_int2> tmp;
+        std::vector<float> tmp2;
+        for(const auto &pair : pairList) {
+            tmp.push_back(mm_int2(std::get<0>(pair), std::get<1>(pair)));
+            tmp2.push_back(std::get<2>(pair));
+        }
+        pairListBuffer.upload(tmp);
+        pairTemperatureBuffer.upload(tmp2);
+    }
+
+    if (!hasInitializedKernels) {
+        hasInitializedKernels = true;
+        kernel1->addArg(numAtoms);
+        kernel1->addArg(numPairs);
+        kernel1->addArg(paddedNumAtoms);
+        kernel1->addArg(cc.getIntegrationUtilities().getStepSize());
+        kernel1->addArg(cc.getPosq());
+        kernel1->addArg(cc.getVelm());
+        kernel1->addArg(cc.getLongForceBuffer());
+        kernel1->addArg(integration.getPosDelta());
+        kernel1->addArg(numAtoms > 0 ? atomListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
+        kernel1->addArg(numPairs > 0 ? pairListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
+        if (cc.getUseMixedPrecision())
+            kernel1->addArg(cc.getPosqCorrection());
+        kernel2->addArg(numParticles);
+        kernel2->addArg(cc.getIntegrationUtilities().getStepSize());
+        kernel2->addArg(cc.getPosq());
+        kernel2->addArg(cc.getVelm());
+        kernel2->addArg(integration.getPosDelta());
+        if (cc.getUseMixedPrecision())
+            kernel2->addArg(cc.getPosqCorrection());
+        if (numPairs > 0) {
+            kernelHardWall->addArg(numPairs);
+            kernelHardWall->addArg(maxPairDistanceBuffer);
+            kernelHardWall->addArg(cc.getIntegrationUtilities().getStepSize());
+            kernelHardWall->addArg(cc.getPosq());
+            kernelHardWall->addArg(cc.getVelm());
+            kernelHardWall->addArg(pairListBuffer);
+            kernelHardWall->addArg(pairTemperatureBuffer);
+            if (cc.getUseMixedPrecision())
+                kernelHardWall->addArg(cc.getPosqCorrection());
+        }
+        kernel3->addArg(numAtoms);
+        kernel3->addArg(numPairs);
+        kernel3->addArg(paddedNumAtoms);
+        kernel3->addArg(cc.getIntegrationUtilities().getStepSize());
+        kernel3->addArg(cc.getPosq());
+        kernel3->addArg(cc.getVelm());
+        kernel3->addArg(cc.getLongForceBuffer());
+        kernel3->addArg(integration.getPosDelta());
+        kernel3->addArg(numAtoms > 0 ? atomListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
+        kernel3->addArg(numPairs > 0 ? pairListBuffer : cc.getEnergyBuffer()); // The array is not used if num == 0
+        if (cc.getUseMixedPrecision())
+            kernel3->addArg(cc.getPosqCorrection());
+    }
+
+    /*
+     * Carry out the integration
+     */
+
+    // Advance the velocities a half step
+    kernel1->execute(std::max(numAtoms, numPairs));
+    integration.applyConstraints(integrator.getConstraintTolerance());
+    // Advance particle positions a full step
+    kernel2->execute(numParticles);
+    // Make sure any Drude-like particles have not wandered too far from home
+    if (numPairs > 0) kernelHardWall->execute(numPairs);
+    integration.computeVirtualSites();
+    context.calcForcesAndEnergy(true, false);
+    forcesAreValid = true;
+    // Update velocities another half step
+    kernel3->execute(std::max(numAtoms, numPairs));
+    integration.applyVelocityConstraints(integrator.getConstraintTolerance());
+
+    cc.setTime(cc.getTime()+dt);
+    cc.setStepCount(cc.getStepCount()+1);
+    cc.reorderAtoms();
+}
+
+double CommonIntegrateVelocityVerletStepKernel::computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator) {
+    return cc.getIntegrationUtilities().computeKineticEnergy(0);
+}
+
+void CommonNoseHooverChainKernel::initialize() {
+    bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
+    map<string, string> defines;
+    int workGroupSize = std::min(cc.getMaxThreadBlockSize(), 512);
+    defines["WORK_GROUP_SIZE"] = std::to_string(workGroupSize);
+    defines["BEGIN_YS_LOOP"] = "const real arr[1] = {1.0}; for(int i=0;i<1;++i) { const real ys = arr[i];";
+    defines["END_YS_LOOP"] = "}";
+    ComputeProgram program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
+    propagateKernels[1] = program->createKernel("propagateNoseHooverChain");
+    defines["BEGIN_YS_LOOP"] = "const real arr[3] = {0.828981543588751, -0.657963087177502, 0.828981543588751}; for(int i=0;i<3;++i) { const real ys = arr[i];";
+    program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
+    propagateKernels[3] = program->createKernel("propagateNoseHooverChain");
+    defines["BEGIN_YS_LOOP"] = "const real arr[5] = {0.2967324292201065, 0.2967324292201065, -0.186929716880426, 0.2967324292201065, 0.2967324292201065}; for(int i=0;i<5;++i) { const real ys = arr[i];";
+    program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
+    propagateKernels[5] = program->createKernel("propagateNoseHooverChain");
+    program = cc.compileProgram(CommonKernelSources::noseHooverChain, defines);
+    reduceEnergyKernel = program->createKernel("reduceEnergyPair");
+
+    computeHeatBathEnergyKernel = program->createKernel("computeHeatBathEnergy");
+    computeAtomsKineticEnergyKernel = program->createKernel("computeAtomsKineticEnergy");
+    computePairsKineticEnergyKernel = program->createKernel("computePairsKineticEnergy");
+    scaleAtomsVelocitiesKernel = program->createKernel("scaleAtomsVelocities");
+    scalePairsVelocitiesKernel = program->createKernel("scalePairsVelocities");
+    int energyBufferSize = cc.getEnergyBuffer().getSize();
+    if (cc.getUseDoublePrecision() || cc.getUseMixedPrecision())
+        energyBuffer.initialize<mm_double2>(cc, energyBufferSize, "energyBuffer");
+    else
+        energyBuffer.initialize<mm_float2>(cc, energyBufferSize, "energyBuffer");
+}
+
+std::pair<double, double> CommonNoseHooverChainKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep) {
+    bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
+    int chainID = nhc.getChainID();
+    int nAtoms = nhc.getThermostatedAtoms().size();
+    int nPairs = nhc.getThermostatedPairs().size();
+    int chainLength = nhc.getChainLength();
+    int numYS = nhc.getNumYoshidaSuzukiTimeSteps();
+    int numMTS = nhc.getNumMultiTimeSteps();
+
+    if (numYS != 1 && numYS != 3 && numYS != 5) {
+        throw OpenMMException("Number of Yoshida Suzuki time steps has to be 1, 3, or 5.");
+    }
+
+    auto & chainState = cc.getIntegrationUtilities().getNoseHooverChainState();
+
+    if (!scaleFactorBuffer.isInitialized() || scaleFactorBuffer.getSize() == 0) {
+        if (useDouble) {
+            std::vector<mm_double2> zeros{{0,0}};
+            if (scaleFactorBuffer.isInitialized())
+                scaleFactorBuffer.resize(1);
+            else
+                scaleFactorBuffer.initialize<mm_double2>(cc, 1, "scaleFactorBuffer");
+            scaleFactorBuffer.upload(zeros);
+        }
+        else {
+            std::vector<mm_float2> zeros{{0,0}};
+            if (scaleFactorBuffer.isInitialized())
+                scaleFactorBuffer.resize(1);
+            else
+                scaleFactorBuffer.initialize<mm_float2>(cc, 1, "scaleFactorBuffer");
+            scaleFactorBuffer.upload(zeros);
+        }
+    }
+    if (!chainForces.isInitialized() || !chainMasses.isInitialized()) {
+        if (useDouble) {
+            std::vector<double> zeros(chainLength,0);
+            if (chainForces.isInitialized()) {
+                chainMasses.resize(chainLength);
+                chainForces.resize(chainLength);
+            }
+            else {
+                chainMasses.initialize<double>(cc, chainLength, "chainMasses");
+                chainForces.initialize<double>(cc, chainLength, "chainForces");
+            }
+            chainMasses.upload(zeros);
+            chainForces.upload(zeros);
+        }
+        else {
+            std::vector<float> zeros(chainLength,0);
+            if (chainForces.isInitialized()) {
+                chainMasses.resize(chainLength);
+                chainForces.resize(chainLength);
+            }
+            else {
+                chainMasses.initialize<float>(cc, chainLength, "chainMasses");
+                chainForces.initialize<float>(cc, chainLength, "chainForces");
+            }
+            chainMasses.upload(zeros);
+            chainForces.upload(zeros);
+        }
+    }
+    if (chainForces.getSize() < chainLength)
+        chainMasses.resize(chainLength);
+    if (chainMasses.getSize() < chainLength)
+        chainMasses.resize(chainLength);
+
+
+    // N.B. We ignore the incoming kineticEnergy and grab it from the device buffer instead
+    if (nAtoms) {
+        if (!chainState.count(2*chainID))
+            chainState[2*chainID] = ComputeArray();
+        if (!chainState.at(2*chainID).isInitialized() || chainState.at(2*chainID).getSize() != chainLength) {
+            // We need to upload the Common array
+            if (useDouble) {
+                if (chainState.at(2*chainID).isInitialized())
+                    chainState.at(2*chainID).resize(chainLength);
+                else
+                    chainState.at(2*chainID).initialize<mm_double2>(cc, chainLength, "chainState" + std::to_string(2*chainID));
+                std::vector<mm_double2> zeros(chainLength, mm_double2(0.0, 0.0));
+                chainState.at(2*chainID).upload(zeros.data());
+            }
+            else {
+                if (chainState.at(2*chainID).isInitialized())
+                    chainState.at(2*chainID).resize(chainLength);
+                else
+                    chainState.at(2*chainID).initialize<mm_float2>(cc, chainLength, "chainState" + std::to_string(2*chainID));
+                std::vector<mm_float2> zeros(chainLength, mm_float2(0.0f, 0.0f));
+                chainState.at(2*chainID).upload(zeros.data());
+            }
+        }
+    }
+
+    if (nPairs) {
+        if (!chainState.count(2*chainID+1))
+            chainState[2*chainID+1] = ComputeArray();
+        if (!chainState.at(2*chainID+1).isInitialized() || chainState.at(2*chainID+1).getSize() != chainLength) {
+            // We need to upload the Common array
+            if (useDouble) {
+                if (chainState.at(2*chainID+1).isInitialized())
+                    chainState.at(2*chainID+1).resize(chainLength);
+                else
+                    chainState.at(2*chainID+1).initialize<mm_double2>(cc, chainLength, "chainState" + std::to_string(2*chainID+1));
+                std::vector<mm_double2> zeros(chainLength, mm_double2(0.0, 0.0));
+                chainState.at(2*chainID+1).upload(zeros.data());
+            }
+            else {
+                if (chainState.at(2*chainID+1).isInitialized())
+                    chainState.at(2*chainID+1).resize(chainLength);
+                else
+                    chainState.at(2*chainID+1).initialize<mm_float2>(cc, chainLength, "chainState" + std::to_string(2*chainID+1));
+                std::vector<mm_float2> zeros(chainLength, mm_float2(0.0f, 0.0f));
+                chainState.at(2*chainID+1).upload(zeros.data());
+            }
+        }
+    }
+
+    if (!hasInitializedPropagateKernel) {
+        hasInitializedPropagateKernel = true;
+        propagateKernels[numYS]->addArg(chainState[2*chainID]);
+        propagateKernels[numYS]->addArg(kineticEnergyBuffer);
+        propagateKernels[numYS]->addArg(scaleFactorBuffer);
+        propagateKernels[numYS]->addArg(chainMasses);
+        propagateKernels[numYS]->addArg(chainForces);
+        propagateKernels[numYS]->addArg(); // ChainType
+        propagateKernels[numYS]->addArg(chainLength);
+        propagateKernels[numYS]->addArg(numMTS);
+        propagateKernels[numYS]->addArg(); // numDoFs
+        propagateKernels[numYS]->addArg((float)timeStep);
+        propagateKernels[numYS]->addArg(); // kT
+        propagateKernels[numYS]->addArg(); // frequency
+    }
+
+    if (nAtoms) {
+        int chainType = 0;
+        double temperature = nhc.getTemperature();
+        float frequency = nhc.getCollisionFrequency();
+        double kT = BOLTZ * temperature;
+        int numDOFs = nhc.getNumDegreesOfFreedom();
+        propagateKernels[numYS]->setArg(5, chainType);
+        propagateKernels[numYS]->setArg(8, numDOFs);
+        propagateKernels[numYS]->setArg(10, useDouble ? kT : (float)kT);
+        propagateKernels[numYS]->setArg(11, frequency);
+        propagateKernels[numYS]->execute(1, 1);
+    }
+    if (nPairs) {
+        int chainType = 1;
+        double relativeTemperature = nhc.getRelativeTemperature();
+        float relativeFrequency = nhc.getRelativeCollisionFrequency();
+        double kT = BOLTZ * relativeTemperature;
+        int ndf = 3*nPairs;
+        propagateKernels[numYS]->setArg(5, chainType);
+        propagateKernels[numYS]->setArg(8, ndf);
+        propagateKernels[numYS]->setArg(10, useDouble ? kT : (float)kT);
+        propagateKernels[numYS]->setArg(11, relativeFrequency);
+        propagateKernels[numYS]->execute(1, 1);
+    }
+    return {0, 0};
+}
+
+double CommonNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc) {
+
+    bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
+
+    int chainID = nhc.getChainID();
+    int chainLength = nhc.getChainLength();
+
+    auto & chainState = cc.getIntegrationUtilities().getNoseHooverChainState();
+
+    bool absChainIsValid = chainState.count(2*chainID) != 0 &&
+                           chainState[2*chainID].isInitialized() &&
+                           chainState[2*chainID].getSize() == chainLength;
+    bool relChainIsValid = chainState.count(2*chainID+1) != 0 &&
+                           chainState[2*chainID+1].isInitialized() &&
+                           chainState[2*chainID+1].getSize() == chainLength;
+
+    if (!absChainIsValid && !relChainIsValid) return 0.0;
+
+    if (!heatBathEnergy.isInitialized() || heatBathEnergy.getSize() == 0) {
+        if (useDouble) {
+            std::vector<double> one(1);
+            heatBathEnergy.initialize<double>(cc, 1, "heatBathEnergy");
+            heatBathEnergy.upload(one);
+        }
+        else {
+            std::vector<float> one(1);
+            heatBathEnergy.initialize<float>(cc, 1, "heatBathEnergy");
+            heatBathEnergy.upload(one);
+        }
+    }
+
+    cc.clearBuffer(heatBathEnergy);
+
+    if(!hasInitializedHeatBathEnergyKernel) {
+        hasInitializedHeatBathEnergyKernel = true;
+        computeHeatBathEnergyKernel->addArg(heatBathEnergy);
+        computeHeatBathEnergyKernel->addArg(chainLength);
+        computeHeatBathEnergyKernel->addArg(); // numDOFs
+        computeHeatBathEnergyKernel->addArg(); // kT
+        computeHeatBathEnergyKernel->addArg(); // frequency
+        computeHeatBathEnergyKernel->addArg(); // chainstate
+    }
+
+    if (absChainIsValid) {
+        int numDOFs = nhc.getNumDegreesOfFreedom();
+        double temperature = nhc.getTemperature();
+        float frequency = nhc.getCollisionFrequency();
+        double kT = BOLTZ * temperature;
+
+        computeHeatBathEnergyKernel->setArg(2, numDOFs);
+        computeHeatBathEnergyKernel->setArg(3, useDouble ? kT : (float)kT);
+        computeHeatBathEnergyKernel->setArg(4, frequency);
+        computeHeatBathEnergyKernel->setArg(5, chainState[2*chainID]);
+        computeHeatBathEnergyKernel->execute(1, 1);
+    }
+    if (relChainIsValid) {
+        int numDOFs = 3 * nhc.getThermostatedPairs().size();
+        double temperature = nhc.getRelativeTemperature();
+        float frequency = nhc.getRelativeCollisionFrequency();
+        double kT = BOLTZ * temperature;
+
+        computeHeatBathEnergyKernel->setArg(2, numDOFs);
+        computeHeatBathEnergyKernel->setArg(3, useDouble ? kT : (float)kT);
+        computeHeatBathEnergyKernel->setArg(4, frequency);
+        computeHeatBathEnergyKernel->setArg(5, chainState[2*chainID+1]);
+        computeHeatBathEnergyKernel->execute(1, 1);
+    }
+
+
+    void * pinnedBuffer = cc.getPinnedBuffer();
+    heatBathEnergy.download(pinnedBuffer);
+    if (useDouble)
+        return *((double*) pinnedBuffer);
+    else
+        return *((float*) pinnedBuffer);
+}
+
+std::pair<double, double> CommonNoseHooverChainKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &nhc, bool downloadValue) {
+
+    bool useDouble = cc.getUseDoublePrecision() || cc.getUseMixedPrecision();
+
+    int chainID = nhc.getChainID();
+    const auto & nhcAtoms = nhc.getThermostatedAtoms();
+    const auto & nhcPairs = nhc.getThermostatedPairs();
+    int nAtoms = nhcAtoms.size();
+    int nPairs = nhcPairs.size();
+    if (nAtoms) {
+        if (!atomlists.count(chainID)) {
+            // We need to upload the Common array
+            atomlists[chainID] = ComputeArray();
+            atomlists[chainID].initialize<int>(cc, nAtoms, "atomlist" + std::to_string(chainID));
+            atomlists[chainID].upload(nhcAtoms);
+        }
+        if (atomlists[chainID].getSize() != nAtoms) {
+            throw OpenMMException("Number of atoms changed. Cannot be handled by the same Nose-Hoover thermostat.");
+        }
+    }
+    if (nPairs) {
+        if (!pairlists.count(chainID)) {
+            // We need to upload the Common array
+            pairlists[chainID] = ComputeArray();
+            pairlists[chainID].initialize<mm_int2>(cc, nPairs, "pairlist" + std::to_string(chainID));
+            std::vector<mm_int2> int2vec;
+            for(const auto &p : nhcPairs) int2vec.push_back(mm_int2(p.first, p.second));
+            pairlists[chainID].upload(int2vec);
+        }
+        if (pairlists[chainID].getSize() != nPairs) {
+            throw OpenMMException("Number of thermostated pairs changed. Cannot be handled by the same Nose-Hoover thermostat.");
+        }
+    }
+    if (!kineticEnergyBuffer.isInitialized() || kineticEnergyBuffer.getSize() == 0) {
+        if (useDouble) {
+            std::vector<mm_double2> zeros{{0,0}};
+            kineticEnergyBuffer.initialize<mm_double2>(cc, 1, "kineticEnergyBuffer");
+            kineticEnergyBuffer.upload(zeros);
+        }
+        else {
+            std::vector<mm_float2> zeros{{0,0}};
+            kineticEnergyBuffer.initialize<mm_float2>(cc, 1, "kineticEnergyBuffer");
+            kineticEnergyBuffer.upload(zeros);
+        }
+    }
+
+    int workGroupSize = std::min(cc.getMaxThreadBlockSize(), 512);
+    if (!hasInitializedKineticEnergyKernel) {
+        hasInitializedKineticEnergyKernel = true;
+        computeAtomsKineticEnergyKernel->addArg(energyBuffer);
+        computeAtomsKineticEnergyKernel->addArg(); // nAtoms/nPairs
+        computeAtomsKineticEnergyKernel->addArg(cc.getVelm());
+        computeAtomsKineticEnergyKernel->addArg(); // pair/atom list
+
+        reduceEnergyKernel->addArg(energyBuffer);
+        reduceEnergyKernel->addArg(kineticEnergyBuffer);
+        reduceEnergyKernel->addArg(energyBuffer.getSize());
+    }
+
+    cc.clearBuffer(energyBuffer);
+    if (nAtoms) {
+        computeAtomsKineticEnergyKernel->setArg(1, nAtoms);
+        computeAtomsKineticEnergyKernel->setArg(3, atomlists[chainID]);
+        computeAtomsKineticEnergyKernel->execute(nAtoms);
+    }
+    if (nPairs) {
+        computePairsKineticEnergyKernel->setArg(1, nPairs);
+        computePairsKineticEnergyKernel->setArg(3, pairlists[chainID]);
+        computePairsKineticEnergyKernel->execute(nPairs);
+    }
+    reduceEnergyKernel->execute(workGroupSize, workGroupSize);
+
+    std::pair<double, double> KEs = {0, 0};
+    if (downloadValue) {
+        if (useDouble) {
+            mm_double2 tmp;
+            kineticEnergyBuffer.download(&tmp);
+            KEs.first = tmp.x;
+            KEs.second = tmp.y;
+        }
+        else {
+            mm_float2 tmp;
+            kineticEnergyBuffer.download(&tmp);
+            KEs.first = tmp.x;
+            KEs.second = tmp.y;
+        }
+    }
+    return KEs;
+}
+
+void CommonNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> scaleFactor) {
+    // For now we assume that the atoms and pairs info is valid, because compute{Atoms|Pairs}KineticEnergy must have been
+    // called before this kernel.  If that ever ceases to be true, some sanity checks are needed here.
+
+    int chainID = nhc.getChainID();
+    int nAtoms = nhc.getThermostatedAtoms().size();
+    int nPairs = nhc.getThermostatedPairs().size();
+    if (!hasInitializedScaleVelocitiesKernel) {
+        hasInitializedScaleVelocitiesKernel = true;
+        scaleAtomsVelocitiesKernel->addArg(scaleFactorBuffer);
+        scaleAtomsVelocitiesKernel->addArg(); // nAtoms/nPairs
+        scaleAtomsVelocitiesKernel->addArg(cc.getVelm());
+        scaleAtomsVelocitiesKernel->addArg(); // atom/pair list
+    }
+    if (nAtoms) {
+        scaleAtomsVelocitiesKernel->setArg(1, nAtoms);
+        scaleAtomsVelocitiesKernel->setArg(3, atomlists[chainID]);
+        scaleAtomsVelocitiesKernel->execute(nAtoms);
+    }
+    if (nPairs) {
+        scalePairsVelocitiesKernel->setArg(1, nPairs);
+        scalePairsVelocitiesKernel->setArg(3, pairlists[chainID]);
+        scalePairsVelocitiesKernel->execute(nPairs);
+    }
+}
+
 void CommonIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) {
    cc.initializeContexts();
    cc.setAsCurrent();

--- a/platforms/common/src/kernels/noseHooverChain.cc
+++ b/platforms/common/src/kernels/noseHooverChain.cc
+KERNEL void propagateNoseHooverChain(GLOBAL mixed2* RESTRICT chainData, GLOBAL const mixed2 * RESTRICT energySum, GLOBAL mixed2* RESTRICT scaleFactor,
+                                     GLOBAL mixed* RESTRICT chainMasses, GLOBAL mixed* RESTRICT chainForces, int chainType, int chainLength, int numMTS,
+                                     int numDOFs, float timeStep, mixed kT, float frequency){
+    const mixed kineticEnergy = chainType == 0 ? energySum[0].x : energySum[0].y;
+    mixed scale = 1;
+    if(kineticEnergy < 1e-8) return;
+    for (int bead = 0; bead < chainLength; ++bead) chainMasses[bead] = kT / (frequency * frequency);
+    chainMasses[0] *= numDOFs;
+    mixed KE2 = 2.0f * kineticEnergy;
+    mixed timeOverMTS = timeStep / numMTS;
+    chainForces[0] = (KE2 - numDOFs * kT) / chainMasses[0];
+    for (int bead = 0; bead < chainLength - 1; ++bead) {
+        chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
+    }
+    for (int mts = 0; mts < numMTS; ++mts) {
+        BEGIN_YS_LOOP
+            mixed wdt = ys * timeOverMTS;
+            chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
+            for (int bead = chainLength - 2; bead >= 0; --bead) {
+                mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y);
+                chainData[bead].y = aa * (chainData[bead].y * aa + 0.25f * wdt * chainForces[bead]);
+            }
+            // update particle velocities
+            mixed aa = EXP(-0.5f * wdt * chainData[0].y);
+            scale *= aa;
+            // update the thermostat positions
+            for (int bead = 0; bead < chainLength; ++bead) {
+                chainData[bead].x += 0.5f * chainData[bead].y * wdt;
+            }
+            // update the forces
+            chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0];
+            // update thermostat velocities
+            for (int bead = 0; bead < chainLength - 1; ++bead) {
+                mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y);
+                chainData[bead].y = aa * (aa * chainData[bead].y + 0.25f * wdt * chainForces[bead]);
+                chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
+            }
+            chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
+        END_YS_LOOP
+    } // MTS loop
+
+    if (chainType == 0) {
+        scaleFactor[0].x = scale;
+    } else {
+        scaleFactor[0].y = scale;
+    }
+}
+
+
+/**
+ * Compute total (potential + kinetic) energy of the Nose-Hoover beads
+ */
+KERNEL void computeHeatBathEnergy(GLOBAL mixed* RESTRICT heatBathEnergy, int chainLength, int numDOFs,
+                                  mixed kT, float frequency, GLOBAL const mixed2* RESTRICT chainData){
+    // Note that this is always incremented; make sure it's zeroed properly before the first call
+    for(int i = 0; i < chainLength; ++i) {
+        mixed prefac = i ? 1 : numDOFs;
+        mixed mass = prefac * kT / (frequency * frequency);
+        mixed velocity = chainData[i].y; 
+        // The kinetic energy of this bead
+        heatBathEnergy[0] += 0.5f * mass * velocity * velocity;
+        // The potential energy of this bead
+        mixed position = chainData[i].x;
+        heatBathEnergy[0] += prefac * kT * position;
+    }
+}
+
+KERNEL void computeAtomsKineticEnergy(GLOBAL mixed2 * RESTRICT energyBuffer, int numAtoms,
+                                      GLOBAL const mixed4* RESTRICT velm, GLOBAL const int *RESTRICT atoms){
+    mixed2 energy = make_mixed2(0,0);
+    int index = GLOBAL_ID;
+    while (index < numAtoms){
+        int atom = atoms[index];
+        mixed4 v = velm[atom];
+        mixed mass = v.w == 0 ? 0 : 1 / v.w;
+        energy.x += 0.5f * mass * (v.x*v.x + v.y*v.y + v.z*v.z);
+        index += GLOBAL_SIZE;
+    }
+    energyBuffer[GLOBAL_ID] = energy;
+}
+
+KERNEL void computePairsKineticEnergy(GLOBAL mixed2 * RESTRICT energyBuffer, int numPairs,
+                                      GLOBAL const mixed4* RESTRICT velm, GLOBAL const int2 *RESTRICT pairs){
+    mixed2 energy = make_mixed2(0,0);
+    int index = GLOBAL_ID;
+    while (index < numPairs){
+        int2 pair = pairs[index];
+        int atom1 = pair.x;
+        int atom2 = pair.y;
+        mixed4 v1 = velm[atom1];
+        mixed4 v2 = velm[atom2];
+        mixed m1 = v1.w == 0 ? 0 : 1 / v1.w;
+        mixed m2 = v2.w == 0 ? 0 : 1 / v2.w;
+        mixed4 cv;
+        cv.x = (m1*v1.x + m2*v2.x) / (m1 + m2);
+        cv.y = (m1*v1.y + m2*v2.y) / (m1 + m2);
+        cv.z = (m1*v1.z + m2*v2.z) / (m1 + m2);
+        mixed4 rv;
+        rv.x = v2.x - v1.x;
+        rv.y = v2.y - v1.y;
+        rv.z = v2.z - v1.z;
+        energy.x += 0.5f * (m1 + m2) * (cv.x*cv.x + cv.y*cv.y + cv.z*cv.z);
+        energy.y += 0.5f * (m1 * m2 / (m1 + m2)) * (rv.x*rv.x + rv.y*rv.y + rv.z*rv.z);
+        index += GLOBAL_SIZE;
+    }
+    // The atoms version of this has been called already, so accumulate instead of assigning here
+    energyBuffer[GLOBAL_ID].x += energy.x;
+    energyBuffer[GLOBAL_ID].y += energy.y;
+}
+
+KERNEL void scaleAtomsVelocities(GLOBAL mixed2* RESTRICT scaleFactor, int numAtoms,
+                                   GLOBAL mixed4* RESTRICT velm, GLOBAL const int *RESTRICT atoms){
+    const mixed scale = scaleFactor[0].x;
+    int index = GLOBAL_ID;
+    while (index < numAtoms){
+        int atom = atoms[index];
+        velm[atom].x *= scale;
+        velm[atom].y *= scale;
+        velm[atom].z *= scale;
+        index += GLOBAL_SIZE;
+    }
+}
+
+KERNEL void scalePairsVelocities(GLOBAL mixed2 * RESTRICT scaleFactor, int numPairs,
+                                 GLOBAL mixed4* RESTRICT velm, GLOBAL const int2 *RESTRICT pairs){
+    int index = GLOBAL_ID;
+    mixed comScale = scaleFactor[0].x;
+    mixed relScale = scaleFactor[0].y;
+    while (index < numPairs){
+        int atom1 = pairs[index].x;
+        int atom2 = pairs[index].y;
+        mixed m1 = velm[atom1].w == 0 ? 0 : 1 / velm[atom1].w;
+        mixed m2 = velm[atom2].w == 0 ? 0 : 1 / velm[atom2].w;
+        mixed4 cv;
+        cv.x = (m1*velm[atom1].x + m2*velm[atom2].x) / (m1 + m2);
+        cv.y = (m1*velm[atom1].y + m2*velm[atom2].y) / (m1 + m2);
+        cv.z = (m1*velm[atom1].z + m2*velm[atom2].z) / (m1 + m2);
+        mixed4 rv;
+        rv.x = velm[atom2].x - velm[atom1].x;
+        rv.y = velm[atom2].y - velm[atom1].y;
+        rv.z = velm[atom2].z - velm[atom1].z;
+        velm[atom1].x = comScale * cv.x - relScale * rv.x * m2 / (m1 + m2);
+        velm[atom1].y = comScale * cv.y - relScale * rv.y * m2 / (m1 + m2);
+        velm[atom1].z = comScale * cv.z - relScale * rv.z * m2 / (m1 + m2);
+        velm[atom2].x = comScale * cv.x + relScale * rv.x * m1 / (m1 + m2);
+        velm[atom2].y = comScale * cv.y + relScale * rv.y * m1 / (m1 + m2);
+        velm[atom2].z = comScale * cv.z + relScale * rv.z * m1 / (m1 + m2);
+        index += GLOBAL_SIZE;
+    }
+}
+
+/**
+ * Sum the energy buffer containing a pair of energies stored as mixed2.  This is taken from the analogous customIntegrator code
+ */
+KERNEL void reduceEnergyPair(GLOBAL const mixed2* RESTRICT sumBuffer, GLOBAL mixed2* result, int bufferSize) {
+    LOCAL mixed2 tempBuffer[WORK_GROUP_SIZE];
+    const unsigned int thread = LOCAL_ID;
+    mixed2 sum = make_mixed2(0,0);
+    for (unsigned int index = thread; index < bufferSize; index += LOCAL_SIZE) {
+        sum.x += sumBuffer[index].x;
+        sum.y += sumBuffer[index].y;
+    }
+    tempBuffer[thread].x = sum.x;
+    tempBuffer[thread].y = sum.y;
+    for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
+        SYNC_THREADS;
+        if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE) {
+            tempBuffer[thread].x += tempBuffer[thread+i].x;
+            tempBuffer[thread].y += tempBuffer[thread+i].y;
+        }
+    }
+    if (thread == 0)
+        *result = tempBuffer[0];
+}
--- a/platforms/common/src/kernels/velocityVerlet.cc
+++ b/platforms/common/src/kernels/velocityVerlet.cc
+/**
+ * Perform the first step of Velocity Verlet integration.
+ */
+
+KERNEL void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedNumAtoms, GLOBAL const mixed2* RESTRICT dt, GLOBAL const real4* RESTRICT posq,
+                                         GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL mixed4* RESTRICT posDelta,
+                                         GLOBAL const int* RESTRICT atomList, GLOBAL const int2* RESTRICT pairList
+#ifdef USE_MIXED_PRECISION
+                                        ,GLOBAL const real4* RESTRICT posqCorrection
+#endif
+    ){
+    const mixed2 stepSize = dt[0];
+    const mixed dtPos = stepSize.y;
+    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
+    const mixed scale = 0.5f * dtVel/(mixed) 0x100000000;
+    int index = GLOBAL_ID;
+    while (index < numAtoms) {
+        int atom = atomList[index];
+        mixed4 velocity = velm[atom];
+        if (velocity.w != 0.0) {
+#ifdef USE_MIXED_PRECISION
+            real4 pos1 = posq[atom];
+            real4 pos2 = posqCorrection[atom];
+            mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
+            real4 pos = posq[atom];
+#endif
+            velocity.x += scale*force[atom]*velocity.w;
+            velocity.y += scale*force[atom+paddedNumAtoms]*velocity.w;
+            velocity.z += scale*force[atom+paddedNumAtoms*2]*velocity.w;
+            pos.x = velocity.x*dtPos;
+            pos.y = velocity.y*dtPos;
+            pos.z = velocity.z*dtPos;
+            posDelta[atom] = pos;
+            velm[atom] = velocity;
+        }
+        index += GLOBAL_SIZE;
+    }
+    index = GLOBAL_ID;
+    while (index < numPairs){
+        int atom1 = pairList[index].x;
+        int atom2 = pairList[index].y;
+        mixed4 v1 = velm[atom1];
+        mixed4 v2 = velm[atom2];
+        mixed m1 = v1.w == 0.0f ? 0.0f : 1.0f / v1.w;
+        mixed m2 = v2.w == 0.0f ? 0.0f : 1.0f / v2.w;
+        mixed mass1fract = m1 / (m1 + m2);
+        mixed mass2fract = m2 / (m1 + m2);
+        mixed invRedMass = (m1 * m2 != 0.0f) ? (m1 + m2)/(m1 * m2) : 0.0f;
+        mixed invTotMass = (m1 + m2 != 0.0f) ? 1.0f /(m1 + m2) : 0.0f;
+        mixed3 comVel;
+        comVel.x= v1.x*mass1fract + v2.x*mass2fract;
+        comVel.y= v1.y*mass1fract + v2.y*mass2fract;
+        comVel.z= v1.z*mass1fract + v2.z*mass2fract;
+        mixed3 relVel;
+        relVel.x= v2.x - v1.x;
+        relVel.y= v2.y - v1.y;
+        relVel.z= v2.z - v1.z;
+
+        mixed3 comFrc;
+        mixed F1x = scale*force[atom1];
+        mixed F1y = scale*force[atom1+paddedNumAtoms];
+        mixed F1z = scale*force[atom1+paddedNumAtoms*2];
+        mixed F2x = scale*force[atom2];
+        mixed F2y = scale*force[atom2+paddedNumAtoms];
+        mixed F2z = scale*force[atom2+paddedNumAtoms*2];
+        comFrc.x = F1x + F2x;
+        comFrc.y = F1y + F2y;
+        comFrc.z = F1z + F2z;
+        mixed3 relFrc;
+        relFrc.x = mass1fract*F2x - mass2fract*F1x;
+        relFrc.y = mass1fract*F2y - mass2fract*F1y;
+        relFrc.z = mass1fract*F2z - mass2fract*F1z;
+        comVel.x += comFrc.x * invTotMass;
+        comVel.y += comFrc.y * invTotMass;
+        comVel.z += comFrc.z * invTotMass;
+        relVel.x += relFrc.x * invRedMass;
+        relVel.y += relFrc.y * invRedMass;
+        relVel.z += relFrc.z * invRedMass;
+#ifdef USE_MIXED_PRECISION
+        real4 posv1 = posq[atom1];
+        real4 posv2 = posq[atom2];
+        real4 posc1 = posqCorrection[atom1];
+        real4 posc2 = posqCorrection[atom2];
+        mixed4 pos1 = make_mixed4(posv1.x+(mixed)posc1.x, posv1.y+(mixed)posc1.y, posv1.z+(mixed)posc1.z, posv1.w);
+        mixed4 pos2 = make_mixed4(posv2.x+(mixed)posc2.x, posv2.y+(mixed)posc2.y, posv2.z+(mixed)posc2.z, posv2.w);
+#else
+        real4 pos1 = posq[atom1];
+        real4 pos2 = posq[atom2];
+#endif
+        if (v1.w != 0.0f) {
+            v1.x = comVel.x - relVel.x*mass2fract;
+            v1.y = comVel.y - relVel.y*mass2fract;
+            v1.z = comVel.z - relVel.z*mass2fract;
+            pos1.x = v1.x*dtPos;
+            pos1.y = v1.y*dtPos;
+            pos1.z = v1.z*dtPos;
+            posDelta[atom1] = pos1;
+            velm[atom1] = v1;
+        }
+        if (v2.w != 0.0f) {
+            v2.x = comVel.x + relVel.x*mass1fract;
+            v2.y = comVel.y + relVel.y*mass1fract;
+            v2.z = comVel.z + relVel.z*mass1fract;
+            pos2.x = v2.x*dtPos;
+            pos2.y = v2.y*dtPos;
+            pos2.z = v2.z*dtPos;
+            posDelta[atom2] = pos2;
+            velm[atom2] = v2;
+        }
+        index += GLOBAL_SIZE;
+     }
+}
+
+/**
+ * Perform the second step of Velocity Verlet integration.
+ */
+
+KERNEL void integrateVelocityVerletPart2(int numAtoms, GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq, GLOBAL mixed4* RESTRICT velm,
+                                         GLOBAL const mixed4* RESTRICT posDelta
+#ifdef USE_MIXED_PRECISION
+                                        ,GLOBAL real4* RESTRICT posqCorrection
+#endif
+    ){
+    mixed2 stepSize = dt[0];
+    int index = GLOBAL_ID;
+    if (index == 0)
+        dt[0].x = stepSize.y;
+    while(index < numAtoms) {
+        mixed4 velocity = velm[index];
+        if (velocity.w != 0.0) {
+#ifdef USE_MIXED_PRECISION
+            real4 pos1 = posq[index];
+            real4 pos2 = posqCorrection[index];
+            mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
+            real4 pos = posq[index];
+#endif
+            mixed4 delta = posDelta[index];
+            pos.x += delta.x;
+            pos.y += delta.y;
+            pos.z += delta.z;
+#ifdef USE_MIXED_PRECISION
+            posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
+            posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
+            posq[index] = pos;
+#endif
+        }
+        index += GLOBAL_SIZE;
+    }
+}
+
+/**
+ * Perform the third step of Velocity Verlet integration.
+ */
+
+KERNEL void integrateVelocityVerletPart3(int numAtoms, int numPairs, int paddedNumAtoms, GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq,
+                                         GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL const mixed4* RESTRICT posDelta,
+                                         GLOBAL const int* RESTRICT atomList, GLOBAL const int2* RESTRICT pairList
+#ifdef USE_MIXED_PRECISION
+                                        ,GLOBAL const real4* RESTRICT posqCorrection
+#endif
+    ){
+    mixed2 stepSize = dt[0];
+#ifndef SUPPORTS_DOUBLE_PRECISION
+    double oneOverDt = 1.0/stepSize.y;
+#else
+    float oneOverDt = 1.0f/stepSize.y;
+    float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
+#endif
+    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
+    const mixed scale = 0.5f*dtVel/(mixed) 0x100000000;
+    int index = GLOBAL_ID;
+    if (index == 0)
+        dt[0].x = stepSize.y;
+    while(index < numAtoms) {
+        int atom = atomList[index];
+        mixed4 velocity = velm[atom];
+        if (velocity.w != 0.0) {
+            mixed4 deltaXconstrained = posDelta[atom];
+            velocity.x += scale*force[atom]*velocity.w + (deltaXconstrained.x - velocity.x*stepSize.y)*oneOverDt;
+            velocity.y += scale*force[atom+paddedNumAtoms]*velocity.w + (deltaXconstrained.y - velocity.y*stepSize.y)*oneOverDt;
+            velocity.z += scale*force[atom+paddedNumAtoms*2]*velocity.w + (deltaXconstrained.z - velocity.z*stepSize.y)*oneOverDt;
+#ifdef SUPPORTS_DOUBLE_PRECISION
+            velocity.x += (deltaXconstrained.x - velocity.x*stepSize.y)*correction;
+            velocity.y += (deltaXconstrained.y - velocity.y*stepSize.y)*correction;
+            velocity.z += (deltaXconstrained.z - velocity.z*stepSize.y)*correction;
+#endif
+            velm[atom] = velocity;
+        }
+        index += GLOBAL_SIZE;
+    }
+    index = GLOBAL_ID;
+    while(index < numPairs) {
+        int atom1 = pairList[index].x;
+        int atom2 = pairList[index].y;
+        mixed4 v1 = velm[atom1];
+        mixed4 v2 = velm[atom2];
+        mixed m1 = v1.w == 0.0f ? 0.0f : 1.0f / v1.w;
+        mixed m2 = v2.w == 0.0f ? 0.0f : 1.0f / v2.w;
+        mixed mass1fract = m1 / (m1 + m2);
+        mixed mass2fract = m2 / (m1 + m2);
+        mixed invRedMass = (m1 * m2 != 0.0f) ? (m1 + m2)/(m1 * m2) : 0.0f;
+        mixed invTotMass = (m1 + m2 != 0.0f) ? 1.0f /(m1 + m2) : 0.0f;
+        mixed3 comVel;
+        comVel.x= v1.x*mass1fract + v2.x*mass2fract;
+        comVel.y= v1.y*mass1fract + v2.y*mass2fract;
+        comVel.z= v1.z*mass1fract + v2.z*mass2fract;
+        mixed3 relVel;
+        relVel.x= v2.x - v1.x;
+        relVel.y= v2.y - v1.y;
+        relVel.z= v2.z - v1.z;
+
+        mixed3 comFrc;
+        mixed F1x = scale*force[atom1];
+        mixed F1y = scale*force[atom1+paddedNumAtoms];
+        mixed F1z = scale*force[atom1+paddedNumAtoms*2];
+        mixed F2x = scale*force[atom2];
+        mixed F2y = scale*force[atom2+paddedNumAtoms];
+        mixed F2z = scale*force[atom2+paddedNumAtoms*2];
+        comFrc.x = F1x + F2x;
+        comFrc.y = F1y + F2y;
+        comFrc.z = F1z + F2z;
+        mixed3 relFrc;
+        relFrc.x = mass1fract*F2x - mass2fract*F1x;
+        relFrc.y = mass1fract*F2y - mass2fract*F1y;
+        relFrc.z = mass1fract*F2z - mass2fract*F1z;
+        comVel.x += comFrc.x * invTotMass;
+        comVel.y += comFrc.y * invTotMass;
+        comVel.z += comFrc.z * invTotMass;
+        relVel.x += relFrc.x * invRedMass;
+        relVel.y += relFrc.y * invRedMass;
+        relVel.z += relFrc.z * invRedMass;
+        if (v1.w != 0.0f) {
+            mixed4 deltaXconstrained = posDelta[atom1];
+            v1.x = comVel.x - relVel.x*mass2fract + (deltaXconstrained.x - v1.x*stepSize.y)*oneOverDt;
+            v1.y = comVel.y - relVel.y*mass2fract + (deltaXconstrained.y - v1.y*stepSize.y)*oneOverDt;
+            v1.z = comVel.z - relVel.z*mass2fract + (deltaXconstrained.z - v1.z*stepSize.y)*oneOverDt;
+#ifdef SUPPORTS_DOUBLE_PRECISION
+            v1.x += (deltaXconstrained.x - v1.x*stepSize.y)*correction;
+            v1.y += (deltaXconstrained.y - v1.y*stepSize.y)*correction;
+            v1.z += (deltaXconstrained.z - v1.z*stepSize.y)*correction;
+#endif
+            velm[atom1] = v1;
+        }
+        if (v2.w != 0.0f) {
+            mixed4 deltaXconstrained = posDelta[atom2];
+            v2.x = comVel.x + relVel.x*mass1fract + (deltaXconstrained.x - v2.x*stepSize.y)*oneOverDt;
+            v2.y = comVel.y + relVel.y*mass1fract + (deltaXconstrained.y - v2.y*stepSize.y)*oneOverDt;
+            v2.z = comVel.z + relVel.z*mass1fract + (deltaXconstrained.z - v2.z*stepSize.y)*oneOverDt;
+#ifdef SUPPORTS_DOUBLE_PRECISION
+            v2.x += (deltaXconstrained.x - v2.x*stepSize.y)*correction;
+            v2.y += (deltaXconstrained.y - v2.y*stepSize.y)*correction;
+            v2.z += (deltaXconstrained.z - v2.z*stepSize.y)*correction;
+#endif
+            velm[atom2] = v2;
+        }
+        index += GLOBAL_SIZE;
+    }
+}
+
+KERNEL void integrateVelocityVerletHardWall(int numPairs, GLOBAL const float* RESTRICT maxPairDistance, 
+                                            GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq,
+                                            GLOBAL mixed4* RESTRICT velm, GLOBAL const int2* RESTRICT pairList,
+                                            GLOBAL const float* RESTRICT pairTemperature
+#ifdef USE_MIXED_PRECISION
+                                           ,GLOBAL real4* RESTRICT posqCorrection
+#endif
+    ){
+
+    mixed dtPos = dt[0].y;
+    mixed maxDelta = (mixed) maxPairDistance[0];
+    if (maxDelta > 0){
+        int index = GLOBAL_ID;
+        while(index < numPairs) {
+
+            const mixed hardWallScale = sqrt( ((mixed) pairTemperature[index]) * ((mixed) BOLTZ));
+            int atom1 = pairList[index].x;
+            int atom2 = pairList[index].y;
+#ifdef USE_MIXED_PRECISION
+            real4 posv1 = posq[atom1];
+            real4 posc1 = posqCorrection[atom1];
+            mixed4 pos1 = make_mixed4(posv1.x+(mixed)posc1.x, posv1.y+(mixed)posc1.y, posv1.z+(mixed)posc1.z, posv1.w);
+            real4 posv2 = posq[atom2];
+            real4 posc2 = posqCorrection[atom2];
+            mixed4 pos2 = make_mixed4(posv2.x+(mixed)posc2.x, posv2.y+(mixed)posc2.y, posv2.z+(mixed)posc2.z, posv2.w);
+#else
+            real4 pos1 = posq[atom1];
+            real4 pos2 = posq[atom2];
+#endif
+            mixed3 delta = make_mixed3(pos1.x - pos2.x, pos1.y - pos2.y, pos1.z - pos2.z);
+            mixed r = sqrt(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
+            mixed rInv = 1/r;
+            if (rInv*maxDelta < 1.0) {
+                // The constraint has been violated, so make the inter-particle distance "bounce"
+                // off the hard wall.
+                mixed3 bondDir = make_mixed3(delta.x * rInv, delta.y * rInv, delta.z * rInv);
+                mixed3 vel1 = make_mixed3(velm[atom1].x, velm[atom1].y, velm[atom1].z);
+                mixed3 vel2 = make_mixed3(velm[atom2].x, velm[atom2].y, velm[atom2].z);
+                mixed m1 = velm[atom1].w != 0.0 ? 1.0/velm[atom1].w : 0.0;
+                mixed m2 = velm[atom2].w != 0.0 ? 1.0/velm[atom2].w : 0.0;
+                mixed invTotMass = (m1 + m2 != 0.0) ? 1.0 /(m1 + m2) : 0.0;
+                mixed deltaR = r-maxDelta;
+                mixed deltaT = dtPos;
+                mixed dt = dtPos;
+
+                mixed dotvr1 = vel1.x*bondDir.x + vel1.y*bondDir.y + vel1.z*bondDir.z;
+                mixed3 vb1 = make_mixed3(bondDir.x*dotvr1, bondDir.y*dotvr1, bondDir.z*dotvr1);
+                mixed3 vp1 = make_mixed3(vel1.x-vb1.x, vel1.y-vb1.y, vel1.z-vb1.z);
+                if (m2 == 0) {
+                    // The parent particle is massless, so move only the Drude particle.
+
+                    if (dotvr1 != 0.0)
+                        deltaT = deltaR/fabs(dotvr1);
+                    if (deltaT > dtPos)
+                        deltaT = dtPos;
+                    dotvr1 = -dotvr1*hardWallScale/(fabs(dotvr1)*sqrt(m1));
+                    mixed dr = -deltaR + deltaT*dotvr1;
+                    pos1.x += bondDir.x*dr;
+                    pos1.y += bondDir.y*dr;
+                    pos1.z += bondDir.z*dr;
+                    velm[atom1] = make_mixed4(vp1.x + bondDir.x*dotvr1, vp1.y + bondDir.y*dotvr1, vp1.z + bondDir.z*dotvr1, velm[atom1].w);
+#ifdef USE_MIXED_PRECISION
+                    posq[atom1] = make_real4((real) pos1.x, (real) pos1.y, (real) pos1.z, (real) pos1.w);
+                    posqCorrection[atom1] = make_real4(pos1.x-(real) pos1.x, pos1.y-(real) pos1.y, pos1.z-(real) pos1.z, 0);
+#else
+                    posq[atom1] = pos1;
+#endif
+                }
+                else {
+                    // Move both particles.
+                    mixed dotvr2 = vel2.x*bondDir.x + vel2.y*bondDir.y + vel2.z*bondDir.z;
+                    mixed3 vb2 = make_mixed3(bondDir.x*dotvr2, bondDir.y*dotvr2, bondDir.z*dotvr2);
+                    mixed3 vp2 = make_mixed3(vel2.x-vb2.x, vel2.y-vb2.y, vel2.z-vb2.z);
+                    mixed vbCMass = (m1*dotvr1 + m2*dotvr2)*invTotMass;
+                    dotvr1 -= vbCMass;
+                    dotvr2 -= vbCMass;
+                    if (dotvr1 != dotvr2)
+                        deltaT = deltaR/fabs(dotvr1-dotvr2);
+                    if (deltaT > dt)
+                        deltaT = dt;
+                    mixed vBond = hardWallScale/sqrt(m1);
+                    dotvr1 = -dotvr1*vBond*m2*invTotMass/fabs(dotvr1);
+                    dotvr2 = -dotvr2*vBond*m1*invTotMass/fabs(dotvr2);
+                    mixed dr1 = -deltaR*m2*invTotMass + deltaT*dotvr1;
+                    mixed dr2 = deltaR*m1*invTotMass + deltaT*dotvr2;
+                    dotvr1 += vbCMass;
+                    dotvr2 += vbCMass;
+                    pos1.x += bondDir.x*dr1;
+                    pos1.y += bondDir.y*dr1;
+                    pos1.z += bondDir.z*dr1;
+                    pos2.x += bondDir.x*dr2;
+                    pos2.y += bondDir.y*dr2;
+                    pos2.z += bondDir.z*dr2;
+                    velm[atom1] = make_mixed4(vp1.x + bondDir.x*dotvr1, vp1.y + bondDir.y*dotvr1, vp1.z + bondDir.z*dotvr1, velm[atom1].w);
+                    velm[atom2] = make_mixed4(vp2.x + bondDir.x*dotvr2, vp2.y + bondDir.y*dotvr2, vp2.z + bondDir.z*dotvr2, velm[atom2].w);
+#ifdef USE_MIXED_PRECISION
+                    posq[atom1] = make_real4((real) pos1.x, (real) pos1.y, (real) pos1.z, (real) pos1.w);
+                    posq[atom2] = make_real4((real) pos2.x, (real) pos2.y, (real) pos2.z, (real) pos2.w);
+                    posqCorrection[atom1] = make_real4(pos1.x-(real) pos1.x, pos1.y-(real) pos1.y, pos1.z-(real) pos1.z, 0);
+                    posqCorrection[atom2] = make_real4(pos2.x-(real) pos2.x, pos2.y-(real) pos2.y, pos2.z-(real) pos2.z, 0);
+#else
+                    posq[atom1] = pos1;
+                    posq[atom2] = pos2;
+#endif
+                }
+            }
+            index += GLOBAL_SIZE;
+        }
+    }
+}
+
--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -134,9 +134,9 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
    if (name == ApplyAndersenThermostatKernel::Name())
        return new CommonApplyAndersenThermostatKernel(name, platform, cu);
    if (name == NoseHooverChainKernel::Name())
-        return new CudaNoseHooverChainKernel(name, platform, cu);
+        return new CommonNoseHooverChainKernel(name, platform, cu);
    if (name == IntegrateVelocityVerletStepKernel::Name())
-        return new CudaIntegrateVelocityVerletStepKernel(name, platform, cu);
+        return new CommonIntegrateVelocityVerletStepKernel(name, platform, cu);
    if (name == ApplyMonteCarloBarostatKernel::Name())
        return new CudaApplyMonteCarloBarostatKernel(name, platform, cu);
    if (name == RemoveCMMotionKernel::Name())

--- a/platforms/opencl/src/OpenCLKernelFactory.cpp
+++ b/platforms/opencl/src/OpenCLKernelFactory.cpp
@@ -132,9 +132,9 @@ KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platfo
    if (name == ApplyAndersenThermostatKernel::Name())
        return new CommonApplyAndersenThermostatKernel(name, platform, cl);
    if (name == NoseHooverChainKernel::Name())
-        return new OpenCLNoseHooverChainKernel(name, platform, cl);
+        return new CommonNoseHooverChainKernel(name, platform, cl);
    if (name == IntegrateVelocityVerletStepKernel::Name())
-        return new OpenCLIntegrateVelocityVerletStepKernel(name, platform, cl);
+        return new CommonIntegrateVelocityVerletStepKernel(name, platform, cl);
    if (name == ApplyMonteCarloBarostatKernel::Name())
        return new OpenCLApplyMonteCarloBarostatKernel(name, platform, cl);
    if (name == RemoveCMMotionKernel::Name())