Merge pull request #2527 from andysim/nhcommon

Convert Nose Hoover integrator to Common platform

Merge pull request #2527 from andysim/nhcommon
Convert Nose Hoover integrator to Common platform
ca4c03c3 · peastman · GitHub · d169badc · 2270a28a · ca4c03c3
Unverified Commit ca4c03c3 authored Mar 04, 2020 by peastman Committed by GitHub Mar 04, 2020
14 changed files
--- a/openmmapi/include/openmm/NoseHooverIntegrator.h
+++ b/openmmapi/include/openmm/NoseHooverIntegrator.h
@@ -264,6 +264,10 @@ protected:
     * Compute the kinetic energy of the system at the current time.
     */
    virtual double computeKineticEnergy();
+    /**
+     * Computing kinetic energy for this integrator does not require forces.
+     */
+    bool kineticEnergyRequiresForce() const override;
    std::vector<NoseHooverChain> noseHooverChains;
    std::vector<int> allAtoms;

--- a/openmmapi/src/NoseHooverIntegrator.cpp
+++ b/openmmapi/src/NoseHooverIntegrator.cpp
@@ -267,6 +267,7 @@ void NoseHooverIntegrator::setRelativeCollisionFrequency(double frequency, int c
 }
 double NoseHooverIntegrator::computeKineticEnergy() {
+    forcesAreValid = false;
    double kE = 0.0;
    if(noseHooverChains.size() > 0) {
        for (const auto &nhc: noseHooverChains){
@@ -278,6 +279,10 @@ double NoseHooverIntegrator::computeKineticEnergy() {
    return kE;
 }
+bool NoseHooverIntegrator::kineticEnergyRequiresForce() const {
+    return false;
+}
 double NoseHooverIntegrator::computeHeatBathEnergy() {
    double energy = 0;
    for(auto &nhc : noseHooverChains) {
@@ -340,7 +345,8 @@ void NoseHooverIntegrator::step(int steps) {
        throw OpenMMException("This Integrator is not bound to a context!");
    std::pair<double, double> scale, kineticEnergy;
    for (int i = 0; i < steps; ++i) {
-        context->updateContextState();
+        if(context->updateContextState())
+            forcesAreValid = false;
        for(auto &nhc : noseHooverChains) {
            kineticEnergy = nhcKernel.getAs<NoseHooverChainKernel>().computeMaskedKineticEnergy(*context, nhc, false);
            scale = nhcKernel.getAs<NoseHooverChainKernel>().propagateChain(*context, nhc, kineticEnergy, getStepSize());

--- a/platforms/common/include/openmm/common/CommonKernels.h
+++ b/platforms/common/include/openmm/common/CommonKernels.h
@@ -941,6 +941,116 @@ private:
    ComputeKernel kernel1, kernel2, kernel3;
 };
+/**
+ * This kernel is invoked by NoseHooverIntegrator to take one time step.
+ */
+class CommonIntegrateVelocityVerletStepKernel : public IntegrateVelocityVerletStepKernel {
+public:
+    CommonIntegrateVelocityVerletStepKernel(std::string name, const Platform& platform, ComputeContext& cc) :
+                                  IntegrateVelocityVerletStepKernel(name, platform), cc(cc), hasInitializedKernels(false) { }
+    ~CommonIntegrateVelocityVerletStepKernel() {}
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param integrator the NoseHooverIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const NoseHooverIntegrator& integrator);
+    /**
+     * Execute the kernel.
+     *
+     * @param context    the context in which to execute this kernel
+     * @param integrator the NoseHooverIntegrator this kernel is being used for
+     * @param forcesAreValid a reference to the parent integrator's boolean for keeping
+     *                       track of the validity of the current forces.
+     */
+    void execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid);
+    /**
+     * Compute the kinetic energy.
+     *
+     * @param context    the context in which to execute this kernel
+     * @param integrator the NoseHooverIntegrator this kernel is being used for
+     */
+    double computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator);
+private:
+    ComputeContext& cc;
+    float prevMaxPairDistance;
+    ComputeArray maxPairDistanceBuffer, pairListBuffer, atomListBuffer, pairTemperatureBuffer;
+    ComputeKernel kernel1, kernel2, kernel3, kernelHardWall;
+    bool hasInitializedKernels;
+};
+/**
+ * This kernel is invoked by NoseHooverChain at the start of each time step to adjust the thermostat
+ * and update the associated particle velocities.
+ */
+class CommonNoseHooverChainKernel : public NoseHooverChainKernel {
+public:
+    CommonNoseHooverChainKernel(std::string name, const Platform& platform, ComputeContext& cc) :
+                 NoseHooverChainKernel(name, platform), cc(cc), hasInitializedPropagateKernel(false),
+                 hasInitializedKineticEnergyKernel(false), hasInitializedHeatBathEnergyKernel(false),
+                 hasInitializedScaleVelocitiesKernel(false) {}
+    ~CommonNoseHooverChainKernel() {}
+    /**
+     * Initialize the kernel.
+     */
+    void initialize();
+    /**
+     * Execute the kernel that propagates the Nose Hoover chain and determines the velocity scale factor.
+     *
+     * @param context  the context in which to execute this kernel
+     * @param nhc the object describing the chain to be propagated.
+     * @param kineticEnergies the {absolute, relative} kineticEnergy of the particles being thermostated by this chain.
+     * @param timeStep the time step used by the integrator.
+     * @return the {absolute, relative} velocity scale factor to apply to the particles associated with this heat bath.
+     */
+    std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep);
+    /**
+     * Execute the kernel that computes the total (kinetic + potential) heat bath energy.
+     *
+     * @param context the context in which to execute this kernel
+     * @param nhc the chain whose energy is to be determined.
+     * @return the total heat bath energy.
+     */
+    double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc);
+    /**
+     * Execute the kernel that computes the kinetic energy for a subset of atoms,
+     * or the relative kinetic energy of Drude particles with respect to their parent atoms
+     *
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose energy is to be determined.
+     * @param downloadValue whether the computed value should be downloaded and returned.
+     *
+     */
+    std::pair<double,double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);
+    /**
+     * Execute the kernel that scales the velocities of particles associated with a nose hoover chain
+     *
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose associated particle velocities are to be scaled.
+     * @param scaleFactors the {absolute, relative} multiplicative factor by which velocities are scaled.
+     */
+    void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactors);
+private:
+    int sumWorkGroupSize;
+    ComputeContext& cc;
+    ComputeArray energyBuffer, scaleFactorBuffer, kineticEnergyBuffer, chainMasses, chainForces, heatBathEnergy;
+    std::map<int, ComputeArray> atomlists, pairlists;
+    std::map<int, ComputeKernel> propagateKernels;
+    bool hasInitializedPropagateKernel;
+    bool hasInitializedKineticEnergyKernel;
+    bool hasInitializedHeatBathEnergyKernel;
+    bool hasInitializedScaleVelocitiesKernel;
+    ComputeKernel reduceEnergyKernel;
+    ComputeKernel computeHeatBathEnergyKernel;
+    ComputeKernel computeAtomsKineticEnergyKernel;
+    ComputeKernel computePairsKineticEnergyKernel;
+    ComputeKernel scaleAtomsVelocitiesKernel;
+    ComputeKernel scalePairsVelocitiesKernel;
+};
 /**
 * This kernel is invoked by BrownianIntegrator to take one time step.
 */

--- a/platforms/common/src/CommonKernels.cpp
+++ b/platforms/common/src/CommonKernels.cpp
--- a/platforms/opencl/src/kernels/noseHooverChain.cl
+++ b/platforms/opencl/src/kernels/noseHooverChain.cl
+KERNEL void propagateNoseHooverChain(GLOBAL mixed2* RESTRICT chainData, GLOBAL const mixed2 * RESTRICT energySum, GLOBAL mixed2* RESTRICT scaleFactor,
-//#include <initializer_list>
+                                     GLOBAL mixed* RESTRICT chainMasses, GLOBAL mixed* RESTRICT chainForces, int chainType, int chainLength, int numMTS,
+                                     int numDOFs, float timeStep, mixed kT, float frequency){
-__kernel void propagateNoseHooverChain(__global mixed2* restrict chainData, __global const mixed2 * restrict energySum, __global mixed2* restrict scaleFactor,
-                                                    __global mixed* restrict chainMasses, __global mixed* restrict chainForces, 
-                                                    int chainType, int chainLength, int numMTS, int numDOFs, float timeStep,
-                                                    mixed kT, float frequency){
    const mixed kineticEnergy = chainType == 0 ? energySum[0].x : energySum[0].y;
    mixed scale = 1;
    if(kineticEnergy < 1e-8) return;
@@ -54,10 +50,9 @@ __kernel void propagateNoseHooverChain(__global mixed2* restrict chainData, __gl
 /**
 * Compute total (potential + kinetic) energy of the Nose-Hoover beads
 */
-__kernel void computeHeatBathEnergy(__global mixed* restrict heatBathEnergy, int chainLength, int numDOFs,
+KERNEL void computeHeatBathEnergy(GLOBAL mixed* RESTRICT heatBathEnergy, int chainLength, int numDOFs,
-                                                 mixed kT, float frequency, __global const mixed2* restrict chainData){
+                                  mixed kT, float frequency, GLOBAL const mixed2* RESTRICT chainData){
    // Note that this is always incremented; make sure it's zeroed properly before the first call
    for(int i = 0; i < chainLength; ++i) {
        mixed prefac = i ? 1 : numDOFs;
        mixed mass = prefac * kT / (frequency * frequency);
@@ -70,25 +65,24 @@ __kernel void computeHeatBathEnergy(__global mixed* restrict heatBathEnergy, int
    }
 }
-__kernel void computeAtomsKineticEnergy(__global mixed2 * restrict energyBuffer, int numAtoms,
+KERNEL void computeAtomsKineticEnergy(GLOBAL mixed2 * RESTRICT energyBuffer, int numAtoms,
-                                        __global const mixed4* restrict velm, __global const int *restrict atoms){
+                                      GLOBAL const mixed4* RESTRICT velm, GLOBAL const int *RESTRICT atoms){
-    mixed2 energy = (mixed2) (0,0);
+    mixed2 energy = make_mixed2(0,0);
-    //energy = 1; return;
+    int index = GLOBAL_ID;
-    int index = get_global_id(0);
    while (index < numAtoms){
        int atom = atoms[index];
        mixed4 v = velm[atom];
        mixed mass = v.w == 0 ? 0 : 1 / v.w;
        energy.x += 0.5f * mass * (v.x*v.x + v.y*v.y + v.z*v.z);
-        index += get_global_size(0);
+        index += GLOBAL_SIZE;
    }
-    energyBuffer[get_global_id(0)] = energy;
+    energyBuffer[GLOBAL_ID] = energy;
 }
-__kernel void computePairsKineticEnergy(__global mixed2 * restrict energyBuffer, int numPairs,
+KERNEL void computePairsKineticEnergy(GLOBAL mixed2 * RESTRICT energyBuffer, int numPairs,
-                                        __global const mixed4* restrict velm, __global const int2 *restrict pairs){
+                                      GLOBAL const mixed4* RESTRICT velm, GLOBAL const int2 *RESTRICT pairs){
-    mixed2 energy = (mixed2) (0,0);
+    mixed2 energy = make_mixed2(0,0);
-    int index = get_global_id(0);
+    int index = GLOBAL_ID;
    while (index < numPairs){
        int2 pair = pairs[index];
        int atom1 = pair.x;
@@ -107,64 +101,74 @@ __kernel void computePairsKineticEnergy(__global mixed2 * restrict energyBuffer,
        rv.z = v2.z - v1.z;
        energy.x += 0.5f * (m1 + m2) * (cv.x*cv.x + cv.y*cv.y + cv.z*cv.z);
        energy.y += 0.5f * (m1 * m2 / (m1 + m2)) * (rv.x*rv.x + rv.y*rv.y + rv.z*rv.z);
-        index += get_global_size(0);
+        index += GLOBAL_SIZE;
    }
    // The atoms version of this has been called already, so accumulate instead of assigning here
-    energyBuffer[get_global_id(0)].xy += energy.xy;
+    energyBuffer[GLOBAL_ID].x += energy.x;
+    energyBuffer[GLOBAL_ID].y += energy.y;
 }
-__kernel void scaleAtomsVelocities(__global mixed2* restrict scaleFactor, int numAtoms,
+KERNEL void scaleAtomsVelocities(GLOBAL mixed2* RESTRICT scaleFactor, int numAtoms,
-                                   __global mixed4* restrict velm, __global const int *restrict atoms){
+                                   GLOBAL mixed4* RESTRICT velm, GLOBAL const int *RESTRICT atoms){
    const mixed scale = scaleFactor[0].x;
-    int index = get_global_id(0);
+    int index = GLOBAL_ID;
    while (index < numAtoms){
        int atom = atoms[index];
        velm[atom].x *= scale;
        velm[atom].y *= scale;
        velm[atom].z *= scale;
-        index += get_global_size(0);
+        index += GLOBAL_SIZE;
    }
 }
-__kernel void scalePairsVelocities(__global mixed2 * restrict scaleFactor, int numPairs,
+KERNEL void scalePairsVelocities(GLOBAL mixed2 * RESTRICT scaleFactor, int numPairs,
-                                   __global mixed4* restrict velm, __global const int2 *restrict pairs){
+                                 GLOBAL mixed4* RESTRICT velm, GLOBAL const int2 *RESTRICT pairs){
-    int index = get_global_id(0);
+    int index = GLOBAL_ID;
+    mixed comScale = scaleFactor[0].x;
+    mixed relScale = scaleFactor[0].y;
    while (index < numPairs){
        int atom1 = pairs[index].x;
        int atom2 = pairs[index].y;
        mixed m1 = velm[atom1].w == 0 ? 0 : 1 / velm[atom1].w;
        mixed m2 = velm[atom2].w == 0 ? 0 : 1 / velm[atom2].w;
        mixed4 cv;
-        cv.xyz = (m1*velm[atom1].xyz + m2*velm[atom2].xyz) / (m1 + m2);
+        cv.x = (m1*velm[atom1].x + m2*velm[atom2].x) / (m1 + m2);
+        cv.y = (m1*velm[atom1].y + m2*velm[atom2].y) / (m1 + m2);
+        cv.z = (m1*velm[atom1].z + m2*velm[atom2].z) / (m1 + m2);
        mixed4 rv;
-        rv.xyz = velm[atom2].xyz - velm[atom1].xyz;
+        rv.x = velm[atom2].x - velm[atom1].x;
-        velm[atom1].x = scaleFactor[0].x * cv.x - scaleFactor[0].y * rv.x * m2 / (m1 + m2);
+        rv.y = velm[atom2].y - velm[atom1].y;
-        velm[atom1].y = scaleFactor[0].x * cv.y - scaleFactor[0].y * rv.y * m2 / (m1 + m2);
+        rv.z = velm[atom2].z - velm[atom1].z;
-        velm[atom1].z = scaleFactor[0].x * cv.z - scaleFactor[0].y * rv.z * m2 / (m1 + m2);
+        velm[atom1].x = comScale * cv.x - relScale * rv.x * m2 / (m1 + m2);
-        velm[atom2].x = scaleFactor[0].x * cv.x + scaleFactor[0].y * rv.x * m1 / (m1 + m2);
+        velm[atom1].y = comScale * cv.y - relScale * rv.y * m2 / (m1 + m2);
-        velm[atom2].y = scaleFactor[0].x * cv.y + scaleFactor[0].y * rv.y * m1 / (m1 + m2);
+        velm[atom1].z = comScale * cv.z - relScale * rv.z * m2 / (m1 + m2);
-        velm[atom2].z = scaleFactor[0].x * cv.z + scaleFactor[0].y * rv.z * m1 / (m1 + m2);
+        velm[atom2].x = comScale * cv.x + relScale * rv.x * m1 / (m1 + m2);
-        index += get_global_size(0);
+        velm[atom2].y = comScale * cv.y + relScale * rv.y * m1 / (m1 + m2);
+        velm[atom2].z = comScale * cv.z + relScale * rv.z * m1 / (m1 + m2);
+        index += GLOBAL_SIZE;
    }
 }
 /**
- * Sum the energy buffer containing a pair of energies stored as mixed2.  This is copied from utilities.cu with small modifications
+ * Sum the energy buffer containing a pair of energies stored as mixed2.  This is taken from the analogous customIntegrator code
 */
-__kernel void reduceEnergyPair(__global const mixed2* restrict energyBuffer, __global mixed2* restrict result, int bufferSize, int workGroupSize, __local mixed2* restrict tempBuffer) {
+KERNEL void reduceEnergyPair(GLOBAL const mixed2* RESTRICT sumBuffer, GLOBAL mixed2* result, int bufferSize) {
-    const unsigned int thread = get_local_id(0);
+    LOCAL mixed2 tempBuffer[WORK_GROUP_SIZE];
-    mixed2 sum = (mixed2) (0,0);
+    const unsigned int thread = LOCAL_ID;
-    for (unsigned int index = thread; index < bufferSize; index += get_local_size(0)) {
+    mixed2 sum = make_mixed2(0,0);
-        sum.xy += energyBuffer[index].xy;
+    for (unsigned int index = thread; index < bufferSize; index += LOCAL_SIZE) {
+        sum.x += sumBuffer[index].x;
+        sum.y += sumBuffer[index].y;
    }
-    tempBuffer[thread].xy = sum.xy;
+    tempBuffer[thread].x = sum.x;
-    for (int i = 1; i < workGroupSize; i *= 2) {
+    tempBuffer[thread].y = sum.y;
-        barrier(CLK_LOCAL_MEM_FENCE);
+    for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
-        if (thread%(i*2) == 0 && thread+i < workGroupSize) {
+        SYNC_THREADS;
-            tempBuffer[thread].xy += tempBuffer[thread+i].xy;
+        if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE) {
+            tempBuffer[thread].x += tempBuffer[thread+i].x;
+            tempBuffer[thread].y += tempBuffer[thread+i].y;
        }
    }
-    if (thread == 0) {
+    if (thread == 0)
        *result = tempBuffer[0];
-    }
 }
--- a/platforms/cuda/src/kernels/velocityVerlet.cu
+++ b/platforms/cuda/src/kernels/velocityVerlet.cu
 /**
 * Perform the first step of Velocity Verlet integration.
- * 
- * update displacements (posDelta) and velocities (velm)
 */
-extern "C" __global__ void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedNumAtoms, const mixed2* __restrict__ dt, const real4* __restrict__ posq,
+KERNEL void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedNumAtoms, GLOBAL const mixed2* RESTRICT dt, GLOBAL const real4* RESTRICT posq,
-        const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
+                                         GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL mixed4* RESTRICT posDelta,
-        const int* __restrict__ atomList, const int2* __restrict__ pairList) {
+                                         GLOBAL const int* RESTRICT atomList, GLOBAL const int2* RESTRICT pairList
+#ifdef USE_MIXED_PRECISION
+                                        ,GLOBAL const real4* RESTRICT posqCorrection
+#endif
+    ){
    const mixed2 stepSize = dt[0];
    const mixed dtPos = stepSize.y;
    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
-    const mixed scale = 0.5f*dtVel/(mixed) 0x100000000;
+    const mixed scale = 0.5f * dtVel/(mixed) 0x100000000;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+    int index = GLOBAL_ID;
+    while (index < numAtoms) {
        int atom = atomList[index];
        mixed4 velocity = velm[atom];
        if (velocity.w != 0.0) {
@@ -31,8 +34,10 @@ extern "C" __global__ void integrateVelocityVerletPart1(int numAtoms, int numPai
            posDelta[atom] = pos;
            velm[atom] = velocity;
        }
+        index += GLOBAL_SIZE;
    }
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
+    index = GLOBAL_ID;
+    while (index < numPairs){
        int atom1 = pairList[index].x;
        int atom2 = pairList[index].y;
        mixed4 v1 = velm[atom1];
@@ -51,21 +56,27 @@ extern "C" __global__ void integrateVelocityVerletPart1(int numAtoms, int numPai
        relVel.x= v2.x - v1.x;
        relVel.y= v2.y - v1.y;
        relVel.z= v2.z - v1.z;
-        //
        mixed3 comFrc;
-        comFrc.x = force[atom1] + force[atom2];
+        mixed F1x = scale*force[atom1];
-        comFrc.y = force[atom1 + paddedNumAtoms] + force[atom2 + paddedNumAtoms];
+        mixed F1y = scale*force[atom1+paddedNumAtoms];
-        comFrc.z = force[atom1 + paddedNumAtoms*2] + force[atom2 + paddedNumAtoms*2];
+        mixed F1z = scale*force[atom1+paddedNumAtoms*2];
+        mixed F2x = scale*force[atom2];
+        mixed F2y = scale*force[atom2+paddedNumAtoms];
+        mixed F2z = scale*force[atom2+paddedNumAtoms*2];
+        comFrc.x = F1x + F2x;
+        comFrc.y = F1y + F2y;
+        comFrc.z = F1z + F2z;
        mixed3 relFrc;
-        relFrc.x = mass1fract*force[atom2] - mass2fract*force[atom1];
+        relFrc.x = mass1fract*F2x - mass2fract*F1x;
-        relFrc.y = mass1fract*force[atom2+paddedNumAtoms] - mass2fract*force[atom1+paddedNumAtoms];
+        relFrc.y = mass1fract*F2y - mass2fract*F1y;
-        relFrc.z = mass1fract*force[atom2+paddedNumAtoms*2] - mass2fract*force[atom1+paddedNumAtoms*2];
+        relFrc.z = mass1fract*F2z - mass2fract*F1z;
-        comVel.x += comFrc.x * scale * invTotMass;
+        comVel.x += comFrc.x * invTotMass;
-        comVel.y += comFrc.y * scale * invTotMass;
+        comVel.y += comFrc.y * invTotMass;
-        comVel.z += comFrc.z * scale * invTotMass;
+        comVel.z += comFrc.z * invTotMass;
-        relVel.x += relFrc.x * scale * invRedMass;
+        relVel.x += relFrc.x * invRedMass;
-        relVel.y += relFrc.y * scale * invRedMass;
+        relVel.y += relFrc.y * invRedMass;
-        relVel.z += relFrc.z * scale * invRedMass;
+        relVel.z += relFrc.z * invRedMass;
 #ifdef USE_MIXED_PRECISION
        real4 posv1 = posq[atom1];
        real4 posv2 = posq[atom2];
@@ -97,22 +108,25 @@ extern "C" __global__ void integrateVelocityVerletPart1(int numAtoms, int numPai
            posDelta[atom2] = pos2;
            velm[atom2] = v2;
        }
-    }
+        index += GLOBAL_SIZE;
+     }
 }
 /**
 * Perform the second step of Velocity Verlet integration.
- * 
- * apply displacements to positions (posq) after constraints have been enforced
 */
-extern "C" __global__ void integrateVelocityVerletPart2(int numAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
+KERNEL void integrateVelocityVerletPart2(int numAtoms, GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq, GLOBAL mixed4* RESTRICT velm,
-        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
+                                         GLOBAL const mixed4* RESTRICT posDelta
+#ifdef USE_MIXED_PRECISION
+                                        ,GLOBAL real4* RESTRICT posqCorrection
+#endif
+    ){
    mixed2 stepSize = dt[0];
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    int index = GLOBAL_ID;
    if (index == 0)
        dt[0].x = stepSize.y;
-    for (; index < numAtoms; index += blockDim.x*gridDim.x) {
+    while(index < numAtoms) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
 #ifdef USE_MIXED_PRECISION
@@ -133,21 +147,23 @@ extern "C" __global__ void integrateVelocityVerletPart2(int numAtoms, mixed2* __
            posq[index] = pos;
 #endif
        }
+        index += GLOBAL_SIZE;
    }
 }
 /**
 * Perform the third step of Velocity Verlet integration.
- *
- * modify the velocities (velm) after the force update
 */
-extern "C" __global__ void integrateVelocityVerletPart3(int numAtoms, int numPairs, int paddedNumAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
+KERNEL void integrateVelocityVerletPart3(int numAtoms, int numPairs, int paddedNumAtoms, GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq,
-        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm,  const long long* __restrict__ force, const mixed4* __restrict__ posDelta,
+                                         GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL const mixed4* RESTRICT posDelta,
-        const int* __restrict__ atomList, const int2* __restrict__ pairList) {
+                                         GLOBAL const int* RESTRICT atomList, GLOBAL const int2* RESTRICT pairList
+#ifdef USE_MIXED_PRECISION
+                                        ,GLOBAL const real4* RESTRICT posqCorrection
+#endif
+    ){
    mixed2 stepSize = dt[0];
-#if __CUDA_ARCH__ >= 130
+#ifdef SUPPORTS_DOUBLE_PRECISION
    double oneOverDt = 1.0/stepSize.y;
 #else
    float oneOverDt = 1.0f/stepSize.y;
@@ -155,11 +171,10 @@ extern "C" __global__ void integrateVelocityVerletPart3(int numAtoms, int numPai
 #endif
    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
    const mixed scale = 0.5f*dtVel/(mixed) 0x100000000;
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    int index = GLOBAL_ID;
    if (index == 0)
        dt[0].x = stepSize.y;
+    while(index < numAtoms) {
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
        int atom = atomList[index];
        mixed4 velocity = velm[atom];
        if (velocity.w != 0.0) {
@@ -167,15 +182,17 @@ extern "C" __global__ void integrateVelocityVerletPart3(int numAtoms, int numPai
            velocity.x += scale*force[atom]*velocity.w + (deltaXconstrained.x - velocity.x*stepSize.y)*oneOverDt;
            velocity.y += scale*force[atom+paddedNumAtoms]*velocity.w + (deltaXconstrained.y - velocity.y*stepSize.y)*oneOverDt;
            velocity.z += scale*force[atom+paddedNumAtoms*2]*velocity.w + (deltaXconstrained.z - velocity.z*stepSize.y)*oneOverDt;
-#if __CUDA_ARCH__ < 130
+#ifndef SUPPORTS_DOUBLE_PRECISION
            velocity.x += (deltaXconstrained.x - velocity.x*stepSize.y)*correction;
            velocity.y += (deltaXconstrained.y - velocity.y*stepSize.y)*correction;
            velocity.z += (deltaXconstrained.z - velocity.z*stepSize.y)*correction;
 #endif
            velm[atom] = velocity;
        }
+        index += GLOBAL_SIZE;
    }
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
+    index = GLOBAL_ID;
+    while(index < numPairs) {
        int atom1 = pairList[index].x;
        int atom2 = pairList[index].y;
        mixed4 v1 = velm[atom1];
@@ -194,27 +211,33 @@ extern "C" __global__ void integrateVelocityVerletPart3(int numAtoms, int numPai
        relVel.x= v2.x - v1.x;
        relVel.y= v2.y - v1.y;
        relVel.z= v2.z - v1.z;
-        //
        mixed3 comFrc;
-        comFrc.x = force[atom1] + force[atom2];
+        mixed F1x = scale*force[atom1];
-        comFrc.y = force[atom1 + paddedNumAtoms] + force[atom2 + paddedNumAtoms];
+        mixed F1y = scale*force[atom1+paddedNumAtoms];
-        comFrc.z = force[atom1 + paddedNumAtoms*2] + force[atom2 + paddedNumAtoms*2];
+        mixed F1z = scale*force[atom1+paddedNumAtoms*2];
+        mixed F2x = scale*force[atom2];
+        mixed F2y = scale*force[atom2+paddedNumAtoms];
+        mixed F2z = scale*force[atom2+paddedNumAtoms*2];
+        comFrc.x = F1x + F2x;
+        comFrc.y = F1y + F2y;
+        comFrc.z = F1z + F2z;
        mixed3 relFrc;
-        relFrc.x = mass1fract*force[atom2] - mass2fract*force[atom1];
+        relFrc.x = mass1fract*F2x - mass2fract*F1x;
-        relFrc.y = mass1fract*force[atom2+paddedNumAtoms] - mass2fract*force[atom1+paddedNumAtoms];
+        relFrc.y = mass1fract*F2y - mass2fract*F1y;
-        relFrc.z = mass1fract*force[atom2+paddedNumAtoms*2] - mass2fract*force[atom1+paddedNumAtoms*2];
+        relFrc.z = mass1fract*F2z - mass2fract*F1z;
-        comVel.x += comFrc.x * scale * invTotMass;
+        comVel.x += comFrc.x * invTotMass;
-        comVel.y += comFrc.y * scale * invTotMass;
+        comVel.y += comFrc.y * invTotMass;
-        comVel.z += comFrc.z * scale * invTotMass;
+        comVel.z += comFrc.z * invTotMass;
-        relVel.x += relFrc.x * scale * invRedMass;
+        relVel.x += relFrc.x * invRedMass;
-        relVel.y += relFrc.y * scale * invRedMass;
+        relVel.y += relFrc.y * invRedMass;
-        relVel.z += relFrc.z * scale * invRedMass;
+        relVel.z += relFrc.z * invRedMass;
        if (v1.w != 0.0f) {
            mixed4 deltaXconstrained = posDelta[atom1];
            v1.x = comVel.x - relVel.x*mass2fract + (deltaXconstrained.x - v1.x*stepSize.y)*oneOverDt;
            v1.y = comVel.y - relVel.y*mass2fract + (deltaXconstrained.y - v1.y*stepSize.y)*oneOverDt;
            v1.z = comVel.z - relVel.z*mass2fract + (deltaXconstrained.z - v1.z*stepSize.y)*oneOverDt;
-#if __CUDA_ARCH__ < 130
+#ifndef SUPPORTS_DOUBLE_PRECISION
            v1.x += (deltaXconstrained.x - v1.x*stepSize.y)*correction;
            v1.y += (deltaXconstrained.y - v1.y*stepSize.y)*correction;
            v1.z += (deltaXconstrained.z - v1.z*stepSize.y)*correction;
@@ -226,58 +249,57 @@ extern "C" __global__ void integrateVelocityVerletPart3(int numAtoms, int numPai
            v2.x = comVel.x + relVel.x*mass1fract + (deltaXconstrained.x - v2.x*stepSize.y)*oneOverDt;
            v2.y = comVel.y + relVel.y*mass1fract + (deltaXconstrained.y - v2.y*stepSize.y)*oneOverDt;
            v2.z = comVel.z + relVel.z*mass1fract + (deltaXconstrained.z - v2.z*stepSize.y)*oneOverDt;
-#if __CUDA_ARCH__ < 130
+#ifndef SUPPORTS_DOUBLE_PRECISION
            v2.x += (deltaXconstrained.x - v2.x*stepSize.y)*correction;
            v2.y += (deltaXconstrained.y - v2.y*stepSize.y)*correction;
            v2.z += (deltaXconstrained.z - v2.z*stepSize.y)*correction;
 #endif
            velm[atom2] = v2;
        }
+        index += GLOBAL_SIZE;
    }
 }
+KERNEL void integrateVelocityVerletHardWall(int numPairs, GLOBAL const float* RESTRICT maxPairDistance, 
+                                            GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq,
+                                            GLOBAL mixed4* RESTRICT velm, GLOBAL const int2* RESTRICT pairList,
+                                            GLOBAL const float* RESTRICT pairTemperature
+#ifdef USE_MIXED_PRECISION
+                                           ,GLOBAL real4* RESTRICT posqCorrection
+#endif
+    ){
-/**
- * Apply the hard wall constraint 
- */
-extern "C" __global__ void integrateVelocityVerletHardWall(int numPairs, const float* __restrict__ maxPairDistance, mixed2* __restrict__ dt, real4* __restrict__ posq,
-        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, 
-        const int2* __restrict__ pairList, const float* __restrict__ pairTemperature) {
    mixed dtPos = dt[0].y;
    mixed maxDelta = (mixed) maxPairDistance[0];
-    // Apply hard wall constraints.
+    if (maxDelta > 0){
-    if (maxDelta > 0) {
+        int index = GLOBAL_ID;
-        for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
+        while(index < numPairs) {
            const mixed hardWallScale = sqrt( ((mixed) pairTemperature[index]) * ((mixed) BOLTZ));
-            int2 atom = make_int2(pairList[index].x, pairList[index].y);
+            int atom1 = pairList[index].x;
+            int atom2 = pairList[index].y;
 #ifdef USE_MIXED_PRECISION
-            real4 posv1 = posq[atom.x];
+            real4 posv1 = posq[atom1];
-            real4 posc1 = posqCorrection[atom.x];
+            real4 posc1 = posqCorrection[atom1];
            mixed4 pos1 = make_mixed4(posv1.x+(mixed)posc1.x, posv1.y+(mixed)posc1.y, posv1.z+(mixed)posc1.z, posv1.w);
-            real4 posv2 = posq[atom.y];
+            real4 posv2 = posq[atom2];
-            real4 posc2 = posqCorrection[atom.y];
+            real4 posc2 = posqCorrection[atom2];
            mixed4 pos2 = make_mixed4(posv2.x+(mixed)posc2.x, posv2.y+(mixed)posc2.y, posv2.z+(mixed)posc2.z, posv2.w);
 #else
-            real4 pos1 = posq[atom.x];
+            real4 pos1 = posq[atom1];
-            real4 pos2 = posq[atom.y];
+            real4 pos2 = posq[atom2];
 #endif
-            mixed3 delta = make_mixed3(
+            mixed3 delta = make_mixed3(pos1.x - pos2.x, pos1.y - pos2.y, pos1.z - pos2.z);
-                mixed (pos1.x - pos2.x),
-                mixed (pos1.y - pos2.y),
-                mixed (pos1.z - pos2.z)
-            );
            mixed r = sqrt(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
            mixed rInv = 1/r;
            if (rInv*maxDelta < 1.0) {
                // The constraint has been violated, so make the inter-particle distance "bounce"
                // off the hard wall.
                mixed3 bondDir = make_mixed3(delta.x * rInv, delta.y * rInv, delta.z * rInv);
-                mixed3 vel1 = make_mixed3(velm[atom.x].x, velm[atom.x].y, velm[atom.x].z);
+                mixed3 vel1 = make_mixed3(velm[atom1].x, velm[atom1].y, velm[atom1].z);
-                mixed3 vel2 = make_mixed3(velm[atom.y].x, velm[atom.y].y, velm[atom.y].z);
+                mixed3 vel2 = make_mixed3(velm[atom2].x, velm[atom2].y, velm[atom2].z);
-                mixed m1 = velm[atom.x].w != 0.0 ? 1.0/velm[atom.x].w : 0.0;
+                mixed m1 = velm[atom1].w != 0.0 ? 1.0/velm[atom1].w : 0.0;
-                mixed m2 = velm[atom.y].w != 0.0 ? 1.0/velm[atom.y].w : 0.0;
+                mixed m2 = velm[atom2].w != 0.0 ? 1.0/velm[atom2].w : 0.0;
                mixed invTotMass = (m1 + m2 != 0.0) ? 1.0 /(m1 + m2) : 0.0;
                mixed deltaR = r-maxDelta;
                mixed deltaT = dtPos;
@@ -298,12 +320,12 @@ extern "C" __global__ void integrateVelocityVerletHardWall(int numPairs, const f
                    pos1.x += bondDir.x*dr;
                    pos1.y += bondDir.y*dr;
                    pos1.z += bondDir.z*dr;
-                    velm[atom.x] = make_mixed4(vp1.x + bondDir.x*dotvr1, vp1.y + bondDir.y*dotvr1, vp1.z + bondDir.z*dotvr1, velm[atom.x].w);
+                    velm[atom1] = make_mixed4(vp1.x + bondDir.x*dotvr1, vp1.y + bondDir.y*dotvr1, vp1.z + bondDir.z*dotvr1, velm[atom1].w);
 #ifdef USE_MIXED_PRECISION
-                    posq[atom.x] = make_real4((real) pos1.x, (real) pos1.y, (real) pos1.z, (real) pos1.w);
+                    posq[atom1] = make_real4((real) pos1.x, (real) pos1.y, (real) pos1.z, (real) pos1.w);
-                    posqCorrection[atom.x] = make_real4(pos1.x-(real) pos1.x, pos1.y-(real) pos1.y, pos1.z-(real) pos1.z, 0);
+                    posqCorrection[atom1] = make_real4(pos1.x-(real) pos1.x, pos1.y-(real) pos1.y, pos1.z-(real) pos1.z, 0);
 #else
-                    posq[atom.x] = pos1;
+                    posq[atom1] = pos1;
 #endif
                }
                else {
@@ -331,19 +353,21 @@ extern "C" __global__ void integrateVelocityVerletHardWall(int numPairs, const f
                    pos2.x += bondDir.x*dr2;
                    pos2.y += bondDir.y*dr2;
                    pos2.z += bondDir.z*dr2;
-                    velm[atom.x] = make_mixed4(vp1.x + bondDir.x*dotvr1, vp1.y + bondDir.y*dotvr1, vp1.z + bondDir.z*dotvr1, velm[atom.x].w);
+                    velm[atom1] = make_mixed4(vp1.x + bondDir.x*dotvr1, vp1.y + bondDir.y*dotvr1, vp1.z + bondDir.z*dotvr1, velm[atom1].w);
-                    velm[atom.y] = make_mixed4(vp2.x + bondDir.x*dotvr2, vp2.y + bondDir.y*dotvr2, vp2.z + bondDir.z*dotvr2, velm[atom.y].w);
+                    velm[atom2] = make_mixed4(vp2.x + bondDir.x*dotvr2, vp2.y + bondDir.y*dotvr2, vp2.z + bondDir.z*dotvr2, velm[atom2].w);
 #ifdef USE_MIXED_PRECISION
-                    posq[atom.x] = make_real4((real) pos1.x, (real) pos1.y, (real) pos1.z, (real) pos1.w);
+                    posq[atom1] = make_real4((real) pos1.x, (real) pos1.y, (real) pos1.z, (real) pos1.w);
-                    posq[atom.y] = make_real4((real) pos2.x, (real) pos2.y, (real) pos2.z, (real) pos2.w);
+                    posq[atom2] = make_real4((real) pos2.x, (real) pos2.y, (real) pos2.z, (real) pos2.w);
-                    posqCorrection[atom.x] = make_real4(pos1.x-(real) pos1.x, pos1.y-(real) pos1.y, pos1.z-(real) pos1.z, 0);
+                    posqCorrection[atom1] = make_real4(pos1.x-(real) pos1.x, pos1.y-(real) pos1.y, pos1.z-(real) pos1.z, 0);
-                    posqCorrection[atom.y] = make_real4(pos2.x-(real) pos2.x, pos2.y-(real) pos2.y, pos2.z-(real) pos2.z, 0);
+                    posqCorrection[atom2] = make_real4(pos2.x-(real) pos2.x, pos2.y-(real) pos2.y, pos2.z-(real) pos2.z, 0);
 #else
-                    posq[atom.x] = pos1;
+                    posq[atom1] = pos1;
-                    posq[atom.y] = pos2;
+                    posq[atom2] = pos2;
 #endif
                }
            }
+            index += GLOBAL_SIZE;
        }
-    } /* end of hard wall constraint part */
+    }
 }
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -447,109 +447,6 @@ private:
    CUfunction copyStateKernel, copyForcesKernel, addForcesKernel;
 };
-/*
- * This kernel is invoked by NoseHooverIntegrator to take one time step.
- */
-class CudaIntegrateVelocityVerletStepKernel : public IntegrateVelocityVerletStepKernel {
-public:
-    CudaIntegrateVelocityVerletStepKernel(std::string name, const Platform& platform, CudaContext& cu) :
-                                  IntegrateVelocityVerletStepKernel(name, platform), cu(cu) { }
-    ~CudaIntegrateVelocityVerletStepKernel() {}
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param integrator the NoseHooverIntegrator this kernel will be used for
-     */
-    void initialize(const System& system, const NoseHooverIntegrator& integrator);
-    /**
-     * Execute the kernel.
-     * 
-     * @param context    the context in which to execute this kernel
-     * @param integrator the VerletIntegrator this kernel is being used for
-     * @param forcesAreValid a reference to the parent integrator's boolean for keeping
-     *                       track of the validity of the current forces.
-     */
-    void execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid);
-    /**
-     * Compute the kinetic energy.
-     * 
-     * @param context    the context in which to execute this kernel
-     * @param integrator the NoseHooverIntegrator this kernel is being used for
-     */
-    double computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator);
-private:
-    CudaContext& cu;
-    float prevMaxPairDistance;
-    CudaArray maxPairDistanceBuffer, pairListBuffer, atomListBuffer, pairTemperatureBuffer;
-    CUfunction kernel1, kernel2, kernel3, kernelHardWall;
-};
-/**
- * This kernel is invoked by NoseHooverChain at the start of each time step to adjust the thermostat
- * and update the associated particle velocities.
- */
-class CudaNoseHooverChainKernel : public NoseHooverChainKernel {
-public:
-    CudaNoseHooverChainKernel(std::string name, const Platform& platform, CudaContext& cu) : NoseHooverChainKernel(name, platform), cu(cu) {
-    }
-    ~CudaNoseHooverChainKernel() {}
-    /**
-     * Initialize the kernel.
-     */
-    void initialize();
-    /**
-     * Execute the kernel that propagates the Nose Hoover chain and determines the velocity scale factor.
-     * 
-     * @param context  the context in which to execute this kernel
-     * @param noseHooverChain the object describing the chain to be propagated.
-     * @param kineticEnergies the {absolute, relative} kineticEnergy of the particles being thermostated by this chain.
-     * @param timeStep the time step used by the integrator.
-     * @return the {absolute, relative} velocity scale factor to apply to the particles associated with this heat bath.
-     */
-    std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep);
-    /**
-     * Execute the kernal that computes the total (kinetic + potential) heat bath energy.
-     *
-     * @param context the context in which to execute this kernel
-     * @param noseHooverChain the chain whose energy is to be determined.
-     * @return the total heat bath energy.
-     */
-    double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc);
-    /**
-     * Execute the kernel that computes the kinetic energy for a subset of atoms,
-     * or the relative kinetic energy of Drude particles with respect to their parent atoms
-     *
-     * @param context the context in which to execute this kernel
-     * @param noseHooverChain the chain whose energy is to be determined.
-     * @param downloadValue whether the computed value should be downloaded and returned.
-     *
-     */
-    std::pair<double,double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);
-    /**
-     * Execute the kernel that scales the velocities of particles associated with a nose hoover chain
-     *
-     * @param context the context in which to execute this kernel
-     * @param noseHooverChain the chain whose energy is to be determined.
-     * @param scaleFactors the {absolute, relative} multiplicative factor by which velocities are scaled.
-     */
-    void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactors);
-private:
-    int sumWorkGroupSize;
-    CudaContext& cu;
-    CudaArray energyBuffer, scaleFactorBuffer, kineticEnergyBuffer, chainMasses, chainForces, heatBathEnergy;
-    std::map<int, CudaArray> atomlists, pairlists;
-    std::map<int, CUfunction> propagateKernels;
-    CUfunction reduceEnergyKernel;
-    CUfunction computeHeatBathEnergyKernel;
-    CUfunction computeAtomsKineticEnergyKernel;
-    CUfunction computePairsKineticEnergyKernel;
-    CUfunction scaleAtomsVelocitiesKernel;
-    CUfunction scalePairsVelocitiesKernel;
-};
 /**
 * This kernel is invoked by MonteCarloBarostat to adjust the periodic box volume
 */

--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -134,9 +134,9 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
    if (name == ApplyAndersenThermostatKernel::Name())
        return new CommonApplyAndersenThermostatKernel(name, platform, cu);
    if (name == NoseHooverChainKernel::Name())
-        return new CudaNoseHooverChainKernel(name, platform, cu);
+        return new CommonNoseHooverChainKernel(name, platform, cu);
    if (name == IntegrateVelocityVerletStepKernel::Name())
-        return new CudaIntegrateVelocityVerletStepKernel(name, platform, cu);
+        return new CommonIntegrateVelocityVerletStepKernel(name, platform, cu);
    if (name == ApplyMonteCarloBarostatKernel::Name())
        return new CudaApplyMonteCarloBarostatKernel(name, platform, cu);
    if (name == RemoveCMMotionKernel::Name())

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/kernels/noseHooverChain.cu
+++ b/platforms/cuda/src/kernels/noseHooverChain.cu
-#include <initializer_list>
-extern "C" __global__ void propagateNoseHooverChain(mixed2* __restrict__ chainData, const mixed2 * __restrict__ energySum, mixed2* __restrict__ scaleFactor,
-                                                    mixed* __restrict__ chainMasses, mixed* __restrict__ chainForces, 
-                                                    int chainType, int chainLength, int numMTS, int numDOFs, float timeStep,
-                                                    mixed kT, float frequency){
-    const mixed & kineticEnergy = chainType ? energySum[0].y : energySum[0].x;
-    mixed &scale = chainType ? scaleFactor[0].y : scaleFactor[0].x;
-    scale = (mixed) 1;
-    if(kineticEnergy < 1e-8) return;
-    for (int bead = 0; bead < chainLength; ++bead) chainMasses[bead] = kT / (frequency * frequency);
-    chainMasses[0] *= numDOFs;
-    mixed KE2 = 2.0f * kineticEnergy;
-    mixed timeOverMTS = timeStep / numMTS;
-    chainForces[0] = (KE2 - numDOFs * kT) / chainMasses[0];
-    for (int bead = 0; bead < chainLength - 1; ++bead) {
-        chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
-    }
-    for (int mts = 0; mts < numMTS; ++mts) {
-        BEGIN_YS_LOOP
-            mixed wdt = ys * timeOverMTS;
-            chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
-            for (int bead = chainLength - 2; bead >= 0; --bead) {
-                mixed aa = MIXEDEXP(-0.125f * wdt * chainData[bead + 1].y);
-                chainData[bead].y = aa * (chainData[bead].y * aa + 0.25f * wdt * chainForces[bead]);
-            }
-            // update particle velocities
-            mixed aa = MIXEDEXP(-0.5f * wdt * chainData[0].y);
-            scale *= aa;
-            // update the thermostat positions
-            for (int bead = 0; bead < chainLength; ++bead) {
-                chainData[bead].x += 0.5f * chainData[bead].y * wdt;
-            }
-            // update the forces
-            chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0];
-            // update thermostat velocities
-            for (int bead = 0; bead < chainLength - 1; ++bead) {
-                mixed aa = MIXEDEXP(-0.125f * wdt * chainData[bead + 1].y);
-                chainData[bead].y = aa * (aa * chainData[bead].y + 0.25f * wdt * chainForces[bead]);
-                chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
-            }
-            chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
-        END_YS_LOOP
-    } // MTS loop
-}
-/**
- * Compute total (potential + kinetic) energy of the Nose-Hoover beads
- */
-extern "C" __global__ void computeHeatBathEnergy(mixed* __restrict__ heatBathEnergy, int chainLength, int numDOFs,
-                                                 mixed kT, float frequency, const mixed2* __restrict__ chainData){
-    // Note that this is always incremented; make sure it's zeroed properly before the first call
-    mixed &energy = heatBathEnergy[0];
-    for(int i = 0; i < chainLength; ++i) {
-        mixed prefac = i ? 1 : numDOFs;
-        mixed mass = prefac * kT / (frequency * frequency);
-        mixed velocity = chainData[i].y; 
-        // The kinetic energy of this bead
-        energy += 0.5f * mass * velocity * velocity;
-        // The potential energy of this bead
-        mixed position = chainData[i].x;
-        energy += prefac * kT * position;
-    }
-}
-extern "C" __global__ void computeAtomsKineticEnergy(mixed2 * __restrict__ energyBuffer, int numAtoms,
-                                                     const mixed4* __restrict__ velm, const int *__restrict__ atoms){
-    mixed2 energy = make_mixed2(0,0);
-    //energy = 1; return;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
-        int atom = atoms[index];
-        mixed4 v = velm[atom];
-        mixed mass = v.w == 0 ? 0 : 1 / v.w;
-        energy.x += 0.5f * mass * (v.x*v.x + v.y*v.y + v.z*v.z);
-    }
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = energy;
-}
-extern "C" __global__ void computePairsKineticEnergy(mixed2 * __restrict__ energyBuffer, int numPairs,
-                                                     const mixed4* __restrict__ velm, const int2 *__restrict__ pairs){
-    mixed2 energy = make_mixed2(0,0);
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
-        int2 pair = pairs[index];
-        int atom1 = pair.x;
-        int atom2 = pair.y;
-        mixed4 v1 = velm[atom1];
-        mixed4 v2 = velm[atom2];
-        mixed m1 = v1.w == 0 ? 0 : 1 / v1.w;
-        mixed m2 = v2.w == 0 ? 0 : 1 / v2.w;
-        mixed4 cv;
-        cv.x = (m1*v1.x + m2*v2.x) / (m1 + m2);
-        cv.y = (m1*v1.y + m2*v2.y) / (m1 + m2);
-        cv.z = (m1*v1.z + m2*v2.z) / (m1 + m2);
-        mixed4 rv;
-        rv.x = v2.x - v1.x;
-        rv.y = v2.y - v1.y;
-        rv.z = v2.z - v1.z;
-        energy.x += 0.5f * (m1 + m2) * (cv.x*cv.x + cv.y*cv.y + cv.z*cv.z);
-        energy.y += 0.5f * (m1 * m2 / (m1 + m2)) * (rv.x*rv.x + rv.y*rv.y + rv.z*rv.z);
-    }
-    // The atoms version of this has been called already, so accumulate instead of assigning here
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x].x += energy.x;
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x].y += energy.y;
-}
-extern "C" __global__ void scaleAtomsVelocities(mixed2* __restrict__ scaleFactor, int numAtoms,
-                                                mixed4* __restrict__ velm, const int *__restrict__ atoms){
-    const mixed &scale = scaleFactor[0].x;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
-        int atom = atoms[index];
-        mixed4 &v = velm[atom];
-        v.x *= scale;
-        v.y *= scale;
-        v.z *= scale;
-    }
-}
-extern "C" __global__ void scalePairsVelocities(mixed2 * __restrict__ scaleFactor, int numPairs,
-                                                mixed4* __restrict__ velm, const int2 *__restrict__ pairs){
-    const mixed &absScale = scaleFactor[0].x;
-    const mixed &relScale = scaleFactor[0].y;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
-        int atom1 = pairs[index].x;
-        int atom2 = pairs[index].y;
-        mixed4 v1 = velm[atom1];
-        mixed4 v2 = velm[atom2];
-        mixed m1 = v1.w == 0 ? 0 : 1 / v1.w;
-        mixed m2 = v2.w == 0 ? 0 : 1 / v2.w;
-        mixed4 cv;
-        cv.x = (m1*v1.x + m2*v2.x) / (m1 + m2);
-        cv.y = (m1*v1.y + m2*v2.y) / (m1 + m2);
-        cv.z = (m1*v1.z + m2*v2.z) / (m1 + m2);
-        mixed4 rv;
-        rv.x = v2.x - v1.x;
-        rv.y = v2.y - v1.y;
-        rv.z = v2.z - v1.z;
-        v1.x = absScale * cv.x - relScale * rv.x * m2 / (m1 + m2);
-        v1.y = absScale * cv.y - relScale * rv.y * m2 / (m1 + m2);
-        v1.z = absScale * cv.z - relScale * rv.z * m2 / (m1 + m2);
-        v2.x = absScale * cv.x + relScale * rv.x * m1 / (m1 + m2);
-        v2.y = absScale * cv.y + relScale * rv.y * m1 / (m1 + m2);
-        v2.z = absScale * cv.z + relScale * rv.z * m1 / (m1 + m2);
-        velm[atom1] = v1;
-        velm[atom2] = v2;
-    }
-}
-/**
- * Sum the energy buffer containing a pair of energies stored as mixed2.  This is copied from utilities.cu with small modifications
- */
-extern "C" __global__ void reduceEnergyPair(const mixed2* __restrict__ energyBuffer, mixed2* __restrict__ result, int bufferSize, int workGroupSize) {
-    __shared__ mixed2 tempBuffer[WORK_GROUP_SIZE];
-    const unsigned int thread = threadIdx.x;
-    mixed2 sum = make_mixed2(0,0);
-    for (unsigned int idx = thread; idx < bufferSize; idx += blockDim.x) {
-        sum.x += energyBuffer[idx].x;
-        sum.y += energyBuffer[idx].y;
-    }
-    tempBuffer[thread] = sum;
-    for (int i = 1; i < workGroupSize; i *= 2) {
-        __syncthreads();
-        if (thread%(i*2) == 0 && thread+i < workGroupSize) {
-            tempBuffer[thread].x += tempBuffer[thread+i].x;
-            tempBuffer[thread].y += tempBuffer[thread+i].y;
-        }
-    }
-    if (thread == 0)
-        *result = tempBuffer[0];
-}
--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -428,109 +428,6 @@ private:
    cl::Kernel copyStateKernel, copyForcesKernel, addForcesKernel;
 };
-/*
- * This kernel is invoked by NoseHooverIntegrator to take one time step.
- */
-class OpenCLIntegrateVelocityVerletStepKernel : public IntegrateVelocityVerletStepKernel {
-public:
-    OpenCLIntegrateVelocityVerletStepKernel(std::string name, const Platform& platform, OpenCLContext& cl) :
-                                  IntegrateVelocityVerletStepKernel(name, platform), cl(cl) { }
-    ~OpenCLIntegrateVelocityVerletStepKernel() {}
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param integrator the NoseHooverIntegrator this kernel will be used for
-     */
-    void initialize(const System& system, const NoseHooverIntegrator& integrator);
-    /**
-     * Execute the kernel.
-     * 
-     * @param context    the context in which to execute this kernel
-     * @param integrator the VerletIntegrator this kernel is being used for
-     * @param forcesAreValid a reference to the parent integrator's boolean for keeping
-     *                       track of the validity of the current forces.
-     */
-    void execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid);
-    /**
-     * Compute the kinetic energy.
-     * 
-     * @param context    the context in which to execute this kernel
-     * @param integrator the NoseHooverIntegrator this kernel is being used for
-     */
-    double computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator);
-private:
-    OpenCLContext& cl;
-    float prevMaxPairDistance;
-    OpenCLArray maxPairDistanceBuffer, pairListBuffer, atomListBuffer, pairTemperatureBuffer; 
-    cl::Kernel kernel1, kernel2, kernel3, kernelHardWall;
-};
-/**
- * This kernel is invoked by NoseHooverChain at the start of each time step to adjust the thermostat
- * and update the associated particle velocities.
- */
-class OpenCLNoseHooverChainKernel : public NoseHooverChainKernel {
-public:
-    OpenCLNoseHooverChainKernel(std::string name, const Platform& platform, OpenCLContext& cl) : NoseHooverChainKernel(name, platform), cl(cl) {
-    }
-    ~OpenCLNoseHooverChainKernel() {}
-    /**
-     * Initialize the kernel.
-     */
-    void initialize();
-    /**
-     * Execute the kernel that propagates the Nose Hoover chain and determines the velocity scale factor.
-     * 
-     * @param context  the context in which to execute this kernel
-     * @param noseHooverChain the object describing the chain to be propagated.
-     * @param kineticEnergies the {absolute, relative} kineticEnergy of the particles being thermostated by this chain.
-     * @param timeStep the time step used by the integrator.
-     * @return the {absolute, relative} velocity scale factor to apply to the particles associated with this heat bath.
-     */
-    std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep);
-    /**
-     * Execute the kernal that computes the total (kinetic + potential) heat bath energy.
-     *
-     * @param context the context in which to execute this kernel
-     * @param noseHooverChain the chain whose energy is to be determined.
-     * @return the total heat bath energy.
-     */
-    double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc);
-    /**
-     * Execute the kernel that computes the kinetic energy for a subset of atoms,
-     * or the relative kinetic energy of Drude particles with respect to their parent atoms
-     *
-     * @param context the context in which to execute this kernel
-     * @param noseHooverChain the chain whose energy is to be determined.
-     * @param downloadValue whether the computed value should be downloaded and returned.
-     *
-     */
-    std::pair<double,double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);
-    /**
-     * Execute the kernel that scales the velocities of particles associated with a nose hoover chain
-     *
-     * @param context the context in which to execute this kernel
-     * @param noseHooverChain the chain whose energy is to be determined.
-     * @param scaleFactors the {absolute, relative} multiplicative factor by which velocities are scaled.
-     */
-    void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactors);
-private:
-    int sumWorkGroupSize;
-    OpenCLContext& cl;
-    OpenCLArray energyBuffer, scaleFactorBuffer, kineticEnergyBuffer, chainMasses, chainForces, heatBathEnergy;
-    std::map<int, OpenCLArray> atomlists, pairlists;
-    std::map<int, cl::Kernel> propagateKernels;
-    cl::Kernel reduceEnergyKernel;
-    cl::Kernel computeHeatBathEnergyKernel;
-    cl::Kernel computeAtomsKineticEnergyKernel;
-    cl::Kernel computePairsKineticEnergyKernel;
-    cl::Kernel scaleAtomsVelocitiesKernel;
-    cl::Kernel scalePairsVelocitiesKernel;
-};
 /**
 * This kernel is invoked by MonteCarloBarostat to adjust the periodic box volume
 */

--- a/platforms/opencl/src/OpenCLKernelFactory.cpp
+++ b/platforms/opencl/src/OpenCLKernelFactory.cpp
@@ -132,9 +132,9 @@ KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platfo
    if (name == ApplyAndersenThermostatKernel::Name())
        return new CommonApplyAndersenThermostatKernel(name, platform, cl);
    if (name == NoseHooverChainKernel::Name())
-        return new OpenCLNoseHooverChainKernel(name, platform, cl);
+        return new CommonNoseHooverChainKernel(name, platform, cl);
    if (name == IntegrateVelocityVerletStepKernel::Name())
-        return new OpenCLIntegrateVelocityVerletStepKernel(name, platform, cl);
+        return new CommonIntegrateVelocityVerletStepKernel(name, platform, cl);
    if (name == ApplyMonteCarloBarostatKernel::Name())
        return new OpenCLApplyMonteCarloBarostatKernel(name, platform, cl);
    if (name == RemoveCMMotionKernel::Name())

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
--- a/platforms/opencl/src/kernels/velocityVerlet.cl
+++ b/platforms/opencl/src/kernels/velocityVerlet.cl
-/**
- * Perform the first step of Velocity Verlet integration.
- */
-__kernel void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedNumAtoms, __global const mixed2* restrict dt, __global const real4* restrict posq,
-        __global const real4* restrict posqCorrection, __global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta,
-        __global const int* restrict atomList, __global const int2* restrict pairList) {
-    const mixed2 stepSize = dt[0];
-    const mixed dtPos = stepSize.y;
-    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
-    int index = get_global_id(0);
-    while (index < numAtoms) {
-        int atom = atomList[index];
-        mixed4 velocity = velm[atom];
-        if (velocity.w != 0.0) {
-#ifdef USE_MIXED_PRECISION
-            real4 pos1 = posq[atom];
-            real4 pos2 = posqCorrection[atom];
-            mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-            real4 pos = posq[atom];
-#endif
-            velocity.x += 0.5f * dtVel*force[atom].x*velocity.w;
-            velocity.y += 0.5f * dtVel*force[atom].y*velocity.w;
-            velocity.z += 0.5f * dtVel*force[atom].z*velocity.w;
-            pos.x = velocity.x*dtPos;
-            pos.y = velocity.y*dtPos;
-            pos.z = velocity.z*dtPos;
-            posDelta[atom] = pos;
-            velm[atom] = velocity;
-        }
-        index += get_global_size(0);
-    }
-    index = get_global_id(0);
-    while (index < numPairs){
-        int atom1 = pairList[index].x;
-        int atom2 = pairList[index].y;
-        mixed4 v1 = velm[atom1];
-        mixed4 v2 = velm[atom2];
-        mixed m1 = v1.w == 0.0f ? 0.0f : 1.0f / v1.w;
-        mixed m2 = v2.w == 0.0f ? 0.0f : 1.0f / v2.w;
-        mixed mass1fract = m1 / (m1 + m2);
-        mixed mass2fract = m2 / (m1 + m2);
-        mixed invRedMass = (m1 * m2 != 0.0f) ? (m1 + m2)/(m1 * m2) : 0.0f;
-        mixed invTotMass = (m1 + m2 != 0.0f) ? 1.0f /(m1 + m2) : 0.0f;
-        mixed3 comVel;
-        comVel.x= v1.x*mass1fract + v2.x*mass2fract;
-        comVel.y= v1.y*mass1fract + v2.y*mass2fract;
-        comVel.z= v1.z*mass1fract + v2.z*mass2fract;
-        mixed3 relVel;
-        relVel.x= v2.x - v1.x;
-        relVel.y= v2.y - v1.y;
-        relVel.z= v2.z - v1.z;
-        mixed3 comFrc;
-        comFrc.x = force[atom1].x + force[atom2].x;
-        comFrc.y = force[atom1].y + force[atom2].y;
-        comFrc.z = force[atom1].z + force[atom2].z;
-        mixed3 relFrc;
-        relFrc.x = mass1fract*force[atom2].x - mass2fract*force[atom1].x;
-        relFrc.y = mass1fract*force[atom2].y - mass2fract*force[atom1].y;
-        relFrc.z = mass1fract*force[atom2].z - mass2fract*force[atom1].z;
-        comVel.x += comFrc.x * 0.5f * dtVel * invTotMass;
-        comVel.y += comFrc.y * 0.5f * dtVel * invTotMass;
-        comVel.z += comFrc.z * 0.5f * dtVel * invTotMass;
-        relVel.x += relFrc.x * 0.5f * dtVel * invRedMass;
-        relVel.y += relFrc.y * 0.5f * dtVel * invRedMass;
-        relVel.z += relFrc.z * 0.5f * dtVel * invRedMass;
-#ifdef USE_MIXED_PRECISION
-        real4 posv1 = posq[atom1];
-        real4 posv2 = posq[atom2];
-        real4 posc1 = posqCorrection[atom1];
-        real4 posc2 = posqCorrection[atom2];
-        mixed4 pos1 = (mixed4) (posv1.x+(mixed)posc1.x, posv1.y+(mixed)posc1.y, posv1.z+(mixed)posc1.z, posv1.w);
-        mixed4 pos2 = (mixed4) (posv2.x+(mixed)posc2.x, posv2.y+(mixed)posc2.y, posv2.z+(mixed)posc2.z, posv2.w);
-#else
-        real4 pos1 = posq[atom1];
-        real4 pos2 = posq[atom2];
-#endif
-        if (v1.w != 0.0f) {
-            v1.x = comVel.x - relVel.x*mass2fract;
-            v1.y = comVel.y - relVel.y*mass2fract;
-            v1.z = comVel.z - relVel.z*mass2fract;
-            pos1.x = v1.x*dtPos;
-            pos1.y = v1.y*dtPos;
-            pos1.z = v1.z*dtPos;
-            posDelta[atom1] = pos1;
-            velm[atom1] = v1;
-        }
-        if (v2.w != 0.0f) {
-            v2.x = comVel.x + relVel.x*mass1fract;
-            v2.y = comVel.y + relVel.y*mass1fract;
-            v2.z = comVel.z + relVel.z*mass1fract;
-            pos2.x = v2.x*dtPos;
-            pos2.y = v2.y*dtPos;
-            pos2.z = v2.z*dtPos;
-            posDelta[atom2] = pos2;
-            velm[atom2] = v2;
-        }
-        index += get_global_size(0);
-     }
-}
-/**
- * Perform the second step of Velocity Verlet integration.
- */
-__kernel void integrateVelocityVerletPart2(int numAtoms, __global mixed2* restrict dt, __global real4* restrict posq,
-        __global real4* restrict posqCorrection, __global mixed4* restrict velm, __global const mixed4* restrict posDelta) {
-    mixed2 stepSize = dt[0];
-    int index = get_global_id(0);
-    if (index == 0)
-        dt[0].x = stepSize.y;
-    while(index < numAtoms) {
-        mixed4 velocity = velm[index];
-        if (velocity.w != 0.0) {
-#ifdef USE_MIXED_PRECISION
-            real4 pos1 = posq[index];
-            real4 pos2 = posqCorrection[index];
-            mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-            real4 pos = posq[index];
-#endif
-            mixed4 delta = posDelta[index];
-            pos.xyz += delta.xyz;
-#ifdef USE_MIXED_PRECISION
-            posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
-            posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
-#else
-            posq[index] = pos;
-#endif
-        }
-        index += get_global_size(0);
-    }
-}
-/**
- * Perform the third step of Velocity Verlet integration.
- *
- * The kernel makes two passes, each strided by the global work size:
- *   1. over the numAtoms entries of atomList, updating each listed atom on its own;
- *   2. over the numPairs entries of pairList, updating both atoms of a pair together
- *      by splitting their motion into centre-of-mass and relative components.
- * In both passes an atom whose velm.w is zero is left unmodified.
- *
- * @param numAtoms        number of entries of atomList to process
- * @param numPairs        number of entries of pairList to process
- * @param paddedNumAtoms  not referenced by this kernel
- * @param dt              dt[0].x and dt[0].y are averaged to form the velocity step;
- *                        dt[0].y alone scales the posDelta term. Work-item 0 copies
- *                        dt[0].y into dt[0].x.
- * @param posq            not referenced by this kernel
- * @param posqCorrection  not referenced by this kernel
- * @param velm            per-atom velocity in x,y,z; w is inverted to obtain the mass,
- *                        and w == 0 marks an atom that must not be updated
- * @param force           per-atom force (x,y,z are read)
- * @param posDelta        per-atom displacement (presumably the constrained position
- *                        change for this step; confirm against the caller)
- * @param atomList        indices of the individually integrated atoms
- * @param pairList        index pairs (x, y) of the atoms integrated together
- */
-__kernel void integrateVelocityVerletPart3(int numAtoms, int numPairs, int paddedNumAtoms, __global mixed2* restrict dt, __global real4* restrict posq,
-        __global real4* restrict posqCorrection, __global mixed4* restrict velm, __global const real4* restrict force, __global const mixed4* restrict posDelta,
-        __global const int* restrict atomList, __global const int2* __restrict__ pairList) {
-    mixed2 stepSize = dt[0];
-    // Use a double reciprocal only when the device supports doubles. Otherwise fall
-    // back to a float reciprocal plus a correction term built from its residual
-    // (1 - oneOverDt*dt)/dt, which is applied separately below.
-#ifdef SUPPORTS_DOUBLE_PRECISION
-    double oneOverDt = 1.0/stepSize.y;
-#else
-    float oneOverDt = 1.0f/stepSize.y;
-    float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
-#endif
-    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
-    int index = get_global_id(0);
-    // NOTE(review): other work-items read dt[0].x above with no ordering relative to
-    // this write; that only matters if dt[0].x and dt[0].y can differ - confirm.
-    if (index == 0)
-        dt[0].x = stepSize.y;
-    // Pass 1: atoms that are integrated individually.
-    while(index < numAtoms) {
-        int atom = atomList[index];
-        mixed4 velocity = velm[atom];
-        if (velocity.w != 0.0) {
-            mixed4 deltaXconstrained = posDelta[atom];
-            // Half-step force kick plus the (posDelta - v*dt)/dt term.
-            velocity.x += 0.5f * dtVel*force[atom].x*velocity.w + (deltaXconstrained.x - velocity.x*stepSize.y)*oneOverDt;
-            velocity.y += 0.5f * dtVel*force[atom].y*velocity.w + (deltaXconstrained.y - velocity.y*stepSize.y)*oneOverDt;
-            velocity.z += 0.5f * dtVel*force[atom].z*velocity.w + (deltaXconstrained.z - velocity.z*stepSize.y)*oneOverDt;
-#ifndef SUPPORTS_DOUBLE_PRECISION
-            velocity.x += (deltaXconstrained.x - velocity.x*stepSize.y)*correction;
-            velocity.y += (deltaXconstrained.y - velocity.y*stepSize.y)*correction;
-            velocity.z += (deltaXconstrained.z - velocity.z*stepSize.y)*correction;
-#endif
-            velm[atom] = velocity;
-        }
-        index += get_global_size(0);
-    }
-    // Pass 2: atom pairs, handled in centre-of-mass / relative coordinates.
-    index = get_global_id(0);
-    while(index < numPairs) {
-        int atom1 = pairList[index].x;
-        int atom2 = pairList[index].y;
-        mixed4 v1 = velm[atom1];
-        mixed4 v2 = velm[atom2];
-        // velm.w is inverted to get the mass; w == 0 is mapped to a mass of zero.
-        mixed m1 = v1.w == 0.0f ? 0.0f : 1.0f / v1.w;
-        mixed m2 = v2.w == 0.0f ? 0.0f : 1.0f / v2.w;
-        mixed mass1fract = m1 / (m1 + m2);
-        mixed mass2fract = m2 / (m1 + m2);
-        mixed invRedMass = (m1 * m2 != 0.0f) ? (m1 + m2)/(m1 * m2) : 0.0f;
-        mixed invTotMass = (m1 + m2 != 0.0f) ? 1.0f /(m1 + m2) : 0.0f;
-        mixed3 comVel;
-        comVel.x= v1.x*mass1fract + v2.x*mass2fract;
-        comVel.y= v1.y*mass1fract + v2.y*mass2fract;
-        comVel.z= v1.z*mass1fract + v2.z*mass2fract;
-        mixed3 relVel;
-        relVel.x= v2.x - v1.x;
-        relVel.y= v2.y - v1.y;
-        relVel.z= v2.z - v1.z;
-        mixed3 comFrc;
-        comFrc.x = force[atom1].x + force[atom2].x;
-        comFrc.y = force[atom1].y + force[atom2].y;
-        comFrc.z = force[atom1].z + force[atom2].z;
-        mixed3 relFrc;
-        relFrc.x = mass1fract*force[atom2].x - mass2fract*force[atom1].x;
-        relFrc.y = mass1fract*force[atom2].y - mass2fract*force[atom1].y;
-        relFrc.z = mass1fract*force[atom2].z - mass2fract*force[atom1].z;
-        // Kick the centre-of-mass motion with the total mass and the relative motion
-        // with the reduced mass.
-        comVel.x += comFrc.x * 0.5f * dtVel * invTotMass;
-        comVel.y += comFrc.y * 0.5f * dtVel * invTotMass;
-        comVel.z += comFrc.z * 0.5f * dtVel * invTotMass;
-        relVel.x += relFrc.x * 0.5f * dtVel * invRedMass;
-        relVel.y += relFrc.y * 0.5f * dtVel * invRedMass;
-        relVel.z += relFrc.z * 0.5f * dtVel * invRedMass;
-        // Recombine into per-atom velocities; only atoms with nonzero velm.w are written.
-        if (v1.w != 0.0f) {
-            mixed4 deltaXconstrained = posDelta[atom1];
-            v1.x = comVel.x - relVel.x*mass2fract + (deltaXconstrained.x - v1.x*stepSize.y)*oneOverDt;
-            v1.y = comVel.y - relVel.y*mass2fract + (deltaXconstrained.y - v1.y*stepSize.y)*oneOverDt;
-            v1.z = comVel.z - relVel.z*mass2fract + (deltaXconstrained.z - v1.z*stepSize.y)*oneOverDt;
-#ifndef SUPPORTS_DOUBLE_PRECISION
-            v1.x += (deltaXconstrained.x - v1.x*stepSize.y)*correction;
-            v1.y += (deltaXconstrained.y - v1.y*stepSize.y)*correction;
-            v1.z += (deltaXconstrained.z - v1.z*stepSize.y)*correction;
-#endif
-            velm[atom1] = v1;
-        }
-        if (v2.w != 0.0f) {
-            mixed4 deltaXconstrained = posDelta[atom2];
-            v2.x = comVel.x + relVel.x*mass1fract + (deltaXconstrained.x - v2.x*stepSize.y)*oneOverDt;
-            v2.y = comVel.y + relVel.y*mass1fract + (deltaXconstrained.y - v2.y*stepSize.y)*oneOverDt;
-            v2.z = comVel.z + relVel.z*mass1fract + (deltaXconstrained.z - v2.z*stepSize.y)*oneOverDt;
-#ifndef SUPPORTS_DOUBLE_PRECISION
-            v2.x += (deltaXconstrained.x - v2.x*stepSize.y)*correction;
-            v2.y += (deltaXconstrained.y - v2.y*stepSize.y)*correction;
-            v2.z += (deltaXconstrained.z - v2.z*stepSize.y)*correction;
-#endif
-            velm[atom2] = v2;
-        }
-        index += get_global_size(0);
-    }
-}
-/**
- * Apply a hard-wall limit to the separation of each pair in pairList.
- *
- * Pairs are visited with a stride of the global work size. When a pair's
- * separation exceeds maxPairDistance[0], the velocity components along the
- * inter-particle axis are replaced by a rebound speed derived from
- * sqrt(pairTemperature[pair]*BOLTZ), and the positions are moved back along that
- * axis. Velocity components perpendicular to the axis are kept. If
- * maxPairDistance[0] is not positive the kernel does nothing.
- *
- * @param numPairs         number of entries of pairList to process
- * @param maxPairDistance  element 0 is the maximum allowed pair separation
- * @param dt               dt[0].y is the step length used to cap the rebound time
- * @param posq             particle positions; updated for pairs that hit the wall
- * @param posqCorrection   position corrections, read and written only when
- *                         USE_MIXED_PRECISION is defined
- * @param velm             per-atom velocity in x,y,z; w is inverted to obtain the mass
- * @param pairList         index pairs; per the in-code comments x is the Drude
- *                         particle and y its parent
- * @param pairTemperature  per-pair temperature that sets the rebound speed
- */
-__kernel void integrateVelocityVerletHardWall(int numPairs, __global const float* restrict maxPairDistance, 
-        __global mixed2* restrict dt, __global real4* restrict posq,
-        __global real4* restrict posqCorrection, __global mixed4* restrict velm, 
-        __global const int2* restrict pairList, __global const float* __restrict__ pairTemperature) {
-    const mixed stepSize = dt[0].y;
-    const mixed wallDistance = (mixed) maxPairDistance[0];
-    // A limit that is not positive leaves every pair untouched.
-    if (!(wallDistance > 0))
-        return;
-    for (int pair = get_global_id(0); pair < numPairs; pair += get_global_size(0)) {
-        const mixed hardWallScale = sqrt( ((mixed) pairTemperature[pair]) * ((mixed) BOLTZ));
-        const int2 atom = (int2) (pairList[pair].x, pairList[pair].y);
-#ifdef USE_MIXED_PRECISION
-        // Combine the stored position with its correction term.
-        real4 base1 = posq[atom.x];
-        real4 extra1 = posqCorrection[atom.x];
-        mixed4 pos1 = (mixed4) (base1.x+(mixed)extra1.x, base1.y+(mixed)extra1.y, base1.z+(mixed)extra1.z, base1.w);
-        real4 base2 = posq[atom.y];
-        real4 extra2 = posqCorrection[atom.y];
-        mixed4 pos2 = (mixed4) (base2.x+(mixed)extra2.x, base2.y+(mixed)extra2.y, base2.z+(mixed)extra2.z, base2.w);
-#else
-        real4 pos1 = posq[atom.x];
-        real4 pos2 = posq[atom.y];
-#endif
-        mixed3 sep;
-        sep.x = (mixed) (pos1.x - pos2.x);
-        sep.y = (mixed) (pos1.y - pos2.y);
-        sep.z = (mixed) (pos1.z - pos2.z);
-        const mixed r = sqrt(sep.x*sep.x + sep.y*sep.y + sep.z*sep.z);
-        const mixed rInv = 1/r;
-        // Nothing to do while the pair is still inside the wall.
-        if (!(rInv*wallDistance < 1.0))
-            continue;
-
-        // The constraint has been violated, so make the inter-particle distance "bounce"
-        // off the hard wall.
-        const mixed3 axis = (mixed3) (sep.x * rInv, sep.y * rInv, sep.z * rInv);
-        const mixed4 state1 = velm[atom.x];
-        const mixed4 state2 = velm[atom.y];
-        const mixed m1 = state1.w != 0.0 ? 1.0/state1.w : 0.0;
-        const mixed m2 = state2.w != 0.0 ? 1.0/state2.w : 0.0;
-        const mixed overshoot = r-wallDistance;
-        mixed bounceTime = stepSize;
-        // Split particle 1's velocity into parts along and perpendicular to the axis.
-        mixed along1 = state1.x*axis.x + state1.y*axis.y + state1.z*axis.z;
-        const mixed3 parallel1 = (mixed3) (axis.x*along1, axis.y*along1, axis.z*along1);
-        const mixed3 perp1 = (mixed3) (state1.x-parallel1.x, state1.y-parallel1.y, state1.z-parallel1.z);
-
-        if (m2 == 0) {
-            // The parent particle is massless, so move only the Drude particle.
-            if (along1 != 0.0)
-                bounceTime = overshoot/fabs(along1);
-            if (bounceTime > stepSize)
-                bounceTime = stepSize;
-            // Reverse the axial velocity and give it the rebound magnitude.
-            along1 = -along1*hardWallScale/(fabs(along1)*sqrt(m1));
-            const mixed shift = -overshoot + bounceTime*along1;
-            pos1.x += axis.x*shift;
-            pos1.y += axis.y*shift;
-            pos1.z += axis.z*shift;
-            velm[atom.x] = (mixed4) (perp1.x + axis.x*along1, perp1.y + axis.y*along1, perp1.z + axis.z*along1, state1.w);
-#ifdef USE_MIXED_PRECISION
-            posq[atom.x] = (real4) ((real) pos1.x, (real) pos1.y, (real) pos1.z, (real) pos1.w);
-            posqCorrection[atom.x] = (real4) (pos1.x-(real) pos1.x, pos1.y-(real) pos1.y, pos1.z-(real) pos1.z, 0);
-#else
-            posq[atom.x] = pos1;
-#endif
-            continue;
-        }
-
-        // Move both particles.
-        const mixed invTotMass = (m1 + m2 != 0.0) ? 1.0 /(m1 + m2) : 0.0;
-        mixed along2 = state2.x*axis.x + state2.y*axis.y + state2.z*axis.z;
-        const mixed3 parallel2 = (mixed3) (axis.x*along2, axis.y*along2, axis.z*along2);
-        const mixed3 perp2 = (mixed3) (state2.x-parallel2.x, state2.y-parallel2.y, state2.z-parallel2.z);
-        // Work relative to the pair's centre-of-mass velocity along the axis.
-        const mixed comAlong = (m1*along1 + m2*along2)*invTotMass;
-        along1 -= comAlong;
-        along2 -= comAlong;
-        if (along1 != along2)
-            bounceTime = overshoot/fabs(along1-along2);
-        if (bounceTime > stepSize)
-            bounceTime = stepSize;
-        const mixed vBond = hardWallScale/sqrt(m1);
-        along1 = -along1*vBond*m2*invTotMass/fabs(along1);
-        along2 = -along2*vBond*m1*invTotMass/fabs(along2);
-        const mixed shift1 = -overshoot*m2*invTotMass + bounceTime*along1;
-        const mixed shift2 = overshoot*m1*invTotMass + bounceTime*along2;
-        // Return to the original frame before storing the velocities.
-        along1 += comAlong;
-        along2 += comAlong;
-        pos1.x += axis.x*shift1;
-        pos1.y += axis.y*shift1;
-        pos1.z += axis.z*shift1;
-        pos2.x += axis.x*shift2;
-        pos2.y += axis.y*shift2;
-        pos2.z += axis.z*shift2;
-        velm[atom.x] = (mixed4) (perp1.x + axis.x*along1, perp1.y + axis.y*along1, perp1.z + axis.z*along1, state1.w);
-        velm[atom.y] = (mixed4) (perp2.x + axis.x*along2, perp2.y + axis.y*along2, perp2.z + axis.z*along2, state2.w);
-#ifdef USE_MIXED_PRECISION
-        posq[atom.x] = (real4) ((real) pos1.x, (real) pos1.y, (real) pos1.z, (real) pos1.w);
-        posq[atom.y] = (real4) ((real) pos2.x, (real) pos2.y, (real) pos2.z, (real) pos2.w);
-        posqCorrection[atom.x] = (real4) (pos1.x-(real) pos1.x, pos1.y-(real) pos1.y, pos1.z-(real) pos1.z, 0);
-        posqCorrection[atom.y] = (real4) (pos2.x-(real) pos2.x, pos2.y-(real) pos2.y, pos2.z-(real) pos2.z, 0);
-#else
-        posq[atom.x] = pos1;
-        posq[atom.y] = pos2;
-#endif
-    }
-}