Correct CUDA NHC implementation

cca3e11a · Andy Simmonett · 50efef13 · cca3e11a · cca3e11a · cca3e11a
Unverified Commit cca3e11a authored Aug 27, 2019 by Andy Simmonett
8 changed files
--- a/openmmapi/src/NoseHooverIntegrator.cpp
+++ b/openmmapi/src/NoseHooverIntegrator.cpp
@@ -121,7 +121,7 @@ int NoseHooverIntegrator::addSubsystemThermostat(System& system, const std::vect
    int numForces = system.getNumForces();
    if (thermostatedPairs.size() == 0){ // remove 3 degrees of freedom from thermostats that act on absolute motions
        for (int forceNum = 0; forceNum < numForces; ++forceNum) {
-//            if (dynamic_cast<CMMotionRemover*>(&system.getForce(forceNum))) nDOF -= 3;
+            if (dynamic_cast<CMMotionRemover*>(&system.getForce(forceNum))) nDOF -= 3;
        }
    }

@@ -143,11 +143,7 @@ int NoseHooverIntegrator::addSubsystemThermostat(System& system, const std::vect
    }

    // create and add new chain
-    int chainID = 0;
-    for(const auto &c : noseHooverChains) {
-        // We need to account for the fact that a NHC with pairs thermostated has both a relative and an absolute chain
-        chainID += c.getThermostatedPairs().size() ? 2 : 1;
-    }
+    int chainID = noseHooverChains.size();
    auto nhc = NoseHooverChain(temperature, relativeTemperature, collisionFrequency, relativeCollisionFrequency,
                               nDOF, chainLength, numMTS, numYoshidaSuzuki, chainID, thermostatedParticles, thermostatedPairs);
    noseHooverChains.push_back(nhc);
@@ -232,10 +228,7 @@ double NoseHooverIntegrator::computeKineticEnergy() {
    double kE = 0.0;
    if(noseHooverChains.size()) {
        for (const auto &nhc: noseHooverChains){
-            if (nhc.getThermostatedPairs().size() == 0) {
-                auto KEs = nhcKernel.getAs<NoseHooverChainKernel>().computeMaskedKineticEnergy(*context, nhc, true);
-                kE += std::get<0>(KEs);
-            }
+            kE += nhcKernel.getAs<NoseHooverChainKernel>().computeMaskedKineticEnergy(*context, nhc, true).first;
        }
    } else {
        kE = vvKernel.getAs<IntegrateVelocityVerletStepKernel>().computeKineticEnergy(*context, *this);

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -1731,11 +1731,11 @@ public:
     * 
     * @param context  the context in which to execute this kernel
     * @param noseHooverChain the object describing the chain to be propagated.
-     * @param kineticEnergy the kineticEnergy of the particles being thermostated by this chain.
+     * @param kineticEnergies the {absolute, relative} kineticEnergy of the particles being thermostated by this chain.
     * @param timeStep the time step used by the integrator.
-     * @return the velocity scale factor to apply to the particles associated with this heat bath.
+     * @return the {absolute, relative} velocity scale factor to apply to the particles associated with this heat bath.
     */
-    virtual double propagateChain(ContextImpl& context, const NoseHooverChain &nhc, double kineticEnergy, double timeStep);
+    virtual std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep);
    /**
     * Execute the kernal that computes the total (kinetic + potential) heat bath energy.
     *
@@ -1753,26 +1753,29 @@ public:
     * @param downloadValue whether the computed value should be downloaded and returned.
     *
     */
-     virtual double computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);
+     virtual std::pair<double,double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);

    /**
     * Execute the kernel that scales the velocities of particles associated with a nose hoover chain
     *
     * @param context the context in which to execute this kernel
     * @param noseHooverChain the chain whose energy is to be determined.
-     * @param scaleFactor the multiplicative factor by which velocities are scaled.
+     * @param scaleFactors the {absolute, relative} multiplicative factor by which velocities are scaled.
     */
-    virtual void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, double scaleFactor);
+    virtual void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactors);

 private:
+    int sumWorkGroupSize;
    CudaContext& cu;
    CudaArray scaleFactorBuffer, kineticEnergyBuffer, chainMasses, chainForces, heatBathEnergy;
-    std::map<int, CudaArray> masks;
+    std::map<int, CudaArray> atomlists, pairlists;
    std::map<int, CUfunction> propagateKernels;
    CUfunction reduceEnergyKernel;
    CUfunction computeHeatBathEnergyKernel;
-    CUfunction computeMaskedKineticEnergyKernel;
-    CUfunction scaleVelocitiesKernel;
+    CUfunction computeAtomsKineticEnergyKernel;
+    CUfunction computePairsKineticEnergyKernel;
+    CUfunction scaleAtomsVelocitiesKernel;
+    CUfunction scalePairsVelocitiesKernel;
 };

 /**

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -808,7 +808,6 @@ double CudaIntegrationUtilities::computeKineticEnergy(double timeShift) {
                energy += (v.x*v.x+v.y*v.y+v.z*v.z)/v.w;
        }
    }
-    
    // Restore the velocities.
    
    if (timeShift != 0)

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -8358,6 +8358,8 @@ void CudaNoseHooverChainKernel::initialize() {
    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();

    map<string, string> defines;
+    sumWorkGroupSize = 512;
+    defines["WORK_GROUP_SIZE"] = cu.intToString(sumWorkGroupSize);
    defines["MIXEDEXP"] = useDouble ? "exp" : "expf";
    defines["BEGIN_YS_LOOP"] = "for(const real & ys : {1}) {";
    defines["END_YS_LOOP"] = "}";
@@ -8369,56 +8371,50 @@ void CudaNoseHooverChainKernel::initialize() {
    defines["BEGIN_YS_LOOP"] = "for(const real & ys : {0.2967324292201065, 0.2967324292201065, -0.186929716880426, 0.2967324292201065, 0.2967324292201065}){";
    module = cu.createModule(CudaKernelSources::noseHooverChain, defines, "-std=c++11");
    propagateKernels[5] = cu.getKernel(module, "propagateNoseHooverChain");
+    module = cu.createModule(CudaKernelSources::noseHooverChain, defines, "-std=c++11");
+    reduceEnergyKernel = cu.getKernel(module, "reduceEnergyPair");

    computeHeatBathEnergyKernel = cu.getKernel(module, "computeHeatBathEnergy");
-    computeMaskedKineticEnergyKernel = cu.getKernel(module, "computeMaskedKineticEnergy");
-    scaleVelocitiesKernel = cu.getKernel(module, "scaleVelocities");
-    CUmodule utilities = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::utilities);
-    reduceEnergyKernel = cu.getKernel(utilities, "reduceEnergy");
+    computeAtomsKineticEnergyKernel = cu.getKernel(module, "computeAtomsKineticEnergy");
+    computePairsKineticEnergyKernel = cu.getKernel(module, "computePairsKineticEnergy");
+    scaleAtomsVelocitiesKernel = cu.getKernel(module, "scaleAtomsVelocities");
+    scalePairsVelocitiesKernel = cu.getKernel(module, "scalePairsVelocities");
 }

-double CudaNoseHooverChainKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, double kineticEnergy, double timeStep) {
+std::pair<double, double> CudaNoseHooverChainKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep) {
    cu.setAsCurrent();

    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();

    int chainID = nhc.getDefaultChainID();
+    int nAtoms = nhc.getThermostatedAtoms().size();
+    int nPairs = nhc.getThermostatedPairs().size();
    int chainLength = nhc.getDefaultChainLength();
    int numYS = nhc.getDefaultNumYoshidaSuzukiTimeSteps();
    int numMTS = nhc.getDefaultNumMultiTimeSteps();
    int numDOFs = nhc.getDefaultNumDegreesOfFreedom();
    double temperature = nhc.getDefaultTemperature();
    double frequency = nhc.getDefaultCollisionFrequency();
+    double relativeTemperature = nhc.getDefaultRelativeTemperature();
+    double relativeFrequency = nhc.getDefaultRelativeCollisionFrequency();

-    auto & chainState = cu.getIntegrationUtilities().getNoseHooverChainState();
-    if (!chainState.count(chainID)) {
-        chainState[chainID] = CudaArray();
-    }
-    if (chainState.at(chainID).getSize() != chainLength) {
-        // We need to upload the CUDA array
-        if(useDouble){
-            chainState.at(chainID).initialize<double2>(cu, chainLength, "chainState" + std::to_string(chainID));
-            std::vector<double2> zeros(chainLength, make_double2(0, 0));
-            chainState.at(chainID).upload(zeros.data());
-        } else {
-            chainState.at(chainID).initialize<float2>(cu, chainLength, "chainState" + std::to_string(chainID));
-            std::vector<float2> zeros(chainLength, make_float2(0, 0));
-            chainState.at(chainID).upload(zeros.data());
-        }
+    if (numYS != 1 && numYS != 3 && numYS != 5) {
+        throw OpenMMException("Number of Yoshida Suzuki time steps has to be 1, 3, or 5.");
    }

+    auto & chainState = cu.getIntegrationUtilities().getNoseHooverChainState();
+
    if (!scaleFactorBuffer.isInitialized() ||scaleFactorBuffer.getSize() == 0) {
        if(useDouble){
-            std::vector<double> one(1);
-            scaleFactorBuffer.initialize<double>(cu, 1, "scaleFactorBuffer");
-            scaleFactorBuffer.upload(one);
+            std::vector<double2> zeros{{0,0}};
+            scaleFactorBuffer.initialize<double2>(cu, 1, "scaleFactorBuffer");
+            scaleFactorBuffer.upload(zeros);
        } else {
-            std::vector<float> one(1);
-            scaleFactorBuffer.initialize<float>(cu, 1, "scaleFactorBuffer");
-            scaleFactorBuffer.upload(one);
+            std::vector<float2> zeros{{0,0}};
+            scaleFactorBuffer.initialize<float2>(cu, 1, "scaleFactorBuffer");
+            scaleFactorBuffer.upload(zeros);
        }
    }
-
    std::vector<double> zeros(chainLength,0);
    if (!chainForces.isInitialized() || !chainMasses.isInitialized() ){
        if(useDouble){
@@ -8435,28 +8431,73 @@ double CudaNoseHooverChainKernel::propagateChain(ContextImpl& context, const Nos
    }
    if (chainForces.getSize() < chainLength) chainMasses.resize(chainLength);
    if (chainMasses.getSize() < chainLength) chainMasses.resize(chainLength);
-    double kT = BOLTZ * temperature;
-    float kTfloat = (float) kT;
+
    float timeStepFloat = (float) timeStep;
-    float frequencyFloat = (float) frequency;
    // N.B. We ignore the incoming kineticEnergy and grab it from the device buffer instead
-    void *args[] = {
-                    &chainState[chainID].getDevicePointer(),
+    if (nAtoms) {
+        if (!chainState.count(2*chainID))  chainState[2*chainID] = CudaArray();
+        if (chainState.at(2*chainID).getSize() != chainLength) {
+            // We need to upload the CUDA array
+            if(useDouble){
+                chainState.at(2*chainID).initialize<double2>(cu, chainLength, "chainState" + std::to_string(2*chainID));
+                std::vector<double2> zeros(chainLength, make_double2(0, 0));
+                chainState.at(2*chainID).upload(zeros.data());
+            } else {
+                chainState.at(2*chainID).initialize<float2>(cu, chainLength, "chainState" + std::to_string(2*chainID));
+                std::vector<float2> zeros(chainLength, make_float2(0, 0));
+                chainState.at(2*chainID).upload(zeros.data());
+            }
+        }
+        int chainType = 0;
+        double kT = BOLTZ * temperature;
+        float kTfloat = (float) kT;
+        float frequencyFloat = (float) frequency;
+        void *args[] = {
+                    &chainState[2*chainID].getDevicePointer(),
                    &kineticEnergyBuffer.getDevicePointer(),
                    &scaleFactorBuffer.getDevicePointer(),
                    &chainMasses.getDevicePointer(),
                    &chainForces.getDevicePointer(),
-                    &chainLength, &numMTS, &numDOFs,
+                    &chainType, &chainLength, &numMTS, &numDOFs,
                    &timeStepFloat,
                    useDouble ? (void*) &kT : (void*) &kTfloat, 
                    &frequencyFloat
                    };
-    if (numYS == 1 || numYS == 3 || numYS == 5) {
        cu.executeKernel(propagateKernels[numYS], args, 1, 1);
-    } else {
-        throw OpenMMException("Number of Yoshida Suzuki time steps has to be 1, 3, or 5.");
    }
-    return 0;
+    if (nPairs) {
+        if (!chainState.count(2*chainID+1)) chainState[2*chainID+1] = CudaArray();
+        if (chainState.at(2*chainID+1).getSize() != chainLength) {
+            // We need to upload the CUDA array
+            if(useDouble){
+                chainState.at(2*chainID+1).initialize<double2>(cu, chainLength, "chainState" + std::to_string(2*chainID+1));
+                std::vector<double2> zeros(chainLength, make_double2(0, 0));
+                chainState.at(2*chainID+1).upload(zeros.data());
+            } else {
+                chainState.at(2*chainID+1).initialize<float2>(cu, chainLength, "chainState" + std::to_string(2*chainID+1));
+                std::vector<float2> zeros(chainLength, make_float2(0, 0));
+                chainState.at(2*chainID+1).upload(zeros.data());
+            }
+        }
+        int chainType = 1;
+        double kT = BOLTZ * relativeTemperature;
+        int ndf = 3*nPairs;
+        float kTfloat = (float) kT;
+        float frequencyFloat = (float) relativeFrequency;
+        void *args[] = {
+                    &chainState[2*chainID+1].getDevicePointer(),
+                    &kineticEnergyBuffer.getDevicePointer(),
+                    &scaleFactorBuffer.getDevicePointer(),
+                    &chainMasses.getDevicePointer(),
+                    &chainForces.getDevicePointer(),
+                    &chainType, &chainLength, &numMTS, &ndf,
+                    &timeStepFloat,
+                    useDouble ? (void*) &kT : (void*) &kTfloat, 
+                    &frequencyFloat
+                    };
+        cu.executeKernel(propagateKernels[numYS], args, 1, 1);
+    }
+    return {0, 0};
 }

 double CudaNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc) {
@@ -8466,14 +8507,17 @@ double CudaNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, co

    int chainID = nhc.getDefaultChainID();
    int chainLength = nhc.getDefaultChainLength();
-    int numDOFs = nhc.getDefaultNumDegreesOfFreedom();
-    double temperature = nhc.getDefaultTemperature();
-    double frequency = nhc.getDefaultCollisionFrequency();

    auto & chainState = cu.getIntegrationUtilities().getNoseHooverChainState();
-    if (chainID >= chainState.size() || !chainState[chainID].isInitialized() || chainState[chainID].getSize() != chainLength) {
-        return 0.0;
-    }
+
+    bool absChainIsValid = chainState.count(2*chainID) != 0 &&
+                           chainState[2*chainID].isInitialized() &&
+                           chainState[2*chainID].getSize() == chainLength;
+    bool relChainIsValid = chainState.count(2*chainID+1) != 0 &&
+                           chainState[2*chainID+1].isInitialized() &&
+                           chainState[2*chainID+1].getSize() == chainLength;
+
+    if (!absChainIsValid && !relChainIsValid) return 0.0;

    if (!heatBathEnergy.isInitialized() || heatBathEnergy.getSize() == 0) {
        if(useDouble){
@@ -8487,15 +8531,35 @@ double CudaNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, co
        }
    }

-    double kT = BOLTZ * temperature;
-    float kTfloat = (float) kT;
-    float frequencyFloat = (float) frequency;
+    cu.clearBuffer(heatBathEnergy);
+
+    if (absChainIsValid) {
+        int numDOFs = nhc.getDefaultNumDegreesOfFreedom();
+        double temperature = nhc.getDefaultTemperature();
+        double frequency = nhc.getDefaultCollisionFrequency();
+        double kT = BOLTZ * temperature;
+        float kTfloat = (float) kT;
+        float frequencyFloat = (float) frequency;

-    void * args[] = {&heatBathEnergy.getDevicePointer(), &chainLength, &numDOFs, 
-                     useDouble ? (void*) &kT : (void*) & kTfloat,
-                     &frequencyFloat, &chainState[chainID].getDevicePointer()};
+        void * args[] = {&heatBathEnergy.getDevicePointer(), &chainLength, &numDOFs, 
+                         useDouble ? (void*) &kT : (void*) & kTfloat,
+                         &frequencyFloat, &chainState[2*chainID].getDevicePointer()};
+        cu.executeKernel(computeHeatBathEnergyKernel, args, 1, 1);
+    }
+    if (relChainIsValid) {
+        int numDOFs = 3 * nhc.getThermostatedPairs().size();
+        double temperature = nhc.getDefaultRelativeTemperature();
+        double frequency = nhc.getDefaultRelativeCollisionFrequency();
+        double kT = BOLTZ * temperature;
+        float kTfloat = (float) kT;
+        float frequencyFloat = (float) frequency;
+
+        void * args[] = {&heatBathEnergy.getDevicePointer(), &chainLength, &numDOFs, 
+                         useDouble ? (void*) &kT : (void*) & kTfloat,
+                         &frequencyFloat, &chainState[2*chainID+1].getDevicePointer()};
+        cu.executeKernel(computeHeatBathEnergyKernel, args, 1, 1);
+    }

-    cu.executeKernel(computeHeatBathEnergyKernel, args, 1, 1);

    void * pinnedBuffer = cu.getPinnedBuffer();
    heatBathEnergy.download(pinnedBuffer);
@@ -8506,78 +8570,94 @@ double CudaNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, co
    }
 }

-double CudaNoseHooverChainKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &nhc, bool downloadValue) {
+std::pair<double, double> CudaNoseHooverChainKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &nhc, bool downloadValue) {
    cu.setAsCurrent();

    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();

    int chainID = nhc.getDefaultChainID();
-    const auto & parents = nhc.getThermostatedPairs();
-    const auto & thermostatedAtoms = nhc.getThermostatedAtoms();
-    auto nAtoms = cu.getPaddedNumAtoms();
-    if (!masks.count(chainID)) { 
-        // We need to upload the CUDA array
-        std::vector<int> maskVec(nAtoms);
-        for (int i = 0; i < nAtoms; ++i) maskVec[i] = i;
-
-        if (parents.size() ) {
-            assert(parents.size() == thermostatedAtoms.size());
-            for ( size_t i = 0; i < parents.size(); ++i) {
-                int atom = thermostatedAtoms[i];
-                maskVec[atom] = parents[i];
-            }
-        } else {
-            for ( const auto & atom : thermostatedAtoms) {
-                maskVec[atom] = -1L;
-            }
+    const auto & nhcAtoms = nhc.getThermostatedAtoms();
+    const auto & nhcPairs = nhc.getThermostatedPairs();
+    auto nAtoms = nhcAtoms.size();
+    auto nPairs = nhcPairs.size();
+    if (nAtoms) {
+        if (!atomlists.count(chainID)) { 
+            // We need to upload the CUDA array
+            atomlists[chainID] = CudaArray();
+            atomlists[chainID].initialize<int>(cu, nAtoms, "atomlist" + std::to_string(chainID));
+            atomlists[chainID].upload(nhcAtoms);
+        }
+        if (atomlists[chainID].getSize() != nAtoms) {
+            throw OpenMMException("Number of atoms changed. Cannot be handled by the same Nose-Hoover thermostat.");
+        }
+    }
+    if (nPairs) {
+        if (!pairlists.count(chainID)) { 
+            // We need to upload the CUDA array
+            pairlists[chainID] = CudaArray();
+            pairlists[chainID].initialize<int2>(cu, nPairs, "pairlist" + std::to_string(chainID));
+            std::vector<int2> int2vec;
+            for(const auto &p : nhcPairs) int2vec.push_back(make_int2(p.first, p.second));
+            pairlists[chainID].upload(int2vec);
+        }
+        if (pairlists[chainID].getSize() != nPairs) {
+            throw OpenMMException("Number of thermostated pairs changed. Cannot be handled by the same Nose-Hoover thermostat.");
        }
-        masks[chainID] = CudaArray();
-        masks[chainID].initialize<int>(cu, nAtoms, "mask" + std::to_string(chainID));
-        masks[chainID].upload(maskVec);
-    }
-    if (masks[chainID].getSize() != nAtoms) {
-        throw OpenMMException("Number of atoms changed. Cannot be handled by the same Nose-Hoover thermostat.");
    }
    if (!kineticEnergyBuffer.isInitialized() || kineticEnergyBuffer.getSize() == 0) {
        if(useDouble){
-            std::vector<double> one(1);
-            kineticEnergyBuffer.initialize<double>(cu, 1, "kineticEnergyBuffer");
-            kineticEnergyBuffer.upload(one);
+            std::vector<double2> zeros{{0,0}};
+            kineticEnergyBuffer.initialize<double2>(cu, 1, "kineticEnergyBuffer");
+            kineticEnergyBuffer.upload(zeros);
        } else {
-            std::vector<float> one(1);
-            kineticEnergyBuffer.initialize<float>(cu, 1, "kineticEnergyBuffer");
-            kineticEnergyBuffer.upload(one);
+            std::vector<float2> zeros{{0,0}};
+            kineticEnergyBuffer.initialize<float2>(cu, 1, "kineticEnergyBuffer");
+            kineticEnergyBuffer.upload(zeros);
        }
    }
    cu.clearBuffer(cu.getEnergyBuffer());
-    void *args[] = {&cu.getEnergyBuffer().getDevicePointer(),&nAtoms, &cu.getVelm().getDevicePointer(), &masks[chainID].getDevicePointer()};
-    cu.executeKernel(computeMaskedKineticEnergyKernel, args, nAtoms);
+    if (nAtoms) {
+        void *args[] = {&cu.getEnergyBuffer().getDevicePointer(),&nAtoms, &cu.getVelm().getDevicePointer(), &atomlists[chainID].getDevicePointer()};
+        cu.executeKernel(computeAtomsKineticEnergyKernel, args, nAtoms);
+    }
+    if (nPairs) {
+        void *args[] = {&cu.getEnergyBuffer().getDevicePointer(),&nPairs, &cu.getVelm().getDevicePointer(), &pairlists[chainID].getDevicePointer()};
+        cu.executeKernel(computePairsKineticEnergyKernel, args, nPairs);
+    }

    //taken from CudaContext::reduceEnergy(); the final kinetic energy will live in the kineticEnergy buffer
-    int bufferSize = cu.getEnergyBuffer().getSize();
-    int workGroupSize  = 512;
-    void* args2[] = {&cu.getEnergyBuffer().getDevicePointer(), &kineticEnergyBuffer.getDevicePointer(), &bufferSize, &workGroupSize};
-    cu.executeKernel(reduceEnergyKernel, args2, workGroupSize, workGroupSize, workGroupSize*cu.getEnergyBuffer().getElementSize());
+    int bufferSize = cu.getEnergyBuffer().getSize() / 2; // Halve it to account for the fact that we're storing mixed2 instead of mixed in there
+    void* args2[] = {&cu.getEnergyBuffer().getDevicePointer(), &kineticEnergyBuffer.getDevicePointer(), &bufferSize, &sumWorkGroupSize};
+    cu.executeKernel(reduceEnergyKernel, args2, sumWorkGroupSize, sumWorkGroupSize, 2*sumWorkGroupSize*cu.getEnergyBuffer().getElementSize());

-    double value = 0;
+    std::pair<double, double> KEs = {0, 0};
    if (downloadValue) {
        void * pinnedBuffer = cu.getPinnedBuffer();
        kineticEnergyBuffer.download(pinnedBuffer);
-        value = useDouble ? *((double*) pinnedBuffer) : *((float*) pinnedBuffer);
+        KEs.first = useDouble ? *((double*) pinnedBuffer) : *((float*) pinnedBuffer);
+        KEs.second = useDouble ? *((double*) pinnedBuffer + 1) : *((float*) pinnedBuffer + 1);
    }
-    return value;
+    return KEs;
 }

-void CudaNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &nhc, double scaleFactor) {
-    // For now we assume that the mask info is valid, because computeMaskedKineticEnergy must have been
+void CudaNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> scaleFactor) {
+    // For now we assume that the atoms and pairs info is valid, because compute{Atoms|Pairs}KineticEnergy must have been
    // called before this kernel.  If that ever ceases to be true, some sanity checks are needed here.
    cu.setAsCurrent();

-    auto nAtoms = cu.getPaddedNumAtoms();
    int chainID = nhc.getDefaultChainID();
-    void *args[] = {&scaleFactorBuffer.getDevicePointer(),
-                    &nAtoms, &cu.getVelm().getDevicePointer(), &masks[chainID].getDevicePointer()};
-    cu.executeKernel(scaleVelocitiesKernel, args, nAtoms);
+    auto nAtoms = nhc.getThermostatedAtoms().size();
+    auto nPairs = nhc.getThermostatedPairs().size();
+    if(nAtoms) {
+        void *args[] = {&scaleFactorBuffer.getDevicePointer(),
+                        &nAtoms, &cu.getVelm().getDevicePointer(), &atomlists[chainID].getDevicePointer()};
+        cu.executeKernel(scaleAtomsVelocitiesKernel, args, nAtoms);
+    }
+    if(nPairs) {
+        void *args[] = {&scaleFactorBuffer.getDevicePointer(),
+                        &nPairs, &cu.getVelm().getDevicePointer(), &pairlists[chainID].getDevicePointer()};
+        cu.executeKernel(scalePairsVelocitiesKernel, args, nPairs);
+    }
 }

 void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const Force& thermostat) {

--- a/platforms/cuda/src/kernels/noseHooverChain.cu
+++ b/platforms/cuda/src/kernels/noseHooverChain.cu

 #include <initializer_list>

-extern "C" __global__ void propagateNoseHooverChain(mixed2* __restrict__ chainData, const mixed * __restrict__ energySum, mixed* __restrict__ scaleFactor,
+extern "C" __global__ void propagateNoseHooverChain(mixed2* __restrict__ chainData, const mixed2 * __restrict__ energySum, mixed2* __restrict__ scaleFactor,
                                                    mixed* __restrict__ chainMasses, mixed* __restrict__ chainForces, 
-                                                    int chainLength, int numMTS, int numDOFs, float timeStep,
+                                                    int chainType, int chainLength, int numMTS, int numDOFs, float timeStep,
                                                    mixed kT, float frequency){
-    mixed &scale = scaleFactor[0];
+    const mixed & kineticEnergy = chainType ? energySum[0].y : energySum[0].x;
+    mixed &scale = chainType ? scaleFactor[0].y : scaleFactor[0].x;
    scale = (mixed) 1;
-    const mixed & kineticEnergy = energySum[0];
    if(kineticEnergy < 1e-8) return;
    for (int bead = 0; bead < chainLength; ++bead) chainMasses[bead] = kT / (frequency * frequency);
    chainMasses[0] *= numDOFs;
@@ -49,11 +49,10 @@ extern "C" __global__ void propagateNoseHooverChain(mixed2* __restrict__ chainDa
 /**
 * Compute total (potential + kinetic) energy of the Nose-Hoover beads
 */
-extern "C" __global__ void computeHeatBathEnergy(mixed* __restrict__ heatBathEnergy, int chainLength, int numDOFs, 
+extern "C" __global__ void computeHeatBathEnergy(mixed* __restrict__ heatBathEnergy, int chainLength, int numDOFs,
                                                 mixed kT, float frequency, const mixed2* __restrict__ chainData){
-
+    // Note that this is always incremented; make sure it's zeroed properly before the first call
    mixed &energy = heatBathEnergy[0];
-    energy = (mixed) 0;

    for(int i = 0; i < chainLength; ++i) {
        mixed prefac = i ? 1 : numDOFs;
@@ -67,35 +66,105 @@ extern "C" __global__ void computeHeatBathEnergy(mixed* __restrict__ heatBathEne
    }
 }

-extern "C" __global__ void computeMaskedKineticEnergy(mixed * __restrict__ energyBuffer, int paddedNumAtoms,
-                                                      const mixed4* __restrict__ velm, const int *__restrict__ mask){
-    mixed energy = 0;
+extern "C" __global__ void computeAtomsKineticEnergy(mixed2 * __restrict__ energyBuffer, int numAtoms,
+                                                     const mixed4* __restrict__ velm, const int *__restrict__ atoms){
+    mixed2 energy = make_mixed2(0,0);
    //energy = 1; return;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < paddedNumAtoms; index += blockDim.x*gridDim.x) {
-        mixed4 v = velm[index];
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+        int atom = atoms[index];
+        mixed4 v = velm[atom];
        mixed mass = v.w == 0 ? 0 : 1 / v.w;
-        if (mask[index] >= 0){
-            const mixed4& vparent = velm[mask[index]];
-            mixed massp = vparent.w == 0 ? 0 : 1/vparent.w;
-            mass = (massp + mass) == 0 ? 0 : (massp * mass) / ( massp + mass );
-            v.x -= vparent.x;
-            v.y -= vparent.y;
-            v.z -= vparent.z;
-        }
-        energy += 0.5f * mass * (v.x*v.x + v.y*v.y + v.z*v.z);
+        energy.x += 0.5f * mass * (v.x*v.x + v.y*v.y + v.z*v.z);
    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = energy;
 }

-extern "C" __global__ void scaleVelocities(mixed * __restrict__ scaleFactor, int paddedNumAtoms,
-                                           mixed4* __restrict__ velm, const int *__restrict__ mask){
-    const mixed &scale = scaleFactor[0];
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < paddedNumAtoms; index += blockDim.x*gridDim.x) {
-        mixed4 &v = velm[index];
-        mixed4 vparent = mask[index] >= 0 ? velm[mask[index]] : make_mixed4(0.0f, 0.0f, 0.0f, 0.0f);
-        mixed maskedScale = mask[index] == index ? 1 : scale;
-        v.x = vparent.x + maskedScale * (v.x - vparent.x);
-        v.y = vparent.y + maskedScale * (v.y - vparent.y);
-        v.z = vparent.z + maskedScale * (v.z - vparent.z);
+extern "C" __global__ void computePairsKineticEnergy(mixed2 * __restrict__ energyBuffer, int numPairs,
+                                                     const mixed4* __restrict__ velm, const int2 *__restrict__ pairs){
+    mixed2 energy = make_mixed2(0,0);
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
+        int2 pair = pairs[index];
+        int atom1 = pair.x;
+        int atom2 = pair.y;
+        mixed4 v1 = velm[atom1];
+        mixed4 v2 = velm[atom2];
+        mixed m1 = v1.w == 0 ? 0 : 1 / v1.w;
+        mixed m2 = v2.w == 0 ? 0 : 1 / v2.w;
+        mixed4 cv;
+        cv.x = (m1*v1.x + m2*v2.x) / (m1 + m2);
+        cv.y = (m1*v1.y + m2*v2.y) / (m1 + m2);
+        cv.z = (m1*v1.z + m2*v2.z) / (m1 + m2);
+        mixed4 rv;
+        rv.x = v2.x - v1.x;
+        rv.y = v2.y - v1.y;
+        rv.z = v2.z - v1.z;
+        energy.x += 0.5f * (m1 + m2) * (cv.x*cv.x + cv.y*cv.y + cv.z*cv.z);
+        energy.y += 0.5f * (m1 * m2 / (m1 + m2)) * (rv.x*rv.x + rv.y*rv.y + rv.z*rv.z);
+    }
+    // The atoms version of this has been called already, so accumulate instead of assigning here
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x].x += energy.x;
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x].y += energy.y;
+}
+
+extern "C" __global__ void scaleAtomsVelocities(mixed2* __restrict__ scaleFactor, int numAtoms,
+                                                mixed4* __restrict__ velm, const int *__restrict__ atoms){
+    const mixed &scale = scaleFactor[0].x;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+        int atom = atoms[index];
+        mixed4 &v = velm[atom];
+        v.x *= scale;
+        v.y *= scale;
+        v.z *= scale;
+    }
+}
+
+extern "C" __global__ void scalePairsVelocities(mixed2 * __restrict__ scaleFactor, int numPairs,
+                                                mixed4* __restrict__ velm, const int2 *__restrict__ pairs){
+    const mixed &absScale = scaleFactor[0].x;
+    const mixed &relScale = scaleFactor[0].y;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
+        int atom1 = pairs[index].x;
+        int atom2 = pairs[index].y;
+        mixed4 &v1 = velm[atom1];
+        mixed4 &v2 = velm[atom2];
+        mixed m1 = v1.w == 0 ? 0 : 1 / v1.w;
+        mixed m2 = v2.w == 0 ? 0 : 1 / v2.w;
+        mixed4 cv;
+        cv.x = (m1*v1.x + m2*v2.x) / (m1 + m2);
+        cv.y = (m1*v1.y + m2*v2.y) / (m1 + m2);
+        cv.z = (m1*v1.z + m2*v2.z) / (m1 + m2);
+        mixed4 rv;
+        rv.x = v2.x - v1.x;
+        rv.y = v2.y - v1.y;
+        rv.z = v2.z - v1.z;
+        v1.x = absScale * cv.x - relScale * rv.x * m2 / (m1 + m2);
+        v1.y = absScale * cv.y - relScale * rv.y * m2 / (m1 + m2);
+        v1.z = absScale * cv.z - relScale * rv.z * m2 / (m1 + m2);
+        v2.x = absScale * cv.x + relScale * rv.x * m1 / (m1 + m2);
+        v2.y = absScale * cv.y + relScale * rv.y * m1 / (m1 + m2);
+        v2.z = absScale * cv.z + relScale * rv.z * m1 / (m1 + m2);
+    }
+}
+
+/**
+ * Sum the energy buffer containing a pair of energies stored as mixed2.  This is copied from utilities.cu with small modifications
+ */
+extern "C" __global__ void reduceEnergyPair(const mixed2* __restrict__ energyBuffer, mixed2* __restrict__ result, int bufferSize, int workGroupSize) {
+    __shared__ mixed2 tempBuffer[WORK_GROUP_SIZE];
+    const unsigned int thread = threadIdx.x;
+    mixed2 sum = make_mixed2(0,0);
+    for (unsigned int idx = thread; idx < bufferSize; idx += blockDim.x) {
+        sum.x += energyBuffer[idx].x;
+        sum.y += energyBuffer[idx].y;
+    }
+    tempBuffer[thread] = sum;
+    for (int i = 1; i < workGroupSize; i *= 2) {
+        __syncthreads();
+        if (thread%(i*2) == 0 && thread+i < workGroupSize) {
+            tempBuffer[thread].x += tempBuffer[thread+i].x;
+            tempBuffer[thread].y += tempBuffer[thread+i].y;
+        }
    }
+    if (thread == 0)
+        *result = tempBuffer[0];
 }
--- a/platforms/cuda/src/kernels/utilities.cu
+++ b/platforms/cuda/src/kernels/utilities.cu
@@ -99,4 +99,4 @@ __global__ void setCharges(real* __restrict__ charges, real4* __restrict__ posq,
    for (int i = blockDim.x*blockIdx.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x)
        posq[i].w = charges[atomOrder[i]];
 }
-}
\ No newline at end of file
+}
--- a/platforms/reference/src/ReferenceKernels.cpp
+++ b/platforms/reference/src/ReferenceKernels.cpp
@@ -2477,54 +2477,58 @@ std::pair<double, double> ReferenceNoseHooverChainKernel::propagateChain(Context
    // Get the variables describing the NHC
    int chainLength = nhc.getDefaultChainLength();
    int chainID = nhc.getDefaultChainID();
-    double temperature = nhc.getDefaultTemperature();
-    double collisionFrequency = nhc.getDefaultCollisionFrequency();
    int numDOFs = nhc.getDefaultNumDegreesOfFreedom();
    int numMTS = nhc.getDefaultNumMultiTimeSteps();

    // Get the state of the NHC from the context
    auto& allChainPositions = extractNoseHooverPositions(context);
    auto& allChainVelocities = extractNoseHooverVelocities(context);
-    if (allChainPositions.size() < chainID+1){
-        allChainPositions.resize(chainID+1);
-    }
-    if (allChainVelocities.size() < chainID+1){
-        allChainVelocities.resize(chainID+1);
-    }
-    auto& chainPositions = allChainPositions.at(chainID);
-    auto& chainVelocities = allChainVelocities.at(chainID);
-    if (chainPositions.size() < chainLength){
-        chainPositions.resize(chainLength, 0);
-    }
-    if (chainVelocities.size() < chainLength){
-        chainVelocities.resize(chainLength, 0);
+
+    int nAtoms = nhc.getThermostatedAtoms().size();
+    double absScale = 0;
+    if (nAtoms) {
+        if (allChainPositions.size() < 2*chainID+1){
+            allChainPositions.resize(2*chainID+1);
+        }
+        if (allChainVelocities.size() < 2*chainID+1){
+            allChainVelocities.resize(2*chainID+1);
+        }
+        auto& chainPositions = allChainPositions.at(2*chainID);
+        auto& chainVelocities = allChainVelocities.at(2*chainID);
+        if (chainPositions.size() < chainLength){
+            chainPositions.resize(chainLength, 0);
+        }
+        if (chainVelocities.size() < chainLength){
+            chainVelocities.resize(chainLength, 0);
+        }
+        double temperature = nhc.getDefaultTemperature();
+        double collisionFrequency = nhc.getDefaultCollisionFrequency();
+        absScale = chainPropagator->propagate(absKE, chainVelocities, chainPositions, numDOFs,
+                                              temperature, collisionFrequency, timeStep,
+                                              numMTS, nhc.getDefaultYoshidaSuzukiWeights());
    }
-    double absScale = chainPropagator->propagate(absKE, chainVelocities, chainPositions, numDOFs,
-                                                 temperature, collisionFrequency, timeStep,
-                                                 numMTS, nhc.getDefaultYoshidaSuzukiWeights());
    double relScale = 0;
-    int numPairs = nhc.getThermostatedPairs().size();
-    if(numPairs){
-        double relativeTemperature = nhc.getDefaultRelativeTemperature();
-        double relativeCollisionFrequency = nhc.getDefaultRelativeCollisionFrequency();
-        if (allChainPositions.size() < chainID+2){
-            allChainPositions.resize(chainID+2);
+    int nPairs = nhc.getThermostatedPairs().size();
+    if (nPairs) {
+        if (allChainPositions.size() < 2*chainID+2){
+            allChainPositions.resize(2*chainID+2);
        }
-        if (allChainVelocities.size() < chainID+2){
-            allChainVelocities.resize(chainID+2);
+        if (allChainVelocities.size() < 2*chainID+2){
+            allChainVelocities.resize(2*chainID+2);
        }
-        auto& chainPositions = allChainPositions.at(chainID+1);
-        auto& chainVelocities = allChainVelocities.at(chainID+1);
+        auto& chainPositions = allChainPositions.at(2*chainID+1);
+        auto& chainVelocities = allChainVelocities.at(2*chainID+1);
        if (chainPositions.size() < chainLength){
            chainPositions.resize(chainLength, 0);
        }
        if (chainVelocities.size() < chainLength){
            chainVelocities.resize(chainLength, 0);
        }
-        relScale = chainPropagator->propagate(relKE, chainVelocities, chainPositions, 3*numPairs,
-                                              relativeTemperature, relativeCollisionFrequency, timeStep,
+        double temperature = nhc.getDefaultRelativeTemperature();
+        double collisionFrequency = nhc.getDefaultRelativeCollisionFrequency();
+        relScale = chainPropagator->propagate(relKE, chainVelocities, chainPositions, 3*nPairs,
+                                              temperature, collisionFrequency, timeStep,
                                              numMTS, nhc.getDefaultYoshidaSuzukiWeights());
-
    }
    return {absScale, relScale};
 }
@@ -2534,21 +2538,41 @@ double ReferenceNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& contex
    double kineticEnergy = 0;
    int chainLength = nhc.getDefaultChainLength();
    int chainID = nhc.getDefaultChainID();
-    double temperature = nhc.getDefaultTemperature();
-    double collisionFrequency = nhc.getDefaultCollisionFrequency();
-    double kT = temperature * BOLTZ;
-    int numDOFs = nhc.getDefaultNumDegreesOfFreedom();
+    int nAtoms = nhc.getThermostatedAtoms().size();
+    int nPairs = nhc.getThermostatedPairs().size();
    auto& nhcPositions = extractNoseHooverPositions(context);
    auto& nhcVelocities = extractNoseHooverVelocities(context);
-    for(int i = 0; i < chainLength; ++i) {
-        double prefac = i ? 1 : numDOFs;
-        double mass = prefac * kT / (collisionFrequency * collisionFrequency);
-        double velocity = nhcVelocities[chainID][i];
-        // The kinetic energy of this bead
-        kineticEnergy += 0.5 * mass * velocity * velocity;
-        // The potential energy of this bead
-        double position = nhcPositions[chainID][i];
-        potentialEnergy += prefac * kT * position;
+    if (nAtoms) {
+        double temperature = nhc.getDefaultTemperature();
+        double collisionFrequency = nhc.getDefaultCollisionFrequency();
+        double kT = temperature * BOLTZ;
+        int numDOFs = nhc.getDefaultNumDegreesOfFreedom();
+        for(int i = 0; i < chainLength; ++i) {
+            double prefac = i ? 1 : numDOFs;
+            double mass = prefac * kT / (collisionFrequency * collisionFrequency);
+            double velocity = nhcVelocities[2*chainID][i];
+            // The kinetic energy of this bead
+            kineticEnergy += 0.5 * mass * velocity * velocity;
+            // The potential energy of this bead
+            double position = nhcPositions[2*chainID][i];
+            potentialEnergy += prefac * kT * position;
+        }
+    }
+    if (nPairs) {
+        double temperature = nhc.getDefaultRelativeTemperature();
+        double collisionFrequency = nhc.getDefaultRelativeCollisionFrequency();
+        double kT = temperature * BOLTZ;
+        int numDOFs = 3 * nPairs;
+        for(int i = 0; i < chainLength; ++i) {
+            double prefac = i ? 1 : numDOFs;
+            double mass = prefac * kT / (collisionFrequency * collisionFrequency);
+            double velocity = nhcVelocities[2*chainID+1][i];
+            // The kinetic energy of this bead
+            kineticEnergy += 0.5 * mass * velocity * velocity;
+            // The potential energy of this bead
+            double position = nhcPositions[2*chainID+1][i];
+            potentialEnergy += prefac * kT * position;
+        }
    }
    return kineticEnergy + potentialEnergy;
 }

--- a/plugins/drude/tests/TestDrudeNoseHoover.h
+++ b/plugins/drude/tests/TestDrudeNoseHoover.h
@@ -117,8 +117,8 @@ void testWaterBox(Platform& platform){
    int chainLength = 4;
    int numMTS = 3;
    int numYS = 3;
-    double frequency = 50.0;
-    double frequencyDrude = 25.0;
+    double frequency = 100.0;
+    double frequencyDrude = 80.0;
    int randomSeed = 100;
    DrudeNoseHooverIntegrator integ(0.0005, system, temperature, temperatureDrude,
                                    frequency, frequencyDrude, chainLength, numMTS, numYS);;
@@ -138,12 +138,12 @@ void testWaterBox(Platform& platform){
    context.applyConstraints(1e-6);
    // Equilibrate.
    
-    integ.step(5000);
+    integ.step(800);
    
    // Compute the internal and center of mass temperatures.
    
    double totalKE = 0;
-    int numSteps = 100000;
+    const int numSteps = 800;
    double meanTemp = 0.0;
    double meanDrudeTemp = 0.0;
    double meanConserved = 0.0;
@@ -154,7 +154,6 @@ void testWaterBox(Platform& platform){
        double PE = state.getPotentialEnergy();
        double fullKE = integ.computeTotalKineticEnergy();
        double drudeKE = integ.computeDrudeKineticEnergy();
-        KE = fullKE - drudeKE;
        double temp = KE/(0.5*numStandardDof*BOLTZ);
        double drudeTemp = drudeKE/(0.5*numDrudeDof*BOLTZ);
        meanTemp = (i*meanTemp + temp)/(i+1);
@@ -162,7 +161,7 @@ void testWaterBox(Platform& platform){
        double heatBathEnergy = integ.computeHeatBathEnergy();
        double conserved = PE + fullKE + heatBathEnergy;
        meanConserved = (i*meanConserved + conserved)/(i+1);
-#define DEBUG 1
+#define DEBUG 0
 #if DEBUG
        if(i%10 == 0)
        std::cout << std::setw(6) << i
@@ -170,15 +169,17 @@ void testWaterBox(Platform& platform){
                  << std::setprecision(8) << std::setw(16) << drudeKE
                  << std::setprecision(8) << std::setw(16) << meanTemp
                  << std::setprecision(8) << std::setw(16) << meanDrudeTemp
+                  << std::setprecision(8) << std::setw(16) << heatBathEnergy
+                  << std::setprecision(8) << std::setw(16) << fullKE
                  << std::setprecision(8) << std::setw(16) << conserved
                  << std::endl;
 #endif
        totalKE += KE;
-//        ASSERT(fabs(meanConserved - conserved) < 0.5);
+        ASSERT(fabs(meanConserved - conserved) < 0.2);
    }
    totalKE /= numSteps;
-//    ASSERT_USUALLY_EQUAL_TOL(temperature, meanTemp, 0.02);
-//    ASSERT_USUALLY_EQUAL_TOL(temperatureDrude, meanDrudeTemp, 0.10);
+    ASSERT_USUALLY_EQUAL_TOL(temperature, meanTemp, 0.03);
+    ASSERT_USUALLY_EQUAL_TOL(temperatureDrude, meanDrudeTemp, 0.03);
 }

 int main(int argc, char* argv[]) {