Nose-Hoover working on CUDA

6307eb0d · Andy Simmonett · 799d4b1f · 6307eb0d · 6307eb0d · 6307eb0d
Unverified Commit 6307eb0d authored May 16, 2019 by Andy Simmonett
8 changed files
--- a/openmmapi/src/VelocityVerletIntegrator.cpp
+++ b/openmmapi/src/VelocityVerletIntegrator.cpp
@@ -64,11 +64,10 @@ int VelocityVerletIntegrator::addNoseHooverChainThermostat(System& system, doubl
    if (context) {
        throw OpenMMException("addNoseHooverChainThermostat cannot be called after binding this integrator to a context.");
    }
-    std::vector<int> mask, parents;
    int nDOF = 0;
    int numForces = system.getNumForces();
-    vector<int> thermostatedParticles;
+    std::vector<int> thermostatedParticles;
-    vector<int> parentParticles;
+    std::vector<int> parentParticles;
    for(int particle = 0; particle < system.getNumParticles(); ++particle) {
        if(system.getParticleMass(particle) > 0) {
            nDOF += 3;

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -1762,9 +1762,14 @@ public:
 private:
    CudaContext& cu;
-    CudaArray scaleFactor, chainMasses, chainForces, heatBathEnergy;
+    CudaArray scaleFactorBuffer, kineticEnergyBuffer, chainMasses, chainForces, heatBathEnergy;
+    std::vector<CudaArray> masks;
    std::map<int, CUfunction> propagateKernels;
+    CUfunction reduceEnergyKernel;
    CUfunction computeHeatBathEnergyKernel;
+    CUfunction computeMaskedKineticEnergyKernel;
+    CUfunction scaleVelocitiesKernel;
+    CUfunction zeroEnergyBuffersKernel;
 };
 /**

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -57,6 +57,7 @@
 #include <cmath>
 #include <iterator>
 #include <set>
+#include <assert.h>
 using namespace OpenMM;
 using namespace std;
@@ -8365,6 +8366,10 @@ void CudaNoseHooverChainKernel::initialize() {
    propagateKernels[5] = cu.getKernel(module, "propagateNoseHooverChain");
    computeHeatBathEnergyKernel = cu.getKernel(module, "computeHeatBathEnergy");
+    computeMaskedKineticEnergyKernel = cu.getKernel(module, "computeMaskedKineticEnergy");
+    scaleVelocitiesKernel = cu.getKernel(module, "scaleVelocities");
+    CUmodule utilities = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::utilities);
+    reduceEnergyKernel = cu.getKernel(utilities, "reduceEnergy");
 }
 double CudaNoseHooverChainKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, double kineticEnergy, double timeStep) {
@@ -8386,24 +8391,24 @@ double CudaNoseHooverChainKernel::propagateChain(ContextImpl& context, const Nos
        // We need to upload the CUDA array
        if(useDouble){
            chainState[chainID].initialize<double2>(cu, chainLength, "chainState" + std::to_string(chainID));
-            std::vector<double2> zeros(chainLength, {0.0, 0.0});
+            std::vector<double2> zeros(chainLength, make_double2(0, 0));
-            chainState[chainID].upload(zeros);
+            chainState[chainID].upload(zeros.data());
        } else {
            chainState[chainID].initialize<float2>(cu, chainLength, "chainState" + std::to_string(chainID));
-            std::vector<float2> zeros(chainLength, {0.0, 0.0});
+            std::vector<float2> zeros(chainLength, make_float2(0, 0));
-            chainState[chainID].upload(zeros);
+            chainState[chainID].upload(zeros.data());
        }
    }
-    if (!scaleFactor.isInitialized() ||scaleFactor.getSize() == 0) {
+    if (!scaleFactorBuffer.isInitialized() ||scaleFactorBuffer.getSize() == 0) {
        if(useDouble){
            std::vector<double> one(1);
-            scaleFactor.initialize<double>(cu, 1, "scaleFactor");
+            scaleFactorBuffer.initialize<double>(cu, 1, "scaleFactorBuffer");
-            scaleFactor.upload(one);
+            scaleFactorBuffer.upload(one);
        } else {
            std::vector<float> one(1);
-            scaleFactor.initialize<float>(cu, 1, "scaleFactor");
+            scaleFactorBuffer.initialize<float>(cu, 1, "scaleFactorBuffer");
-            scaleFactor.upload(one);
+            scaleFactorBuffer.upload(one);
        }
    }
@@ -8422,23 +8427,24 @@ double CudaNoseHooverChainKernel::propagateChain(ContextImpl& context, const Nos
            chainForces.upload(zeros);
        }
    }
    double kT = BOLTZ * temperature;
    float kTfloat = (float) kT;
    float timeStepFloat = (float) timeStep;
    float frequencyFloat = (float) frequency;
-    float kineticEnergyFloat = (float) kineticEnergy;
+    // N.B. We ignore the incoming kineticEnergy and grab it from the device buffer instead
+    void *args[] = {
-    void *args[] = {&scaleFactor.getDevicePointer(),
+                    &chainState[chainID].getDevicePointer(),
+                    &kineticEnergyBuffer.getDevicePointer(),
+                    &scaleFactorBuffer.getDevicePointer(),
                    &chainMasses.getDevicePointer(),
                    &chainForces.getDevicePointer(),
                    &chainLength, &numMTS, &numDOFs,
                    &timeStepFloat,
                    useDouble ? (void*) &kT : (void*) &kTfloat, 
-                    &frequencyFloat,
+                    &frequencyFloat
-                    useDouble ? (void*) &kineticEnergy : (void*) &kineticEnergyFloat, &chainState[chainID].getDevicePointer()};
+                    };
    if (numYS == 1 || numYS == 3 || numYS == 5) {
-        cu.executeKernel(propagateKernels[numYS], args, 1);
+        cu.executeKernel(propagateKernels[numYS], args, 1, 1);
    } else {
        throw OpenMMException("Number of Yoshida Suzuki time steps has to be 1, 3, or 5.");
    }
@@ -8481,25 +8487,87 @@ double CudaNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, co
                     useDouble ? (void*) &kT : (void*) & kTfloat,
                     &frequencyFloat, &chainState[chainID].getDevicePointer()};
-    cu.executeKernel(computeHeatBathEnergyKernel, args, 1);
+    cu.executeKernel(computeHeatBathEnergyKernel, args, 1, 1);
+    void * pinnedBuffer = cu.getPinnedBuffer();
+    heatBathEnergy.download(pinnedBuffer);
    if (useDouble){
-        std::vector<double> energy(1);
+        return *((double*) pinnedBuffer);
-        heatBathEnergy.download(energy);
-        return energy[0];
    } else {
-        std::vector<float> energy(1);
+        return *((float*) pinnedBuffer);
-        heatBathEnergy.download(energy);
-        return (double) energy[0];
    }
 }
-double CudaNoseHooverChainKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain) {
+double CudaNoseHooverChainKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &nhc) {
    cu.setAsCurrent();
-    return 1;
+    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
+    int chainID = nhc.getDefaultChainID();
+    const auto & parents = nhc.getParentAtoms();
+    const auto & thermostatedAtoms = nhc.getThermostatedAtoms();
+    if (chainID >= masks.size()) masks.resize(chainID+1);
+    auto nAtoms = cu.getPaddedNumAtoms();
+    if (!masks[chainID].isInitialized()) { 
+        // We need to upload the CUDA array
+        std::vector<int> maskVec(nAtoms);
+        for (int i = 0; i < nAtoms; ++i) maskVec[i] = i;
+        if (parents.size() ) {
+            assert(parents.size() == thermostatedAtoms.size());
+            for ( size_t i = 0; i < parents.size(); ++i) {
+                int atom = thermostatedAtoms[i];
+                maskVec[atom] = parents[i];
+            }
+        } else {
+            for ( const auto & atom : thermostatedAtoms) {
+                maskVec[atom] = -1L;
+            }
+        }
+        // Account for padding in the paddedNumAtoms loops
+        for(int i = thermostatedAtoms.size(); i < nAtoms; ++i){
+            maskVec[i] = i;
+        }
+        masks[chainID].initialize<int>(cu, nAtoms, "mask" + std::to_string(chainID));
+        masks[chainID].upload(maskVec);
+    }
+    if (masks[chainID].getSize() != nAtoms) {
+        throw OpenMMException("Number of atoms changed. Cannot be handled by the same Nose-Hoover thermostat.");
+    }
+    if (!kineticEnergyBuffer.isInitialized() || kineticEnergyBuffer.getSize() == 0) {
+        if(useDouble){
+            std::vector<double> one(1);
+            kineticEnergyBuffer.initialize<double>(cu, 1, "kineticEnergyBuffer");
+            kineticEnergyBuffer.upload(one);
+        } else {
+            std::vector<float> one(1);
+            kineticEnergyBuffer.initialize<float>(cu, 1, "kineticEnergyBuffer");
+            kineticEnergyBuffer.upload(one);
+        }
+    }
+    cu.clearBuffer(cu.getEnergyBuffer());
+    void *args[] = {&cu.getEnergyBuffer().getDevicePointer(),&nAtoms, &cu.getVelm().getDevicePointer(), &masks[chainID].getDevicePointer()};
+    cu.executeKernel(computeMaskedKineticEnergyKernel, args, nAtoms);
+    //taken from CudaContext::reduceEnergy(); the final kinetic energy will live in the kineticEnergy buffer
+    int bufferSize = cu.getEnergyBuffer().getSize();
+    int workGroupSize  = 512;
+    void* args2[] = {&cu.getEnergyBuffer().getDevicePointer(), &kineticEnergyBuffer.getDevicePointer(), &bufferSize, &workGroupSize};
+    cu.executeKernel(reduceEnergyKernel, args2, workGroupSize, workGroupSize, workGroupSize*cu.getEnergyBuffer().getElementSize());
+    return 0;
 }
-void CudaNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, double scaleFactor) {
+void CudaNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &nhc, double scaleFactor) {
+    // For now we assume that the mask info is valid, because computeMaskedKineticEnergy must have been
+    // called before this kernel.  If that ever ceases to be true, some sanity checks are needed here.
+    cu.setAsCurrent();
+    auto nAtoms = cu.getPaddedNumAtoms();
+    int chainID = nhc.getDefaultChainID();
+    void *args[] = {&scaleFactorBuffer.getDevicePointer(),
+                    &nAtoms, &cu.getVelm().getDevicePointer(), &masks[chainID].getDevicePointer()};
+    cu.executeKernel(scaleVelocitiesKernel, args, nAtoms);
 }
 void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const Force& thermostat) {

--- a/platforms/cuda/src/kernels/noseHooverChain.cu
+++ b/platforms/cuda/src/kernels/noseHooverChain.cu
@@ -4,67 +4,101 @@
 /**
 * Propagate the Nose-Hoover chain with one yoshida-suzuki term
 */
-extern "C" __global__ void propagateNoseHooverChain(real* __restrict__ scaleFactor, real* __restrict__ chainMasses, real* __restrict__ chainForces, 
+extern "C" __global__ void propagateNoseHooverChain(mixed2* __restrict__ chainData, const mixed * __restrict__ energySum, mixed* __restrict__ scaleFactor,
+                                                    mixed* __restrict__ chainMasses, mixed* __restrict__ chainForces, 
                                                    int chainLength, int numMTS, int numDOFs, float timeStep,
-                                                    real kT, float frequency, real kineticEnergy, real2* __restrict__ chainData){
+                                                    mixed kT, float frequency){
+    mixed &scale = scaleFactor[0];
-     real &scale = scaleFactor[0];
+    scale = (mixed) 1;
-     scale = (real) 1;
+    const mixed & kineticEnergy = energySum[0];
-     for (int bead = 0; bead < chainLength; ++bead) chainMasses[bead] = kT / (frequency * frequency);
+    if(kineticEnergy < 1e-8) return;
-     chainMasses[0] *= numDOFs;
+    for (int bead = 0; bead < chainLength; ++bead) chainMasses[bead] = kT / (frequency * frequency);
-     real KE2 = 2.0f * kineticEnergy;
+    chainMasses[0] *= numDOFs;
-     real timeOverMTS = timeStep / numMTS;
+    mixed KE2 = 2.0f * kineticEnergy;
-     chainForces[0] = (KE2 - numDOFs * kT) / chainMasses[0];
+    mixed timeOverMTS = timeStep / numMTS;
-     for (int bead = 0; bead < chainLength - 1; ++bead) {
+    chainForces[0] = (KE2 - numDOFs * kT) / chainMasses[0];
-         chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
+    for (int bead = 0; bead < chainLength - 1; ++bead) {
-     }
+        chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
-     for (int mts = 0; mts < numMTS; ++mts) {
+    }
-         BEGIN_YS_LOOP
+    for (int mts = 0; mts < numMTS; ++mts) {
-             real wdt = ys * timeOverMTS;
+        BEGIN_YS_LOOP
-             chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
+            mixed wdt = ys * timeOverMTS;
-             for (int bead = chainLength - 2; bead >= 0; --bead) {
+            chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
-                 real aa = exp(-0.125f * wdt * chainData[bead + 1].y);
+            for (int bead = chainLength - 2; bead >= 0; --bead) {
-                 chainData[bead].y = aa * (chainData[bead].y * aa + 0.25f * wdt * chainForces[bead]);
+                mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y);
-             }
+                chainData[bead].y = aa * (chainData[bead].y * aa + 0.25f * wdt * chainForces[bead]);
-             // update particle velocities
+            }
-             real aa = exp(-0.5f * wdt * chainData[0].y);
+            // update particle velocities
-             scale *= aa;
+            mixed aa = EXP(-0.5f * wdt * chainData[0].y);
-             // update the thermostat positions
+            scale *= aa;
-             for (int bead = 0; bead < chainLength; ++bead) {
+            // update the thermostat positions
-                 chainData[bead].x += 0.5f * chainData[bead].y * wdt;
+            for (int bead = 0; bead < chainLength; ++bead) {
-             }
+                chainData[bead].x += 0.5f * chainData[bead].y * wdt;
-             // update the forces
+            }
-             chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0];
+            // update the forces
-             // update thermostat velocities
+            chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0];
-             for (int bead = 0; bead < chainLength - 1; ++bead) {
+            // update thermostat velocities
-                 real aa = exp(-0.125f * wdt * chainData[bead + 1].y);
+            for (int bead = 0; bead < chainLength - 1; ++bead) {
-                 chainData[bead].y = aa * (aa * chainData[bead].y + 0.25f * wdt * chainForces[bead]);
+                mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y);
-                 chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
+                chainData[bead].y = aa * (aa * chainData[bead].y + 0.25f * wdt * chainForces[bead]);
-             }
+                chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
-             chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
+            }
-         END_YS_LOOP
+            chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
-     } // MTS loop
+        END_YS_LOOP
-     //printf("SCALE %f\n", scale);
+    } // MTS loop
 }
 /**
 * Compute total (potential + kinetic) energy of the Nose-Hoover beads
 */
-extern "C" __global__ void computeHeatBathEnergy(real* __restrict__ heatBathEnergy, int chainLength, int numDOFs, 
+extern "C" __global__ void computeHeatBathEnergy(mixed* __restrict__ heatBathEnergy, int chainLength, int numDOFs, 
-                                                 real kT, float frequency, real2* __restrict__ chainData){
+                                                 mixed kT, float frequency, const mixed2* __restrict__ chainData){
-    real &energy = heatBathEnergy[0];
+    mixed &energy = heatBathEnergy[0];
-    energy = (real) 0;
+    energy = (mixed) 0;
    for(int i = 0; i < chainLength; ++i) {
-        real prefac = i ? 1 : numDOFs;
+        mixed prefac = i ? 1 : numDOFs;
-        real mass = prefac * kT / (frequency * frequency);
+        mixed mass = prefac * kT / (frequency * frequency);
-        real velocity = chainData[i].y; 
+        mixed velocity = chainData[i].y; 
        // The kinetic energy of this bead
        energy += 0.5f * mass * velocity * velocity;
        // The potential energy of this bead
-        real position = chainData[i].x;
+        mixed position = chainData[i].x;
        energy += prefac * kT * position;
    }
 }
+extern "C" __global__ void computeMaskedKineticEnergy(mixed * __restrict__ energyBuffer, int paddedNumAtoms,
+                                                      const mixed4* __restrict__ velm, const int *__restrict__ mask){
+    mixed energy = 0;
+    //energy = 1; return;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < paddedNumAtoms; index += blockDim.x*gridDim.x) {
+        mixed4 v = velm[index];
+        mixed mass = v.w == 0 ? 0 : 1 / v.w;
+        if (mask[index] >= 0){
+            const mixed4& vparent = velm[mask[index]];
+            mixed massp = vparent.w == 0 ? 0 : 1/vparent.w;
+            mass = (massp + mass) == 0 ? 0 : (massp * mass) / ( massp + mass );
+            v.x -= vparent.x;
+            v.y -= vparent.y;
+            v.z -= vparent.z;
+        }
+        energy += 0.5f * mass * (v.x*v.x + v.y*v.y + v.z*v.z);
+    }
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = energy;
+}
+extern "C" __global__ void scaleVelocities(mixed * __restrict__ scaleFactor, int paddedNumAtoms,
+                                           mixed4* __restrict__ velm, const int *__restrict__ mask){
+    const mixed &scale = scaleFactor[0];
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < paddedNumAtoms; index += blockDim.x*gridDim.x) {
+        mixed4 &v = velm[index];
+        mixed4 vparent = mask[index] >= 0 ? velm[mask[index]] : make_mixed4(0.0f, 0.0f, 0.0f, 0.0f);
+        mixed maskedScale = mask[index] == index ? 1 : scale;
+        v.x = vparent.x + maskedScale * (v.x - vparent.x);
+        v.y = vparent.y + maskedScale * (v.y - vparent.y);
+        v.z = vparent.z + maskedScale * (v.z - vparent.z);
+    }
+}
--- a/platforms/cuda/tests/TestCudaNoseHooverThermostat.cpp
+++ b/platforms/cuda/tests/TestCudaNoseHooverThermostat.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Andreas Krämer and Andrew C. Simmonett                            *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "CudaTests.h"
+#include "TestNoseHooverThermostat.h"
+void runPlatformTests() {
+}
--- a/platforms/reference/tests/TestReferenceNoseHooverThermostat.cpp
+++ b/platforms/reference/tests/TestReferenceNoseHooverThermostat.cpp
@@ -33,4 +33,6 @@
 #include "TestNoseHooverThermostat.h"
 void runPlatformTests() {
+    testNHCPropagation();
+    testPropagateChainConsistentWithPythonReference();
 }
--- a/tests/TestNoseHooverThermostat.h
+++ b/tests/TestNoseHooverThermostat.h
@@ -265,8 +265,6 @@ void runPlatformTests();
 int main(int argc, char* argv[]) {
    try {
        initializeTests(argc, argv);
-        testNHCPropagation();
-        testPropagateChainConsistentWithPythonReference();
        testHarmonicOscillator();
        bool constrain;
        constrain = false; testDimerBox(constrain);

--- a/wrappers/python/setup.py
+++ b/wrappers/python/setup.py
@@ -184,7 +184,7 @@ def buildKeywordDictionary(major_version_num=MAJOR_VERSION_NUM,
    if not openmm_lib_path:
        reportError("Set OPENMM_LIB_PATH to point to the lib directory for OpenMM")
-    extra_compile_args=[]
+    extra_compile_args=['-std=c++11']
    extra_link_args=[]
    if platform.system() == "Windows":
        define_macros.append( ('WIN32', None) )
@@ -193,7 +193,7 @@ def buildKeywordDictionary(major_version_num=MAJOR_VERSION_NUM,
        extra_compile_args.append('/EHsc')
    else:
        if platform.system() == 'Darwin':
-            extra_compile_args += ['-stdlib=libc++', '-mmacosx-version-min=10.7']
+            extra_compile_args += ['-std=c++11 -stdlib=libc++', '-mmacosx-version-min=10.7']
            extra_link_args += ['-stdlib=libc++', '-mmacosx-version-min=10.7', '-Wl', '-rpath', openmm_lib_path]
            # Hard-code CC and CXX to clang, since gcc/g++ will *not* work with
            # Anaconda, despite the fact that distutils will try to use them.