Add Cuda velocity verlet implementation, correct virtual site computation

6ed75b19 · Andy Simmonett · 63a9e6c2 · 6ed75b19 · 6ed75b19 · 6ed75b19
Unverified Commit 6ed75b19 authored May 13, 2019 by Andy Simmonett
8 changed files
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -1366,6 +1366,40 @@ private:
    CUfunction kernel1, kernel2;
 };

+/**
+ * This kernel is invoked by VelocityVerletIntegrator to take one time step.
+ */
+class CudaIntegrateVelocityVerletStepKernel : public IntegrateVelocityVerletStepKernel {
+public:
+    CudaIntegrateVelocityVerletStepKernel(std::string name, const Platform& platform, CudaContext& cu) :
+                                  IntegrateVelocityVerletStepKernel(name, platform), cu(cu) { }
+    ~CudaIntegrateVelocityVerletStepKernel() {}
+    /**
+     * Initialize the kernel: builds the velocityVerlet module and looks up its three device functions.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param integrator the VelocityVerletIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const VelocityVerletIntegrator& integrator);
+    /**
+     * Execute the kernel, advancing the simulation time by one step size and the step count by one.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the VelocityVerletIntegrator this kernel is being used for
+     */
+    void execute(ContextImpl& context, const VelocityVerletIntegrator& integrator);
+    /**
+     * Compute the kinetic energy and return it.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the VelocityVerletIntegrator this kernel is being used for
+     */
+    double computeKineticEnergy(ContextImpl& context, const VelocityVerletIntegrator& integrator);
+private:
+    CudaContext& cu; // context supplied to the constructor; stored by reference, not owned
+    CUfunction kernel1, kernel2, kernel3; // integrateVelocityVerletPart1/2/3, assigned in initialize()
+};
+
 /**
 * This kernel is invoked by LangevinIntegrator to take one time step.
 */
@@ -1677,6 +1711,60 @@ private:
    CUfunction kernel;
 };

+/**
+ * This kernel is invoked by NoseHooverChain at the start of each time step to adjust the thermostat
+ * and update the associated particle velocities.
+ */
+class CudaNoseHooverChainKernel : public NoseHooverChainKernel {
+public:
+    CudaNoseHooverChainKernel(std::string name, const Platform& platform, CudaContext& cu) : NoseHooverChainKernel(name, platform), cu(cu) {
+    }
+    ~CudaNoseHooverChainKernel() {}
+    /**
+     * Initialize the kernel.
+     */
+    virtual void initialize();
+    /**
+     * Execute the kernel that propagates the Nose Hoover chain and determines the velocity scale factor.
+     * 
+     * @param context  the context in which to execute this kernel
+     * @param nhc the object describing the chain to be propagated.
+     * @param kineticEnergy the kinetic energy of the particles being thermostated by this chain.
+     * @param timeStep the time step used by the integrator.
+     * @return the velocity scale factor to apply to the particles associated with this heat bath.
+     */
+    virtual double propagateChain(ContextImpl& context, const NoseHooverChain &nhc, double kineticEnergy, double timeStep);
+    /**
+     * Execute the kernel that computes the total (kinetic + potential) heat bath energy.
+     *
+     * @param context the context in which to execute this kernel
+     * @param nhc the chain whose energy is to be determined.
+     * @return the total heat bath energy.
+     */
+    virtual double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc);
+    /**
+     * Execute the kernel that computes and returns the kinetic energy for a subset of atoms,
+     * or the relative kinetic energy of Drude particles with respect to their parent atoms.
+     *
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose kinetic energy is to be determined.
+     */
+     virtual double computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain);
+
+    /**
+     * Execute the kernel that scales the velocities of particles associated with a Nose Hoover chain.
+     *
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose associated particle velocities are to be scaled.
+     * @param scaleFactor the multiplicative factor by which velocities are scaled.
+     */
+    virtual void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, double scaleFactor);
+
+private:
+    CudaContext& cu; // context supplied to the constructor; stored by reference, not owned
+    CUfunction kernel; // NOTE(review): no member function in this patch assigns this handle - confirm before use
+};
+
 /**
 * This kernel is invoked by MonteCarloBarostat to adjust the periodic box volume
 */

--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -130,6 +130,10 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaIntegrateCustomStepKernel(name, platform, cu);
    if (name == ApplyAndersenThermostatKernel::Name())
        return new CudaApplyAndersenThermostatKernel(name, platform, cu);
+    if (name == NoseHooverChainKernel::Name())
+        return new CudaNoseHooverChainKernel(name, platform, cu);
+    if (name == IntegrateVelocityVerletStepKernel::Name())
+        return new CudaIntegrateVelocityVerletStepKernel(name, platform, cu);
    if (name == ApplyMonteCarloBarostatKernel::Name())
        return new CudaApplyMonteCarloBarostatKernel(name, platform, cu);
    if (name == RemoveCMMotionKernel::Name())

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -37,6 +37,7 @@
 #include "openmm/internal/CustomManyParticleForceImpl.h"
 #include "openmm/internal/CustomNonbondedForceImpl.h"
 #include "openmm/internal/NonbondedForceImpl.h"
+#include "openmm/internal/NoseHooverChainImpl.h"
 #include "openmm/internal/OSRngSeed.h"
 #include "CudaBondedUtilities.h"
 #include "CudaExpressionUtilities.h"
@@ -7075,6 +7076,67 @@ double CudaIntegrateVerletStepKernel::computeKineticEnergy(ContextImpl& context,
    return cu.getIntegrationUtilities().computeKineticEnergy(0.5*integrator.getStepSize());
 }

+void CudaIntegrateVelocityVerletStepKernel::initialize(const System& system, const VelocityVerletIntegrator& integrator) { // integrator is not read here
+    cu.getPlatformData().initializeContexts(system);
+    cu.setAsCurrent();
+    map<string, string> defines; // passed to createModule with no entries added
+    CUmodule module = cu.createModule(CudaKernelSources::velocityVerlet, defines, "");
+    kernel1 = cu.getKernel(module, "integrateVelocityVerletPart1"); // launched first in execute()
+    kernel2 = cu.getKernel(module, "integrateVelocityVerletPart2"); // launched after applyConstraints()
+    kernel3 = cu.getKernel(module, "integrateVelocityVerletPart3"); // launched after the force recalculation
+}
+
+void CudaIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, const VelocityVerletIntegrator& integrator) {
+    cu.setAsCurrent();
+    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
+    int numAtoms = cu.getNumAtoms();
+    int paddedNumAtoms = cu.getPaddedNumAtoms(); // used by the kernels as the stride between the x, y and z force components
+    double dt = integrator.getStepSize();
+    cu.getIntegrationUtilities().setNextStepSize(dt);
+
+    //// Call the first integration kernel: updates velocities from the current forces and writes velocity*dt into posDelta.
+
+    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); // 0 unless mixed precision is enabled
+    void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
+            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
+    cu.executeKernel(kernel1, args1, numAtoms, 128); // presumably 128 is the thread-block size - confirm against CudaContext::executeKernel
+
+    //// Apply constraints.
+
+    integration.applyConstraints(integrator.getConstraintTolerance());
+
+    //// Call the second integration kernel: adds posDelta to the stored positions.
+
+    void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
+            &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
+    cu.executeKernel(kernel2, args2, numAtoms, 128);
+
+    integration.computeVirtualSites(); // runs after the position update and before the force evaluation below
+
+    //// Update forces (requests forces but not energy) now that positions have been moved.
+    context.calcForcesAndEnergy(true, false);
+
+    //// Call the third integration kernel: second velocity update, using the force buffer refreshed above.
+
+    void* args3[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
+            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
+    cu.executeKernel(kernel3, args3, numAtoms, 128);
+
+    // TODO: Figure out if this is really needed.  The constraint velocities are accounted for
+    // in a finite difference sense in the step 3 kernel, when the velocities are updated.
+    integration.applyVelocityConstraints(integrator.getConstraintTolerance());
+
+    //// Update the time and step count.
+
+    cu.setTime(cu.getTime()+dt);
+    cu.setStepCount(cu.getStepCount()+1);
+    cu.reorderAtoms();
+}
+
+double CudaIntegrateVelocityVerletStepKernel::computeKineticEnergy(ContextImpl& context, const VelocityVerletIntegrator& integrator) { // neither argument is read
+    return cu.getIntegrationUtilities().computeKineticEnergy(0); // passes 0 where the Verlet kernel passes 0.5*getStepSize(); presumably a time shift - confirm
+}
+
 void CudaIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
    cu.getPlatformData().initializeContexts(system);
    cu.setAsCurrent();
@@ -8287,6 +8349,24 @@ void CudaApplyAndersenThermostatKernel::execute(ContextImpl& context) {
    cu.executeKernel(kernel, args, cu.getNumAtoms());
 }

+void CudaNoseHooverChainKernel::initialize() { // empty body: nothing is compiled or allocated here
+}
+
+double CudaNoseHooverChainKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, double kineticEnergy, double timeStep) {
+    return 1; // placeholder: constant scale factor returned without reading any argument
+}
+
+double CudaNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc) {
+    return 1; // placeholder constant, not a computed energy - NOTE(review): every chain reports 1
+}
+
+double CudaNoseHooverChainKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain) {
+    return 1; // placeholder constant, not a computed kinetic energy - NOTE(review): arguments are ignored
+}
+
+void CudaNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, double scaleFactor) { // empty body: no velocities are modified
+}
+
 void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const Force& thermostat) {
    cu.setAsCurrent();
    savedPositions.initialize(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4), "savedPositions");

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -96,12 +96,14 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateVelocityVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
    registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
+    registerKernelFactory(NoseHooverChainKernel::Name(), factory);
    registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
    registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
    platformProperties.push_back(CudaDeviceIndex());

--- a/platforms/cuda/src/kernels/velocityVerlet.cu
+++ b/platforms/cuda/src/kernels/velocityVerlet.cu
+/**
+ * Perform the first step of Velocity Verlet integration: update each velocity from the force array and store velocity*dt in posDelta.
+ */
+
+extern "C" __global__ void integrateVelocityVerletPart1(int numAtoms, int paddedNumAtoms, const mixed2* __restrict__ dt, const real4* __restrict__ posq,
+        const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) {
+    const mixed2 stepSize = dt[0];
+    const mixed dtPos = stepSize.y;
+    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y); // NOTE(review): averages dt[0].x and dt[0].y, whereas Part2 copies y into x before Part3 runs - confirm this asymmetry is intended
+    const mixed scale = 0.5f*dtVel/(mixed) 0x100000000; // half of dtVel, divided by 2^32 to rescale the long long force values
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) { // grid-stride loop over atoms
+        mixed4 velocity = velm[index];
+        if (velocity.w != 0.0) { // entries whose velm.w is zero are left untouched
+#ifdef USE_MIXED_PRECISION
+            real4 pos1 = posq[index];
+            real4 pos2 = posqCorrection[index];
+            mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
+            real4 pos = posq[index];
+#endif
+            velocity.x += scale*force[index]*velocity.w; // force components are stored paddedNumAtoms apart
+            velocity.y += scale*force[index+paddedNumAtoms]*velocity.w;
+            velocity.z += scale*force[index+paddedNumAtoms*2]*velocity.w;
+            pos.x = velocity.x*dtPos; // x, y, z are overwritten with the displacement; only pos.w carries over from posq
+            pos.y = velocity.y*dtPos;
+            pos.z = velocity.z*dtPos;
+            posDelta[index] = pos;
+            velm[index] = velocity;
+        }
+    }
+}
+
+/**
+ * Perform the second step of Velocity Verlet integration: add posDelta to the stored positions and copy dt[0].y into dt[0].x.
+ */
+
+extern "C" __global__ void integrateVelocityVerletPart2(int numAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
+        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
+    mixed2 stepSize = dt[0];
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    if (index == 0) // only the thread with global index 0 performs the write below
+        dt[0].x = stepSize.y;
+    for (; index < numAtoms; index += blockDim.x*gridDim.x) { // grid-stride loop over atoms
+        mixed4 velocity = velm[index]; // read only for the w test; velm is not written in this kernel
+        if (velocity.w != 0.0) { // entries whose velm.w is zero are left untouched
+#ifdef USE_MIXED_PRECISION
+            real4 pos1 = posq[index];
+            real4 pos2 = posqCorrection[index];
+            mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
+            real4 pos = posq[index];
+#endif
+            mixed4 delta = posDelta[index];
+            pos.x += delta.x;
+            pos.y += delta.y;
+            pos.z += delta.z;
+#ifdef USE_MIXED_PRECISION
+            posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
+            posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0); // keeps the part lost by the cast to real
+#else
+            posq[index] = pos;
+#endif
+        }
+    }
+}
+
+/**
+ * Perform the third step of Velocity Verlet integration: update velocities from the force array plus (posDelta - velocity*dt)/dt.
+ */
+
+extern "C" __global__ void integrateVelocityVerletPart3(int numAtoms, int paddedNumAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
+        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm,  const long long* __restrict__ force, const mixed4* __restrict__ posDelta) { // posq and posqCorrection are neither read nor written in this kernel
+    mixed2 stepSize = dt[0];
+#if __CUDA_ARCH__ >= 130
+    double oneOverDt = 1.0/stepSize.y; // declared double on this path regardless of the mixed type
+#else
+    float oneOverDt = 1.0f/stepSize.y;
+    float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y; // residual of the float reciprocal, applied inside the loop
+#endif
+    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
+    const mixed scale = 0.5f*dtVel/(mixed) 0x100000000; // half of dtVel, divided by 2^32 to rescale the long long force values
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    if (index == 0) // only the thread with global index 0 performs the write below
+        dt[0].x = stepSize.y;
+    for (; index < numAtoms; index += blockDim.x*gridDim.x) { // grid-stride loop over atoms
+        mixed4 velocity = velm[index];
+        if (velocity.w != 0.0) { // entries whose velm.w is zero are left untouched
+            mixed4 deltaXconstrained = posDelta[index]; // presumably the displacement after applyConstraints() on the host - confirm
+            velocity.x += scale*force[index]*velocity.w + (deltaXconstrained.x - velocity.x*stepSize.y)*oneOverDt;
+            velocity.y += scale*force[index+paddedNumAtoms]*velocity.w + (deltaXconstrained.y - velocity.y*stepSize.y)*oneOverDt;
+            velocity.z += scale*force[index+paddedNumAtoms*2]*velocity.w + (deltaXconstrained.z - velocity.z*stepSize.y)*oneOverDt;
+#if __CUDA_ARCH__ < 130
+            velocity.x += (deltaXconstrained.x - velocity.x*stepSize.y)*correction; // NOTE(review): reads the velocity already updated above, not the pre-update value - confirm
+            velocity.y += (deltaXconstrained.y - velocity.y*stepSize.y)*correction;
+            velocity.z += (deltaXconstrained.z - velocity.z*stepSize.y)*correction;
+#endif
+            velm[index] = velocity;
+        }
+    }
+}
+
--- a/platforms/cuda/tests/TestCudaVelocityVerletIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaVelocityVerletIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Andreas Krämer and Andrew C. Simmonett                            *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaTests.h"
+#include "TestVelocityVerletIntegrator.h"
+
+void runPlatformTests() { // empty: no CUDA-specific cases; presumably a hook called by TestVelocityVerletIntegrator.h - confirm
+}
--- a/platforms/reference/src/SimTKReference/ReferenceVelocityVerletDynamics.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceVelocityVerletDynamics.cpp
@@ -109,6 +109,8 @@ void ReferenceVelocityVerletDynamics::update(OpenMM::ContextImpl &context, const
   if (referenceConstraintAlgorithm)
      referenceConstraintAlgorithm->apply(xPrime, atomCoordinates, inverseMasses, tolerance);

+   ReferenceVirtualSites::computePositions(system, atomCoordinates);
+
   context.calcForcesAndEnergy(true, false);

   for (int i = 0; i < numberOfAtoms; ++i) {
@@ -129,6 +131,5 @@ void ReferenceVelocityVerletDynamics::update(OpenMM::ContextImpl &context, const
   if (referenceConstraintAlgorithm)
      referenceConstraintAlgorithm->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);

-   ReferenceVirtualSites::computePositions(system, atomCoordinates);
   incrementTimeStep();
 }
--- a/platforms/reference/tests/TestReferenceVelocityVerletIntegrator.cpp
+++ b/platforms/reference/tests/TestReferenceVelocityVerletIntegrator.cpp
@@ -30,7 +30,7 @@
 * -------------------------------------------------------------------------- */

 #include "ReferenceTests.h"
-#include "TestVerletIntegrator.h"
+#include "TestVelocityVerletIntegrator.h"

 void runPlatformTests() { // empty: no reference-platform-specific cases; presumably a hook called by the included test header - confirm
 }