CUDA implementation of BAOABLangevinIntegrator

a4bf00ee · Peter Eastman · ad7acc66 · a4bf00ee · a4bf00ee · a4bf00ee
Commit a4bf00ee authored Oct 08, 2019 by Peter Eastman
10 changed files
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -1401,6 +1401,45 @@ private:
    CUfunction kernel1, kernel2;
 };

+/**
+ * This kernel is invoked by BAOABLangevinIntegrator to take one time step.
+ *
+ * The step is carried out on the device by three functions (integrateBAOABPart1,
+ * integrateBAOABPart2 and integrateBAOABPart3).  The velocity scale and noise scale
+ * read by the second function live in a small device buffer that is re-uploaded
+ * only when the integrator's temperature, friction or step size differs from the
+ * values seen on the previous call to execute().
+ */
+class CudaIntegrateBAOABStepKernel : public IntegrateBAOABStepKernel {
+public:
+    CudaIntegrateBAOABStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateBAOABStepKernel(name, platform), cu(cu) {
+    }
+    /**
+     * Initialize the kernel: seed the random number generator, load the three BAOAB
+     * device functions and allocate the parameter and displacement buffers.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param integrator the BAOABLangevinIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const BAOABLangevinIntegrator& integrator);
+    /**
+     * Execute the kernel, advancing the time by one step size and the step count by one.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the BAOABLangevinIntegrator this kernel is being used for
+     * @param forcesAreValid if the context has been modified since the last time step, this will be
+     *                       false to show that cached forces are invalid and must be recalculated.
+     *                       On exit, this should specify whether the cached forces are valid at the
+     *                       end of the step.
+     */
+    void execute(ContextImpl& context, const BAOABLangevinIntegrator& integrator, bool& forcesAreValid);
+    /**
+     * Compute the kinetic energy.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the BAOABLangevinIntegrator this kernel is being used for
+     */
+    double computeKineticEnergy(ContextImpl& context, const BAOABLangevinIntegrator& integrator);
+private:
+    CudaContext& cu;
+    double prevTemp, prevFriction, prevStepSize; // integrator settings for which params was last uploaded
+    CudaArray params, oldDelta; // device buffers: integration parameters, and displacements saved before constraints are applied
+    CUfunction kernel1, kernel2, kernel3; // integrateBAOABPart1, integrateBAOABPart2, integrateBAOABPart3
+};
+
 /**
 * This kernel is invoked by BrownianIntegrator to take one time step.
 */

--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -120,6 +120,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaIntegrateVerletStepKernel(name, platform, cu);
    if (name == IntegrateLangevinStepKernel::Name())
        return new CudaIntegrateLangevinStepKernel(name, platform, cu);
+    if (name == IntegrateBAOABStepKernel::Name())
+        return new CudaIntegrateBAOABStepKernel(name, platform, cu);
    if (name == IntegrateBrownianStepKernel::Name())
        return new CudaIntegrateBrownianStepKernel(name, platform, cu);
    if (name == IntegrateVariableVerletStepKernel::Name())

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -7143,6 +7143,90 @@ double CudaIntegrateLangevinStepKernel::computeKineticEnergy(ContextImpl& contex
    return cu.getIntegrationUtilities().computeKineticEnergy(0.5*integrator.getStepSize());
 }

+void CudaIntegrateBAOABStepKernel::initialize(const System& system, const BAOABLangevinIntegrator& integrator) {
+    cu.getPlatformData().initializeContexts(system);
+    cu.setAsCurrent();
+    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
+
+    // Load the three device functions that together perform one BAOAB step.
+
+    map<string, string> defines;
+    CUmodule module = cu.createModule(CudaKernelSources::baoab, defines, "");
+    kernel1 = cu.getKernel(module, "integrateBAOABPart1");
+    kernel2 = cu.getKernel(module, "integrateBAOABPart2");
+    kernel3 = cu.getKernel(module, "integrateBAOABPart3");
+
+    // The parameter buffer holds one entry per value of the enum in baoab.cu
+    // (VelScale, NoiseScale).  Its element type, like that of oldDelta, must match
+    // the "mixed" type the kernels are compiled with.
+
+    const int numParams = 2;
+    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
+        params.initialize<double>(cu, numParams, "baoabParams");
+        oldDelta.initialize<double4>(cu, cu.getPaddedNumAtoms(), "oldDelta");
+    }
+    else {
+        params.initialize<float>(cu, numParams, "baoabParams");
+        oldDelta.initialize<float4>(cu, cu.getPaddedNumAtoms(), "oldDelta");
+    }
+
+    // Give every cached setting a defined value.  The negative step size guarantees
+    // that the first call to execute() uploads the parameters; initializing the other
+    // two avoids comparing against indeterminate values there.
+
+    prevTemp = -1.0;
+    prevFriction = -1.0;
+    prevStepSize = -1.0;
+}
+
+void CudaIntegrateBAOABStepKernel::execute(ContextImpl& context, const BAOABLangevinIntegrator& integrator, bool& forcesAreValid) {
+    // Make this context current before issuing any CUDA calls, as initialize() does.
+
+    cu.setAsCurrent();
+    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
+    int numAtoms = cu.getNumAtoms();
+    int paddedNumAtoms = cu.getPaddedNumAtoms();
+    if (!forcesAreValid) {
+        // The first kernel reads the force buffer, so it must be up to date.
+
+        context.calcForcesAndEnergy(true, false);
+        forcesAreValid = true;
+    }
+    double temperature = integrator.getTemperature();
+    double friction = integrator.getFriction();
+    double stepSize = integrator.getStepSize();
+    integration.setNextStepSize(stepSize);
+    if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
+        // Calculate the integration parameters.  They are uploaded only when one of
+        // the settings they depend on has changed since the previous step.
+
+        double kT = BOLTZ*temperature;
+        double vscale = exp(-stepSize*friction);
+        double noisescale = sqrt(kT*(1-vscale*vscale));
+        vector<double> p(params.getSize());
+        p[0] = vscale;
+        p[1] = noisescale;
+        params.upload(p, true);
+        prevTemp = temperature;
+        prevFriction = friction;
+        prevStepSize = stepSize;
+    }
+
+    // Perform the integration.  Constraints are applied to the displacements produced
+    // by the first two kernels; the following kernel folds the resulting correction
+    // back into the velocities.
+
+    int randomIndex = integration.prepareRandomNumbers(paddedNumAtoms);
+    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
+    void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
+            &oldDelta.getDevicePointer(), &integration.getStepSize().getDevicePointer()};
+    cu.executeKernel(kernel1, args1, numAtoms, 128);
+    integration.applyConstraints(integrator.getConstraintTolerance());
+    void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
+            &oldDelta.getDevicePointer(), &params.getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
+    cu.executeKernel(kernel2, args2, numAtoms, 128);
+    integration.applyConstraints(integrator.getConstraintTolerance());
+
+    // NOTE(review): the forces are evaluated here, before integrateBAOABPart3 adds the
+    // second half-step displacement to the positions.  The final velocity update in that
+    // kernel, and the cached forces reused by the next step, therefore correspond to the
+    // positions as they are before that last displacement.  Confirm this ordering is intended.
+
+    context.calcForcesAndEnergy(true, false);
+    void* args3[] = {&numAtoms, &paddedNumAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(),
+            &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
+            &oldDelta.getDevicePointer(), &integration.getStepSize().getDevicePointer()};
+    cu.executeKernel(kernel3, args3, numAtoms, 128);
+    integration.applyVelocityConstraints(integrator.getConstraintTolerance());
+    integration.computeVirtualSites();
+
+    // Update the time and step count.
+
+    cu.setTime(cu.getTime()+stepSize);
+    cu.setStepCount(cu.getStepCount()+1);
+    cu.reorderAtoms();
+
+    // Reduce UI lag.  A CUDA context has no command queue to flush, so wait for the
+    // work queued on the current context to finish instead.
+
+#ifdef WIN32
+    cuCtxSynchronize();
+#endif
+}
+
+double CudaIntegrateBAOABStepKernel::computeKineticEnergy(ContextImpl& context, const BAOABLangevinIntegrator& integrator) {
+    // The kinetic energy is requested with a time shift of zero.
+    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
+    return integration.computeKineticEnergy(0.0);
+}
+
 void CudaIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) {
    cu.getPlatformData().initializeContexts(system);
    cu.setAsCurrent();

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2017 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -97,6 +97,7 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateBAOABStepKernel::Name(), factory);
    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);

--- a/platforms/cuda/src/kernels/baoab.cu
+++ b/platforms/cuda/src/kernels/baoab.cu
+enum {VelScale, NoiseScale}; // Indices into paramBuffer, as read by integrateBAOABPart2.
+
+/**
+ * Perform the first step of BAOAB integration.
+ *
+ * For every particle whose velm.w (used as the inverse mass) is nonzero, this adds half a
+ * time step of force to the velocity and then computes the displacement produced by moving
+ * at that velocity for half a time step.  The displacement is stored, unapplied, in both
+ * posDelta and oldDelta.
+ *
+ * The x, y and z force components are read from three consecutive sections of the force
+ * array, each paddedNumAtoms long.  They are 64 bit integers that are divided by
+ * 0x100000000 to obtain the value used.  dt[0].y is used as the step size.
+ */
+
+extern "C" __global__ void integrateBAOABPart1(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
+        mixed4* __restrict__ oldDelta, const mixed2* __restrict__ dt) {
+    // A float literal keeps this arithmetic in single precision when mixed is float; the
+    // factor is a power of two, so the result is the same in either precision.
+    const mixed halfdt = 0.5f*dt[0].y;
+    const mixed fscale = halfdt/(mixed) 0x100000000;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+        mixed4 velocity = velm[index];
+        if (velocity.w != 0) {
+            velocity.x += fscale*velocity.w*force[index];
+            velocity.y += fscale*velocity.w*force[index+paddedNumAtoms];
+            velocity.z += fscale*velocity.w*force[index+paddedNumAtoms*2];
+            velm[index] = velocity;
+            mixed4 delta = make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0);
+            posDelta[index] = delta;
+            oldDelta[index] = delta;
+        }
+    }
+}
+
+/**
+ * Perform the second step of BAOAB integration.
+ *
+ * For every particle whose velm.w (used as the inverse mass) is nonzero, this
+ *   1. adds (posDelta-oldDelta)/halfdt to the velocity, i.e. whatever was changed in
+ *      posDelta since the previous kernel saved a copy of it in oldDelta;
+ *   2. replaces the velocity by vscale*velocity + noisescale*sqrt(velm.w)*random, using one
+ *      float4 of random numbers per particle;
+ *   3. adds posDelta to the position (carrying the extra precision in posqCorrection when
+ *      USE_MIXED_PRECISION is defined);
+ *   4. stores the displacement for the next half time step, unapplied, in both posDelta
+ *      and oldDelta.
+ *
+ * vscale and noisescale are read from paramBuffer.  dt[0].y is used as the step size.
+ */
+
+extern "C" __global__ void integrateBAOABPart2(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, mixed4* __restrict__ posDelta,
+        mixed4* __restrict__ oldDelta, const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
+    const mixed vscale = paramBuffer[VelScale];
+    const mixed noisescale = paramBuffer[NoiseScale];
+    // A float literal keeps this arithmetic in single precision when mixed is float; the
+    // factor is a power of two, so the result is the same in either precision.
+    const mixed halfdt = 0.5f*dt[0].y;
+    const mixed invHalfdt = 1/halfdt;
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    randomIndex += index;
+    while (index < numAtoms) {
+        mixed4 velocity = velm[index];
+        if (velocity.w != 0) {
+            // Load each per-particle input once instead of once per component.
+            mixed4 delta = posDelta[index];
+            const mixed4 previousDelta = oldDelta[index];
+            const float4 noise = random[randomIndex];
+            mixed sqrtInvMass = SQRT(velocity.w);
+            velocity.x += (delta.x-previousDelta.x)*invHalfdt;
+            velocity.y += (delta.y-previousDelta.y)*invHalfdt;
+            velocity.z += (delta.z-previousDelta.z)*invHalfdt;
+            velocity.x = vscale*velocity.x + noisescale*sqrtInvMass*noise.x;
+            velocity.y = vscale*velocity.y + noisescale*sqrtInvMass*noise.y;
+            velocity.z = vscale*velocity.z + noisescale*sqrtInvMass*noise.z;
+            velm[index] = velocity;
+#ifdef USE_MIXED_PRECISION
+            real4 pos1 = posq[index];
+            real4 pos2 = posqCorrection[index];
+            mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
+            real4 pos = posq[index];
+#endif
+            pos.x += delta.x;
+            pos.y += delta.y;
+            pos.z += delta.z;
+#ifdef USE_MIXED_PRECISION
+            posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
+            posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
+            posq[index] = pos;
+#endif
+            delta = make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0);
+            posDelta[index] = delta;
+            oldDelta[index] = delta;
+        }
+        // The random number index advances together with the particle index, whether or
+        // not the particle was updated.
+        randomIndex += blockDim.x*gridDim.x;
+        index += blockDim.x*gridDim.x;
+    }
+}
+
+/**
+ * Perform the third step of BAOAB integration.
+ *
+ * For every particle whose velm.w (used as the inverse mass) is nonzero, this
+ *   1. adds (posDelta-oldDelta)/halfdt to the velocity, i.e. whatever was changed in
+ *      posDelta since the previous kernel saved a copy of it in oldDelta, together with
+ *      half a time step of force;
+ *   2. adds posDelta to the position (carrying the extra precision in posqCorrection when
+ *      USE_MIXED_PRECISION is defined).
+ *
+ * The force array has the same layout and scaling as in integrateBAOABPart1: three
+ * consecutive sections of paddedNumAtoms 64 bit integers, divided by 0x100000000.
+ * dt[0].y is used as the step size.  posDelta and oldDelta are only read.
+ */
+
+extern "C" __global__ void integrateBAOABPart3(int numAtoms, int paddedNumAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ velm,
+        const long long* __restrict__ force, mixed4* __restrict__ posDelta, mixed4* __restrict__ oldDelta, const mixed2* __restrict__ dt) {
+    // A float literal keeps this arithmetic in single precision when mixed is float; the
+    // factor is a power of two, so the result is the same in either precision.
+    const mixed halfdt = 0.5f*dt[0].y;
+    const mixed invHalfdt = 1/halfdt;
+    const mixed fscale = halfdt/(mixed) 0x100000000;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+        mixed4 velocity = velm[index];
+        if (velocity.w != 0) {
+            // Load each per-particle input once instead of once per component.
+            const mixed4 delta = posDelta[index];
+            const mixed4 previousDelta = oldDelta[index];
+            velocity.x += (delta.x-previousDelta.x)*invHalfdt + fscale*velocity.w*force[index];
+            velocity.y += (delta.y-previousDelta.y)*invHalfdt + fscale*velocity.w*force[index+paddedNumAtoms];
+            velocity.z += (delta.z-previousDelta.z)*invHalfdt + fscale*velocity.w*force[index+paddedNumAtoms*2];
+            velm[index] = velocity;
+#ifdef USE_MIXED_PRECISION
+            real4 pos1 = posq[index];
+            real4 pos2 = posqCorrection[index];
+            mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
+            real4 pos = posq[index];
+#endif
+            pos.x += delta.x;
+            pos.y += delta.y;
+            pos.z += delta.z;
+#ifdef USE_MIXED_PRECISION
+            posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
+            posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
+            posq[index] = pos;
+#endif
+        }
+    }
+}
--- a/platforms/cuda/tests/TestCudaBAOABLangevinIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaBAOABLangevinIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaTests.h"
+#include "TestBAOABLangevinIntegrator.h"
+
+void runPlatformTests() { // No CUDA-specific tests are defined in this file.
+}
--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -1384,7 +1384,7 @@ private:
 };

 /**
- * This kernel is invoked by LangevinIntegrator to take one time step.
+ * This kernel is invoked by BAOABLangevinIntegrator to take one time step.
 */
 class OpenCLIntegrateBAOABStepKernel : public IntegrateBAOABStepKernel {
 public:

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -7484,11 +7484,11 @@ void OpenCLIntegrateBAOABStepKernel::initialize(const System& system, const BAOA
    kernel2 = cl::Kernel(program, "integrateBAOABPart2");
    kernel3 = cl::Kernel(program, "integrateBAOABPart3");
    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
-        params.initialize<cl_double>(cl, 3, "baoabParams");
+        params.initialize<cl_double>(cl, 2, "baoabParams");
        oldDelta.initialize<mm_double4>(cl, cl.getPaddedNumAtoms(), "oldDelta");
    }
    else {
-        params.initialize<cl_float>(cl, 3, "baoabParams");
+        params.initialize<cl_float>(cl, 2, "baoabParams");
        oldDelta.initialize<mm_float4>(cl, cl.getPaddedNumAtoms(), "oldDelta");
    }
    prevStepSize = -1.0;
@@ -7533,12 +7533,10 @@ void OpenCLIntegrateBAOABStepKernel::execute(ContextImpl& context, const BAOABLa

        double kT = BOLTZ*temperature;
        double vscale = exp(-stepSize*friction);
-        double fscale = (friction == 0 ? stepSize : (1-vscale)/friction);
        double noisescale = sqrt(kT*(1-vscale*vscale));
        vector<cl_double> p(params.getSize());
        p[0] = vscale;
-        p[1] = fscale;
-        p[2] = noisescale;
+        p[1] = noisescale;
        params.upload(p, true, true);
        prevTemp = temperature;
        prevFriction = friction;

--- a/platforms/opencl/src/kernels/baoab.cl
+++ b/platforms/opencl/src/kernels/baoab.cl
-enum {VelScale, ForceScale, NoiseScale, MaxParams};
+enum {VelScale, NoiseScale};

 /**
 * Perform the first step of BAOAB integration.
@@ -28,7 +28,6 @@ __kernel void integrateBAOABPart1(__global mixed4* restrict velm, __global const
 __kernel void integrateBAOABPart2(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict velm, __global mixed4* restrict posDelta,
        __global mixed4* restrict oldDelta, __global const mixed* restrict paramBuffer, __global const mixed2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
    mixed vscale = paramBuffer[VelScale];
-    mixed fscale = paramBuffer[ForceScale];
    mixed noisescale = paramBuffer[NoiseScale];
    mixed halfdt = 0.5*dt[0].y;
    mixed invHalfdt = 1/halfdt;
@@ -38,7 +37,7 @@ __kernel void integrateBAOABPart2(__global real4* restrict posq, __global real4*
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
            mixed4 delta = posDelta[index];
-            mixed sqrtInvMass = sqrt(velocity.w);
+            mixed sqrtInvMass = SQRT(velocity.w);
            velocity.xyz += (delta.xyz-oldDelta[index].xyz)*invHalfdt;
            velocity.x = vscale*velocity.x + noisescale*sqrtInvMass*random[randomIndex].x;
            velocity.y = vscale*velocity.y + noisescale*sqrtInvMass*random[randomIndex].y;

--- a/platforms/reference/include/ReferenceKernels.h
+++ b/platforms/reference/include/ReferenceKernels.h
@@ -1172,7 +1172,7 @@ private:
 };

 /**
- * This kernel is invoked by LangevinIntegrator to take one time step.
+ * This kernel is invoked by BAOABLangevinIntegrator to take one time step.
 */
 class ReferenceIntegrateBAOABStepKernel : public IntegrateBAOABStepKernel {
 public: