Created CUDA implementation of VariableVerletIntegrator

a1b2fd30 · Peter Eastman · 5b5ec4c3 · a1b2fd30 · a1b2fd30 · a1b2fd30
Commit a1b2fd30 authored Jun 26, 2009 by Peter Eastman
13 changed files
--- a/openmmapi/include/openmm/VariableVerletIntegrator.h
+++ b/openmmapi/include/openmm/VariableVerletIntegrator.h
@@ -40,15 +40,15 @@ namespace OpenMM {
 /**
 * This is an error contolled, variable time step Integrator that simulates a System using the
- * velocity Verlet algorithm.  It compares the result of the Verlet integrator to that of an
+ * leap-frog Verlet algorithm.  It compares the result of the Verlet integrator to that of an
 * explicit Euler integrator, takes the difference between the two as a measure of the integration
 * error in each time step, and continuously adjusts the step size to keep the error below a
- * specified tolerance.  This allows it to take larger steps on average than a fixed step size
+ * specified tolerance.  This both improves the stability of the integrator and allows it to take
- * integrator, while still maintaining comparable accuracy and stability.
+ * larger steps on average, while still maintaining comparable accuracy to a fixed step size integrator.
 *
 * It is best not to think of the error tolerance as having any absolute meaning.  It is just an
 * adjustable parameter that affects the step size and integration accuracy.  You
- * should try different values to find the largest onethat produces a trajectory sufficiently
+ * should try different values to find the largest one that produces a trajectory sufficiently
 * accurate for your purposes.  0.001 is often a good starting point.
 *
 * Unlike a fixed step size Verlet integrator, variable step size Verlet is not symplectic.  This

--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -55,6 +55,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaIntegrateLangevinStepKernel(name, platform, data);
    if (name == IntegrateBrownianStepKernel::Name())
        return new CudaIntegrateBrownianStepKernel(name, platform, data);
+    if (name == IntegrateVariableVerletStepKernel::Name())
+        return new CudaIntegrateVariableVerletStepKernel(name, platform, data);
    if (name == ApplyAndersenThermostatKernel::Name())
        return new CudaApplyAndersenThermostatKernel(name, platform, data);
    if (name == CalcKineticEnergyKernel::Name())

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -41,9 +41,9 @@ using namespace std;
 static void calcForces(OpenMMContextImpl& context, CudaPlatform::PlatformData& data) {
    _gpuContext* gpu = data.gpu;
-    if (data.nonbondedMethod != NO_CUTOFF && data.stepCount%100 == 0)
+    if (data.nonbondedMethod != NO_CUTOFF && data.computeForceCount%100 == 0)
        gpuReorderAtoms(gpu);
-    data.stepCount++;
+    data.computeForceCount++;
    kClearForces(gpu);
    if (gpu->bIncludeGBSA) {
        gpu->bRecalculateBornRadii = true;
@@ -443,22 +443,20 @@ void CudaIntegrateVerletStepKernel::execute(OpenMMContextImpl& context, const Ve
    if (stepSize != prevStepSize) {
        // Initialize the GPU parameters.
-        gpuSetVerletIntegrationParameters(gpu, (float) stepSize);
+        gpuSetVerletIntegrationParameters(gpu, (float) stepSize, 0.0f);
        gpuSetConstants(gpu);
-        kGenerateRandoms(gpu);
        prevStepSize = stepSize;
    }
    kVerletUpdatePart1(gpu);
    kApplyFirstShake(gpu);
    kApplyFirstSettle(gpu);
    kApplyFirstCCMA(gpu);
-    if (data.removeCM) {
+    if (data.removeCM)
-        int step = (int) (context.getTime()/stepSize);
+        if (data.stepCount%data.cmMotionFrequency == 0)
-        if (step%data.cmMotionFrequency == 0)
            gpu->bCalculateCM = true;
-    }
    kVerletUpdatePart2(gpu);
    data.time += stepSize;
+    data.stepCount++;
 }
 CudaIntegrateLangevinStepKernel::~CudaIntegrateLangevinStepKernel() {
@@ -492,16 +490,15 @@ void CudaIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, const
    kApplyFirstShake(gpu);
    kApplyFirstSettle(gpu);
    kApplyFirstCCMA(gpu);
-    if (data.removeCM) {
+    if (data.removeCM)
-        int step = (int) (context.getTime()/stepSize);
+        if (data.stepCount%data.cmMotionFrequency == 0)
-        if (step%data.cmMotionFrequency == 0)
            gpu->bCalculateCM = true;
-    }
    kLangevinUpdatePart2(gpu);
    kApplySecondShake(gpu);
    kApplySecondSettle(gpu);
    kApplySecondCCMA(gpu);
    data.time += stepSize;
+    data.stepCount++;
 }
 CudaIntegrateBrownianStepKernel::~CudaIntegrateBrownianStepKernel() {
@@ -535,13 +532,44 @@ void CudaIntegrateBrownianStepKernel::execute(OpenMMContextImpl& context, const
    kApplyFirstShake(gpu);
    kApplyFirstSettle(gpu);
    kApplyFirstCCMA(gpu);
-    if (data.removeCM) {
+    if (data.removeCM)
-        int step = (int) (context.getTime()/stepSize);
+        if (data.stepCount%data.cmMotionFrequency == 0)
-        if (step%data.cmMotionFrequency == 0)
            gpu->bCalculateCM = true;
-    }
    kBrownianUpdatePart2(gpu);
    data.time += stepSize;
+    data.stepCount++;
+}
+CudaIntegrateVariableVerletStepKernel::~CudaIntegrateVariableVerletStepKernel() {
+}
+void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, const VariableVerletIntegrator& integrator) {
+    initializeIntegration(system, data, integrator);
+    prevErrorTol = -1.0;
+}
+void CudaIntegrateVariableVerletStepKernel::execute(OpenMMContextImpl& context, const VariableVerletIntegrator& integrator) {
+    _gpuContext* gpu = data.gpu;
+    double errorTol = integrator.getErrorTolerance();
+    if (errorTol != prevErrorTol) {
+        // Initialize the GPU parameters.
+        gpuSetVerletIntegrationParameters(gpu, 0.0f, (float) errorTol);
+        gpuSetConstants(gpu);
+        prevErrorTol = errorTol;
+    }
+    kSelectVerletStepSize(gpu);
+    kVerletUpdatePart1(gpu);
+    kApplyFirstShake(gpu);
+    kApplyFirstSettle(gpu);
+    kApplyFirstCCMA(gpu);
+    if (data.removeCM)
+        if (data.stepCount%data.cmMotionFrequency == 0)
+            gpu->bCalculateCM = true;
+    kVerletUpdatePart2(gpu);
+    gpu->psStepSize->Download();
+    data.time += (*gpu->psStepSize)[0].y;
+    data.stepCount++;
 }
 CudaApplyAndersenThermostatKernel::~CudaApplyAndersenThermostatKernel() {

--- a/platforms/cuda/src/CudaKernels.h
+++ b/platforms/cuda/src/CudaKernels.h
@@ -373,6 +373,33 @@ private:
    double prevTemp, prevFriction, prevStepSize;
 };
+/**
+ * This kernel is invoked by VariableVerletIntegrator to take one time step.
+ */
+class CudaIntegrateVariableVerletStepKernel : public IntegrateVariableVerletStepKernel {
+public:
+    CudaIntegrateVariableVerletStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateVariableVerletStepKernel(name, platform), data(data) {
+    }
+    ~CudaIntegrateVariableVerletStepKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param integrator the VerletIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const VariableVerletIntegrator& integrator);
+    /**
+     * Execute the kernel.
+     *
+     * @param context    the context in which to execute this kernel
+     * @param integrator the VerletIntegrator this kernel is being used for
+     */
+    void execute(OpenMMContextImpl& context, const VariableVerletIntegrator& integrator);
+private:
+    CudaPlatform::PlatformData& data;
+    double prevErrorTol;
+};
 /**
 * This kernel is invoked by AndersenThermostat at the start of each time step to adjust the particle velocities.
 */

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -52,6 +52,7 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
    registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
    registerKernelFactory(CalcKineticEnergyKernel::Name(), factory);
    registerKernelFactory(RemoveCMMotionKernel::Name(), factory);

--- a/platforms/cuda/src/kernels/cudaKernels.h
+++ b/platforms/cuda/src/kernels/cudaKernels.h
@@ -50,6 +50,7 @@ extern void kLangevinUpdatePart1(gpuContext gpu);
 extern void kLangevinUpdatePart2(gpuContext gpu);
 extern void kVerletUpdatePart1(gpuContext gpu);
 extern void kVerletUpdatePart2(gpuContext gpu);
+extern void kSelectVerletStepSize(gpuContext gpu);
 extern void kBrownianUpdatePart1(gpuContext gpu);
 extern void kBrownianUpdatePart2(gpuContext gpu);

--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -268,6 +268,8 @@ struct cudaGmxSimulation {
    unsigned int*   pWorkUnit;                      // Pointer to work units
    unsigned int*   pInteractingWorkUnit;           // Pointer to work units that have interactions
    unsigned int*   pInteractionFlag;               // Flags for which work units have interactions
+    float2*         pStepSize;                      // The size of the previous and current time steps
+    float           errorTol;                       // Error tolerance for selecting the step size
    size_t*         pInteractionCount;              // A count of the number of work units which have interactions
    unsigned int    nonbond_workBlock;              // Number of work units running simultaneously per block in CDLJ and Born Force Part 1
    unsigned int    bornForce2_workBlock;           // Number of work units running second half of Born Forces calculation

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -728,7 +728,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    // Compute the constraint coupling matrix
    vector<vector<int> > atomAngles(gpu->natoms);
-    for (int i = 0; i < (int)gpu->sim.bond_angles; i++)
+    for (int i = 0; i < gpu->sim.bond_angles; i++)
        atomAngles[(*gpu->psBondAngleID1)[i].y].push_back(i);
    vector<vector<pair<int, double> > > matrix(numCCMA);
    if (numCCMA > 0) {
@@ -902,7 +902,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
        (*psCcmaReducedMass)[i] = 0.5f/(invMass1[c]+invMass2[c]);
        for (unsigned int j = 0; j < matrix[index].size(); j++) {
            (*psConstraintMatrixColumn)[i+j*numCCMA] = matrix[index][j].first;
-            (*psConstraintMatrixValue)[i+j*numCCMA] = (float)matrix[index][j].second;
+            (*psConstraintMatrixValue)[i+j*numCCMA] = matrix[index][j].second;
        }
        (*psConstraintMatrixColumn)[i+matrix[index].size()*numCCMA] = numCCMA;
    }
@@ -1010,6 +1010,10 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
    gpu->sim.pCosSinSum                 = gpu->psEwaldCosSinSum->_pDevStream[0];
    gpu->psObcData                      = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "ObcData");
    gpu->sim.pObcData                   = gpu->psObcData->_pDevStream[0];
+    gpu->psStepSize                     = new CUDAStream<float2>(1, 1, "StepSize");
+    gpu->sim.pStepSize                  = gpu->psStepSize->_pDevStream[0];
+    (*gpu->psStepSize)[0] = make_float2(0.0f, 0.0f);
+    gpu->psStepSize->Upload();
    gpu->pAtomSymbol                    = new unsigned char[gpu->natoms];
    gpu->psAtomIndex                    = new CUDAStream<int>(gpu->sim.paddedNumberOfAtoms, 1, "AtomIndex");
    gpu->sim.pAtomIndex                 = gpu->psAtomIndex->_pDevStream[0];
@@ -1359,9 +1363,13 @@ void gpuSetIntegrationParameters(gpuContext gpu, float tau, float deltaT, float
 }
 extern "C"
-void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT) {
+void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT, float errorTol) {
    gpu->sim.deltaT                 = deltaT;
    gpu->sim.oneOverDeltaT          = 1.0f/deltaT;
+    gpu->sim.errorTol               = errorTol;
+    gpu->psStepSize->Download();
+    (*gpu->psStepSize)[0].y = deltaT;
+    gpu->psStepSize->Upload();
 }
 extern "C"

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -119,6 +119,7 @@ struct _gpuContext {
    CUDAStream<unsigned int>* psInteractingWorkUnit;
    CUDAStream<unsigned int>* psInteractionFlag;
    CUDAStream<size_t>* psInteractionCount;
+    CUDAStream<float2>* psStepSize;         // The size of the previous and current time steps
    CUDAStream<float4>* psRandom4;          // Pointer to sets of 4 random numbers for MD integration
    CUDAStream<float2>* psRandom2;          // Pointer to sets of 2 random numbers for MD integration
    CUDAStream<uint4>* psRandomSeed;        // Pointer to each random seed
@@ -211,7 +212,7 @@ extern "C"
 void gpuSetIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature);
 extern "C"
-void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT);
+void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT, float errorTol);
 extern "C"
 void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature);

--- a/platforms/cuda/src/kernels/kVerletUpdate.cu
+++ b/platforms/cuda/src/kernels/kVerletUpdate.cu
@@ -30,7 +30,6 @@
 #include <cstdlib>
 #include <string>
 #include <iostream>
-//#include <fstream>
 using namespace std;
 #include "gputypes.h"
@@ -90,3 +89,47 @@ void kVerletUpdatePart2(gpuContext gpu)
    }
 }
+__global__ void kSelectVerletStepSize_kernel()
+{
+    // Calculate the error.
+    extern __shared__ float error[];
+    error[threadIdx.x] = 0.0f;
+    unsigned int pos = threadIdx.x;
+    while (pos < cSim.atoms)
+    {
+        float4 force  = cSim.pForce4[pos];
+        float invMass = cSim.pVelm4[pos].w;
+        error[threadIdx.x] += (force.x*force.x + force.y*force.y + force.z*force.z)*invMass;
+        pos += blockDim.x * gridDim.x;
+    }
+    // Sum the errors from all threads.
+    for (int offset = 1; offset < cSim.atoms; offset *= 2)
+    {
+        if (threadIdx.x+offset < cSim.atoms && threadIdx.x%(2*offset) == 0)
+            error[threadIdx.x] += error[threadIdx.x+offset];
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+    {
+        float totalError = sqrt(error[0]/(cSim.atoms*3));
+        float newStepSize = sqrt(cSim.errorTol/totalError);
+        float oldStepSize = cSim.pStepSize[0].y;
+        if (oldStepSize > 0.0f)
+            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
+        if (newStepSize > oldStepSize && newStepSize < 1.2f*oldStepSize)
+            newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
+        cSim.pStepSize[0].y = newStepSize;
+    }
+}
+void kSelectVerletStepSize(gpuContext gpu)
+{
+//    printf("kSelectVerletStepSize\n");
+    kSelectVerletStepSize_kernel<<<1, gpu->sim.update_threads_per_block, sizeof(float)*gpu->sim.update_threads_per_block>>>();
+    LAUNCHERROR("kSelectVerletStepSize");
+}
--- a/platforms/cuda/src/kernels/kVerletUpdate.h
+++ b/platforms/cuda/src/kernels/kVerletUpdate.h
@@ -36,6 +36,15 @@ __global__ void kVerletUpdatePart1CM_kernel()
 __global__ void kVerletUpdatePart1_kernel()
 #endif
 {
+    // Load the step size to take.
+    __shared__ float dtPos;
+    __shared__ float dtVel;
+    if (threadIdx.x == 0)
+    {
+        float2 stepSize = cSim.pStepSize[0];
+        dtPos = stepSize.y;
+        dtVel = 0.5f*(stepSize.x+stepSize.y);
+    }
    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
 #ifdef REMOVE_CM
    extern __shared__ float3 sCM[];
@@ -72,13 +81,15 @@ __global__ void kVerletUpdatePart1_kernel()
        offset *= 2;
        __syncthreads();
    }
+#else
+    __syncthreads();
 #endif
    while (pos < cSim.atoms)
    {
        float4 apos             = cSim.pPosq[pos];
        float4 velocity         = cSim.pVelm4[pos];
        float4 force            = cSim.pForce4[pos];
-        float dtOverMass        = cSim.deltaT*velocity.w;
+        float dtOverMass        = dtVel*velocity.w;
        cSim.pOldPosq[pos]      = apos;
        velocity.x             += dtOverMass*force.x;
@@ -90,9 +101,9 @@ __global__ void kVerletUpdatePart1_kernel()
        velocity.z             -= sCM[0].z;
 #endif
-        apos.x                  = velocity.x*cSim.deltaT;
+        apos.x                  = velocity.x*dtPos;
-        apos.y                  = velocity.y*cSim.deltaT;
+        apos.y                  = velocity.y*dtPos;
-        apos.z                  = velocity.z*cSim.deltaT;
+        apos.z                  = velocity.z*dtPos;
        cSim.pPosqP[pos]        = apos;
        cSim.pVelm4[pos]        = velocity;
@@ -106,7 +117,17 @@ __global__ void kVerletUpdatePart2CM_kernel()
 __global__ void kVerletUpdatePart2_kernel()
 #endif
 {
-    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
+    // Load the step size to take.
+    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
+    __shared__ float oneOverDeltaT;
+    if (threadIdx.x == 0)
+    {
+        float dt = cSim.pStepSize[0].y;
+        oneOverDeltaT = 1.0f/dt;
+        if (pos == 0)
+            cSim.pStepSize[0].x = dt;
+    }
+    __syncthreads();
 #ifdef REMOVE_CM
    extern __shared__ float3 sCM[];
    float3 CM                   = {0.0f, 0.0f, 0.0f};
@@ -118,9 +139,9 @@ __global__ void kVerletUpdatePart2_kernel()
        float4 apos             = cSim.pPosq[pos];
        float4 xPrime           = cSim.pPosqP[pos];
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x);
+        velocity.x              = oneOverDeltaT*(xPrime.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y);
+        velocity.y              = oneOverDeltaT*(xPrime.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z);
+        velocity.z              = oneOverDeltaT*(xPrime.z);
        xPrime.x               += apos.x;
        xPrime.y               += apos.y;

--- a/platforms/reference/src/ReferenceKernels.cpp
+++ b/platforms/reference/src/ReferenceKernels.cpp
@@ -802,12 +802,11 @@ void ReferenceIntegrateVariableVerletStepKernel::initialize(const System& system
 }
 void ReferenceIntegrateVariableVerletStepKernel::execute(OpenMMContextImpl& context, const VariableVerletIntegrator& integrator) {
-    double stepSize = integrator.getStepSize();
    double errorTol = integrator.getErrorTolerance();
    RealOpenMM** posData = ((ReferenceFloatStreamImpl&) context.getPositions().getImpl()).getData();
    RealOpenMM** velData = ((ReferenceFloatStreamImpl&) context.getVelocities().getImpl()).getData();
    RealOpenMM** forceData = const_cast<RealOpenMM**>(((ReferenceFloatStreamImpl&) context.getForces().getImpl()).getData()); // Reference code needs to be made const correct
-    if (dynamics == 0 || stepSize != prevStepSize || errorTol != prevErrorTol) {
+    if (dynamics == 0 || errorTol != prevErrorTol) {
        // Recreate the computation objects with the new parameters.
        if (dynamics) {
@@ -819,7 +818,6 @@ void ReferenceIntegrateVariableVerletStepKernel::execute(OpenMMContextImpl& cont
        findAnglesForCCMA(context.getSystem(), angles);
        constraints = new ReferenceCCMAAlgorithm(context.getSystem().getNumParticles(), numConstraints, constraintIndices, constraintDistances, masses, angles, (RealOpenMM)integrator.getConstraintTolerance());
        dynamics->setReferenceConstraintAlgorithm(constraints);
-        prevStepSize = stepSize;
        prevErrorTol = errorTol;
    }
    dynamics->update(context.getSystem().getNumParticles(), posData, velData, forceData, masses);

--- a/platforms/reference/src/ReferenceKernels.h
+++ b/platforms/reference/src/ReferenceKernels.h
@@ -472,7 +472,7 @@ private:
    RealOpenMM* constraintDistances;
    int** constraintIndices;
    int numConstraints;
-    double prevStepSize, prevErrorTol;
+    double prevErrorTol;
 };
 /**