Began CUDA implementation of C-SHAKE algorithm

b2da8475 · Peter Eastman · b1cffecf · b2da8475 · b2da8475 · b2da8475
Commit b2da8475 authored May 06, 2009 by Peter Eastman
7 changed files
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -442,7 +442,7 @@ void CudaIntegrateVerletStepKernel::execute(OpenMMContextImpl& context, const Ve
    kVerletUpdatePart1(gpu);
    kApplyFirstShake(gpu);
    kApplyFirstSettle(gpu);
-    kApplyFirstLincs(gpu);
+    kApplyFirstCShake(gpu);
    if (data.removeCM) {
        int step = (int) (context.getTime()/stepSize);
        if (step%data.cmMotionFrequency == 0)
@@ -481,7 +481,7 @@ void CudaIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, const
    kUpdatePart1(gpu);
    kApplyFirstShake(gpu);
    kApplyFirstSettle(gpu);
-    kApplyFirstLincs(gpu);
+    kApplyFirstCShake(gpu);
    if (data.removeCM) {
        int step = (int) (context.getTime()/stepSize);
        if (step%data.cmMotionFrequency == 0)
@@ -490,7 +490,7 @@ void CudaIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, const
    kUpdatePart2(gpu);
    kApplySecondShake(gpu);
    kApplySecondSettle(gpu);
-    kApplySecondLincs(gpu);
+    kApplySecondCShake(gpu);
 }

 CudaIntegrateBrownianStepKernel::~CudaIntegrateBrownianStepKernel() {
@@ -523,7 +523,7 @@ void CudaIntegrateBrownianStepKernel::execute(OpenMMContextImpl& context, const
    kBrownianUpdatePart1(gpu);
    kApplyFirstShake(gpu);
    kApplyFirstSettle(gpu);
-    kApplyFirstLincs(gpu);
+    kApplyFirstCShake(gpu);
    if (data.removeCM) {
        int step = (int) (context.getTime()/stepSize);
        if (step%data.cmMotionFrequency == 0)

--- a/platforms/cuda/src/kernels/cudaKernels.h
+++ b/platforms/cuda/src/kernels/cudaKernels.h
@@ -47,10 +47,12 @@ extern void kCalculateAndersenThermostat(gpuContext gpu);
 extern void kReduceBornSumAndForces(gpuContext gpu);
 extern void kUpdatePart1(gpuContext gpu);
 extern void kApplyFirstShake(gpuContext gpu);
+extern void kApplyFirstCShake(gpuContext gpu);
 extern void kApplyFirstSettle(gpuContext gpu);
 extern void kApplyFirstLincs(gpuContext gpu);
 extern void kUpdatePart2(gpuContext gpu);
 extern void kApplySecondShake(gpuContext gpu);
+extern void kApplySecondCShake(gpuContext gpu);
 extern void kApplySecondSettle(gpuContext gpu);
 extern void kApplySecondLincs(gpuContext gpu);
 extern void kVerletUpdatePart1(gpuContext gpu);
@@ -81,6 +83,8 @@ extern void SetUpdateShakeHSim(gpuContext gpu);
 extern void GetUpdateShakeHSim(gpuContext gpu);
 extern void SetSettleSim(gpuContext gpu);
 extern void GetSettleSim(gpuContext gpu);
+extern void SetCShakeSim(gpuContext gpu);
+extern void GetCShakeSim(gpuContext gpu);
 extern void SetLincsSim(gpuContext gpu);
 extern void GetLincsSim(gpuContext gpu);
 extern void SetVerletUpdateSim(gpuContext gpu);

--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -389,6 +389,8 @@ struct cudaGmxSimulation {
    int*            pLincsAtomConstraints;          // The indices of constraints involving each atom
    int*            pLincsNumAtomConstraints;       // The number of constraints involving each atom
    short*          pSyncCounter;                   // Used for global thread synchronization
+    unsigned int*   pRequiredIterations;            // Used by SHAKE to communicate whether iteration has converged
+    float*          pShakeReducedMass;              // The reduced mass for each SHAKE constraint

    // Mutable stuff
    float4*         pPosq;                          // Pointer to atom positions and charges

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -733,6 +733,12 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    CUDAStream<short>* psSyncCounter = new CUDAStream<short>(2*gpu->sim.blocks, 1, "SyncCounter");
    gpu->psSyncCounter               = psSyncCounter;
    gpu->sim.pSyncCounter            = psSyncCounter->_pDevData;
+    CUDAStream<unsigned int>* psRequiredIterations = new CUDAStream<unsigned int>(1, 1, "RequiredIterations");
+    gpu->psRequiredIterations               = psRequiredIterations;
+    gpu->sim.pRequiredIterations            = psRequiredIterations->_pDevData;
+    CUDAStream<float>* psShakeReducedMass = new CUDAStream<float>(numLincs, 1, "LincsSolution");
+    gpu->psShakeReducedMass             = psShakeReducedMass;
+    gpu->sim.pShakeReducedMass          = psShakeReducedMass->_pDevData;
    gpu->sim.lincsConstraints = numLincs;
    for (int i = 0; i < numLincs; i++) {
        int c = lincsConstraints[i];
@@ -740,6 +746,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
        (*psLincsAtoms)[i].y = atom2[c];
        (*psLincsDistance)[i].w = distance[c];
        (*psLincsS)[i] = 1.0f/sqrt(invMass1[c]+invMass2[c]);
+        (*psShakeReducedMass)[i] = 0.5f/(invMass1[c]+invMass2[c]);
        (*psLincsNumConnections)[i] = linkedConstraints[i].size();
        for (unsigned int j = 0; j < linkedConstraints[i].size(); j++)
            (*psLincsConnections)[i+j*numLincs] = linkedConstraints[i][j];
@@ -754,6 +761,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    psLincsAtoms->Upload();
    psLincsDistance->Upload();
    psLincsS->Upload();
+    psShakeReducedMass->Upload();
    psLincsConnections->Upload();
    psLincsNumConnections->Upload();
    psLincsAtomConstraints->Upload();
@@ -761,8 +769,8 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    psSyncCounter->Upload();
    gpu->sim.lincsTerms = lincsTerms;
    gpu->sim.lincs_threads_per_block = (gpu->sim.lincsConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
-    if (gpu->sim.lincs_threads_per_block > gpu->sim.max_shake_threads_per_block)
-        gpu->sim.lincs_threads_per_block = gpu->sim.max_shake_threads_per_block;
+    if (gpu->sim.lincs_threads_per_block > gpu->sim.threads_per_block)
+        gpu->sim.lincs_threads_per_block = gpu->sim.threads_per_block;
    if (gpu->sim.lincs_threads_per_block < gpu->sim.blocks)
        gpu->sim.lincs_threads_per_block = gpu->sim.blocks;

@@ -1142,6 +1150,8 @@ void* gpuInit(int numAtoms)
    gpu->psLincsRhs2                = NULL;
    gpu->psLincsSolution            = NULL;
    gpu->psSyncCounter              = NULL;
+    gpu->psRequiredIterations       = NULL;
+    gpu->psShakeReducedMass         = NULL;

    // Initialize output buffer before reading parameters
    gpu->pOutputBufferCounter       = new unsigned int[gpu->sim.paddedNumberOfAtoms];
@@ -1292,6 +1302,8 @@ void gpuShutDown(gpuContext gpu)
    delete gpu->psLincsRhs2;
    delete gpu->psLincsSolution;
    delete gpu->psSyncCounter;
+    delete gpu->psRequiredIterations;
+    delete gpu->psShakeReducedMass;
    if (gpu->cudpp != 0)
        cudppDestroyPlan(gpu->cudpp);

@@ -1601,6 +1613,7 @@ int gpuSetConstants(gpuContext gpu)
    SetVerletUpdateSim(gpu);
    SetBrownianUpdateSim(gpu);
    SetSettleSim(gpu);
+    SetCShakeSim(gpu);
    SetLincsSim(gpu);
    SetRandomSim(gpu);
    return 1;

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -141,6 +141,8 @@ struct _gpuContext {
    CUDAStream<float>* psLincsRhs2;         // Workspace for LINCS
    CUDAStream<float>* psLincsSolution;     // Workspace for LINCS
    CUDAStream<short>* psSyncCounter;       // Used for global thread synchronization
+    CUDAStream<unsigned int>* psRequiredIterations; // Used by SHAKE to communicate whether iteration has converged
+    CUDAStream<float>* psShakeReducedMass;  // The reduced mass for each SHAKE constraint
 };

 typedef struct _gpuContext *gpuContext;

--- a/platforms/cuda/tests/TestCudaVerletIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaVerletIntegrator.cpp
@@ -138,10 +138,73 @@ void testConstraints() {
    }
 }

+void testConstrainedClusters() {
+    const int numParticles = 7;
+    const double temp = 500.0;
+    CudaPlatform platform;
+    System system;
+    VerletIntegrator integrator(0.002);
+    integrator.setConstraintTolerance(1e-5);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(i > 1 ? 1.0 : 10.0);
+        forceField->addParticle((i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
+    }
+    system.addConstraint(0, 1, 1.0);
+    system.addConstraint(0, 2, 1.0);
+    system.addConstraint(0, 3, 1.0);
+    system.addConstraint(0, 4, 1.0);
+    system.addConstraint(1, 5, 1.0);
+    system.addConstraint(1, 6, 1.0);
+    system.addConstraint(2, 3, sqrt(2.0));
+    system.addConstraint(2, 4, sqrt(2.0));
+    system.addConstraint(3, 4, sqrt(2.0));
+    system.addConstraint(5, 6, sqrt(2.0));
+    system.addForce(forceField);
+    OpenMMContext context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(1, 0, 0);
+    positions[2] = Vec3(-1, 0, 0);
+    positions[3] = Vec3(0, 1, 0);
+    positions[4] = Vec3(0, 0, 1);
+    positions[5] = Vec3(2, 0, 0);
+    positions[6] = Vec3(1, 1, 0);
+    vector<Vec3> velocities(numParticles);
+    init_gen_rand(0);
+    for (int i = 0; i < numParticles; ++i)
+        velocities[i] = Vec3(genrand_real2()-0.5, genrand_real2()-0.5, genrand_real2()-0.5);
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+
+    // Simulate it and see whether the constraints remain satisfied.
+
+    double initialEnergy = 0.0;
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Positions | State::Energy);
+        for (int j = 0; j < system.getNumConstraints(); ++j) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(j, particle1, particle2, distance);
+            Vec3 p1 = state.getPositions()[particle1];
+            Vec3 p2 = state.getPositions()[particle2];
+            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
+            ASSERT_EQUAL_TOL(distance, dist, 2e-5);
+        }
+        double energy = state.getKineticEnergy()+state.getPotentialEnergy();
+        if (i == 1)
+            initialEnergy = energy;
+        else if (i > 1)
+            ASSERT_EQUAL_TOL(initialEnergy, energy, 0.05);
+        integrator.step(1);
+    }
+}
+
 int main() {
    try {
        testSingleBond();
        testConstraints();
+        testConstrainedClusters();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/reference/src/SimTKReference/ReferenceRigidShakeAlgorithm.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceRigidShakeAlgorithm.cpp
@@ -183,7 +183,7 @@ ReferenceRigidShakeAlgorithm::~ReferenceRigidShakeAlgorithm( ){
       SimTKOpenMMUtilities::freeOneDRealOpenMMArray( _distanceTolerance, "distanceTolerance" );
       SimTKOpenMMUtilities::freeOneDRealOpenMMArray( _reducedMasses, "reducedMasses" );
    }
-    for (int i = 0; i < (int)_matrices.size(); i++)
+    for (unsigned int i = 0; i < _matrices.size(); i++)
        SimTKOpenMMUtilities::freeTwoDRealOpenMMArray(_matrices[i], "");
 }

@@ -346,13 +346,13 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
         reducedMasses[ii]  = half/( inverseMasses[atomI] + inverseMasses[atomJ] );
      }
      vector<double> temp;
-      for (int i = 0; i < (int)_rigidClusters.size(); i++) {
+      for (unsigned int i = 0; i < _rigidClusters.size(); i++) {
          // Compute the constraint coupling matrix for this cluster.

          const vector<int>& cluster = _rigidClusters[i];
-          int size = cluster.size();
+          unsigned int size = cluster.size();
          vector<Vec3> r(size);
-          for (int j = 0; j < (int)cluster.size(); j++) {
+          for (unsigned int j = 0; j < cluster.size(); j++) {
              int atom1 = _atomIndices[cluster[j]][0];
              int atom2 = _atomIndices[cluster[j]][1];
              r[j] = Vec3(atomCoordinates[atom1][0]-atomCoordinates[atom2][0],
@@ -397,7 +397,7 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
          double singularValueCutoff = 0.01*w[0];
          for (int j = 0; j < size; j++)
              w[j] = (w[j] < singularValueCutoff ? 0.0 : 1.0/w[j]);
-          if ((int)temp.size() < size)
+          if (temp.size() < size)
              temp.resize(size);
          for (int j = 0; j < size; j++) {
              for (int k = 0; k < size; k++) {
@@ -458,7 +458,6 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
         RealOpenMM diff = dist2 - rp2;
         constraintForce[ii] = zero;
         RealOpenMM rrpr  = DOT3(  rp_ij, r_ij[ii] );
-
         if( rrpr <  d_ij2[ii]*epsilon6 ){
             std::stringstream message;
             message << iterations <<" "<<atomI<<" "<<atomJ<< " Error: sign of rrpr < 0?\n";
@@ -473,11 +472,11 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
      if( numberConverged == _numberOfConstraints ){
         done = true;
      }
-      for (int i = 0; i < (int)_rigidClusters.size(); i++) {
+      for (unsigned int i = 0; i < _rigidClusters.size(); i++) {
          const vector<int>& cluster = _rigidClusters[i];
          RealOpenMM** matrix = _matrices[i];
-          int size = cluster.size();
-          if (size > (int)tempForce.size())
+          unsigned int size = cluster.size();
+          if (size > tempForce.size())
              tempForce.resize(size);
          for (int j = 0; j < size; j++) {
              tempForce[j] = zero;