Optimizations to the CUDA implementation of CCMA

6768d049 · Peter Eastman · c961d453 · 6768d049 · 6768d049 · 6768d049
Commit 6768d049 authored Apr 10, 2010 by Peter Eastman
4 changed files
--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -460,7 +460,7 @@ struct cudaGmxSimulation {
    float*          pCcmaDelta2;                    // Workspace for CCMA
    int*            pCcmaAtomConstraints;           // The indices of constraints involving each atom
    int*            pCcmaNumAtomConstraints;        // The number of constraints involving each atom
-    int*            pCcmaConverged;                 // Used by CCMA to communicate whether iteration has converged
+    int*            ccmaConvergedDeviceMarker;      // Device memory used to communicate that CCMA has converged
    float*          pCcmaReducedMass;               // The reduced mass for each CCMA constraint
    unsigned int*   pConstraintMatrixColumn;        // The column of each element in the constraint matrix.
    float*          pConstraintMatrixValue;         // The value of each element in the constraint matrix.

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -1488,7 +1488,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
            result = QUERN_solve_with_r(numCCMA, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
            for (int j = 0; j < numCCMA; j++) {
                double value = rhs[j]*distance[ccmaConstraints[i]]/distance[ccmaConstraints[j]];
-                if (abs(value) > 0.1)
+                if (abs(value) > 0.05)
                    matrix[j].push_back(pair<int, double>(i, value));
            }
        }
@@ -1533,9 +1533,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    CUDAStream<float>* psCcmaDelta2 = new CUDAStream<float>(numCCMA, 1, "CcmaDelta2");
    gpu->psCcmaDelta2             = psCcmaDelta2;
    gpu->sim.pCcmaDelta2          = psCcmaDelta2->_pDevData;
-    CUDAStream<int>* psCcmaConverged = new CUDAStream<int>(gpu->sim.blocks, 1, "CcmaConverged");
-    gpu->psCcmaConverged             = psCcmaConverged;
-    gpu->sim.pCcmaConverged          = psCcmaConverged->_pDevData;
    CUDAStream<float>* psCcmaReducedMass = new CUDAStream<float>(numCCMA, 1, "CcmaReducedMass");
    gpu->psCcmaReducedMass             = psCcmaReducedMass;
    gpu->sim.pCcmaReducedMass          = psCcmaReducedMass->_pDevData;
@@ -1545,6 +1542,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    CUDAStream<float>* psConstraintMatrixValue = new CUDAStream<float>(numCCMA*maxRowElements, 1, "ConstraintMatrixValue");
    gpu->psConstraintMatrixValue             = psConstraintMatrixValue;
    gpu->sim.pConstraintMatrixValue          = psConstraintMatrixValue->_pDevData;
+    cudaHostAlloc((void**) &gpu->ccmaConvergedHostMarker, sizeof(int), cudaHostAllocMapped);
+    cudaHostGetDevicePointer((void**) &gpu->sim.ccmaConvergedDeviceMarker, (void*) gpu->ccmaConvergedHostMarker, 0);
+    cudaEventCreate(&gpu->ccmaEvent);
    gpu->sim.ccmaConstraints = numCCMA;
    for (int i = 0; i < numCCMA; i++) {
        int index = constraintOrder[i];
@@ -1802,7 +1802,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
        cudaSetDevice(device); // Ignore errors
    status = cudaGetDevice(&gpu->device);
    RTERROR(status, "Error getting CUDA device")
-    status = cudaSetDeviceFlags(useBlockingSync ? cudaDeviceBlockingSync : cudaDeviceScheduleAuto);
+    status = cudaSetDeviceFlags(cudaDeviceMapHost+(useBlockingSync ? cudaDeviceBlockingSync : cudaDeviceScheduleAuto));
    RTERROR(status, "Error setting device flags")
    gpu->useBlockingSync = useBlockingSync;
@@ -1988,7 +1988,6 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
    gpu->psCcmaNumAtomConstraints   = NULL;
    gpu->psCcmaDelta1               = NULL;
    gpu->psCcmaDelta2               = NULL;
-    gpu->psCcmaConverged            = NULL;
    gpu->psCcmaReducedMass          = NULL;
    gpu->psConstraintMatrixColumn   = NULL;
    gpu->psConstraintMatrixValue    = NULL;
@@ -2207,8 +2206,8 @@ void gpuShutDown(gpuContext gpu)
    delete gpu->psCcmaNumAtomConstraints;
    delete gpu->psCcmaDelta1;
    delete gpu->psCcmaDelta2;
-    delete gpu->psCcmaConverged;
    delete gpu->psCcmaReducedMass;
+    cudaEventDestroy(gpu->ccmaEvent);
    delete gpu->psConstraintMatrixColumn;
    delete gpu->psConstraintMatrixValue;
    delete gpu->psTabulatedFunctionParams;

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -171,7 +171,8 @@ struct _gpuContext {
    CUDAStream<int>* psCcmaNumAtomConstraints; // The number of constraints involving each atom
    CUDAStream<float>* psCcmaDelta1;        // Workspace for CCMA
    CUDAStream<float>* psCcmaDelta2;        // Workspace for CCMA
-    CUDAStream<int>* psCcmaConverged;       // Used by CCMA to communicate whether iteration has converged
+    int* ccmaConvergedHostMarker;           // Host memory used to communicate that CCMA has converged
+    cudaEvent_t ccmaEvent;                  // Used to optimize communication during CCMA
    CUDAStream<float>* psCcmaReducedMass;   // The reduced mass for each CCMA constraint
    CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster
    CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.

--- a/platforms/cuda/src/kernels/kCCMA.cu
+++ b/platforms/cuda/src/kernels/kCCMA.cu
@@ -71,11 +71,6 @@ kComputeCCMAConstraintDirections()
        dir.z = oldPos1.z-oldPos2.z;
        cSim.pCcmaDistance[index] = dir;
    }
-    // Mark that no blocks have converged yet.
-    for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < gridDim.x; index += blockDim.x*gridDim.x)
-        cSim.pCcmaConverged[index] = 0;
 }
 __global__ void
@@ -88,12 +83,12 @@ __launch_bounds__(256, 1)
 #endif
 kComputeCCMAConstraintForces(float4* atomPositions, bool addOldPosition)
 {
-    if (cSim.pCcmaConverged[blockIdx.x])
+    __shared__ int converged;
-        return; // The constraint iteration has already converged.
-    extern __shared__ int convergedBuffer[];
    float lowerTol = 1.0f-2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
    float upperTol = 1.0f+2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
-    int threadConverged = 1;
+    if (threadIdx.x == 0)
+        converged = 1;
+    __syncthreads();
    // Calculate the constraint force for each constraint.
@@ -120,20 +115,12 @@ kComputeCCMAConstraintForces(float4* atomPositions, bool addOldPosition)
        // See whether it has converged.
-        threadConverged &= (rp2 >= lowerTol*dist2 && rp2 <= upperTol*dist2);
+        if (converged && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2))
-    }
+        {
+            converged = 0;
-    // Perform a parallel reduction to see if all constraints handled by this block have converged.
+            *cSim.ccmaConvergedDeviceMarker = 0;
+        }
-    convergedBuffer[threadIdx.x] = threadConverged;
-    __syncthreads();
-    for (unsigned int step = 1; step < blockDim.x; step *= 2) {
-        if (threadIdx.x%(2*step) == 0)
-            convergedBuffer[threadIdx.x] &= convergedBuffer[threadIdx.x+step];
-        __syncthreads();
    }
-    if (threadIdx.x == 0)
-        cSim.pCcmaConverged[blockIdx.x] = convergedBuffer[0];
 }
 __global__ void
@@ -146,22 +133,8 @@ __launch_bounds__(256, 1)
 #endif
 kMultiplyByCCMAConstraintMatrix()
 {
-    extern __shared__ int convergedBuffer[];
+    if (*cSim.ccmaConvergedDeviceMarker)
-    // First see whether all work groups have converged.
+        return; // The constraint iteration has already converged
-    convergedBuffer[threadIdx.x] = true;
-    for (int index = threadIdx.x; index < gridDim.x; index += blockDim.x)
-        convergedBuffer[threadIdx.x] &= cSim.pCcmaConverged[index];
-    __syncthreads();
-    for (int step = 1; step < blockDim.x; step *= 2) {
-        if (threadIdx.x%(2*step) == 0)
-            convergedBuffer[threadIdx.x] &= convergedBuffer[threadIdx.x+step];
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-        cSim.pCcmaConverged[blockIdx.x] = convergedBuffer[0];
-    if (cSim.pCcmaConverged[0])
-        return; // The constraint iteration has already converged.
    // Multiply by the inverse constraint matrix.
@@ -190,7 +163,7 @@ __launch_bounds__(256, 1)
 #endif
 kUpdateCCMAAtomPositions(float4* atomPositions, int iteration)
 {
-    if (cSim.pCcmaConverged[blockIdx.x])
+    if (*cSim.ccmaConvergedDeviceMarker)
        return; // The constraint iteration has already converged.
    float damping = (iteration < 2 ? 0.5f : 1.0f);
    for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.atoms; index += blockDim.x*gridDim.x)
@@ -218,13 +191,17 @@ void kApplyCCMA(gpuContext gpu, float4* posq, bool addOldPosition)
 {
    kComputeCCMAConstraintDirections<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>();
    LAUNCHERROR("kComputeCCMAConstraintDirections");
+    const int checkInterval = 3;
    for (int i = 0; i < 150; i++) {
+        if ((i+1)%checkInterval == 0)
+            *gpu->ccmaConvergedHostMarker = 1;
        kComputeCCMAConstraintForces<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block, gpu->sim.ccma_threads_per_block*sizeof(int)>>>(posq, addOldPosition);
-        gpu->psCcmaConverged->Download();
+        cudaEventRecord(gpu->ccmaEvent, 0);
        kMultiplyByCCMAConstraintMatrix<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block, gpu->sim.ccma_threads_per_block*sizeof(int)>>>();
-        if ((*gpu->psCcmaConverged)[0])
+        kUpdateCCMAAtomPositions<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(posq, 3*i+2);
+        cudaEventSynchronize(gpu->ccmaEvent);
+        if ((i+1)%checkInterval == 0 && *gpu->ccmaConvergedHostMarker)
            break;
-        kUpdateCCMAAtomPositions<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(posq, i);
    }
 }