Restructured CCMA code to work on Fermi boards

1d0193eb · Peter Eastman · 5301b9cc · 1d0193eb · 1d0193eb · 1d0193eb
Commit 1d0193eb authored Apr 09, 2010 by Peter Eastman
4 changed files
--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -460,8 +460,7 @@ struct cudaGmxSimulation {
    float*          pCcmaDelta2;                    // Workspace for CCMA
    int*            pCcmaAtomConstraints;           // The indices of constraints involving each atom
    int*            pCcmaNumAtomConstraints;        // The number of constraints involving each atom
-    short*          pSyncCounter;                   // Used for global thread synchronization
-    unsigned int*   pRequiredIterations;            // Used by CCMA to communicate whether iteration has converged
+    int*            pCcmaConverged;                 // Used by CCMA to communicate whether iteration has converged
    float*          pCcmaReducedMass;               // The reduced mass for each CCMA constraint
    unsigned int*   pConstraintMatrixColumn;        // The column of each element in the constraint matrix.
    float*          pConstraintMatrixValue;         // The value of each element in the constraint matrix.

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -1533,12 +1533,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    CUDAStream<float>* psCcmaDelta2 = new CUDAStream<float>(numCCMA, 1, "CcmaDelta2");
    gpu->psCcmaDelta2             = psCcmaDelta2;
    gpu->sim.pCcmaDelta2          = psCcmaDelta2->_pDevData;
-    CUDAStream<short>* psSyncCounter = new CUDAStream<short>(3*gpu->sim.blocks, 1, "SyncCounter");
-    gpu->psSyncCounter               = psSyncCounter;
-    gpu->sim.pSyncCounter            = psSyncCounter->_pDevData;
-    CUDAStream<unsigned int>* psRequiredIterations = new CUDAStream<unsigned int>(1, 1, "RequiredIterations");
-    gpu->psRequiredIterations               = psRequiredIterations;
-    gpu->sim.pRequiredIterations            = psRequiredIterations->_pDevData;
+    CUDAStream<int>* psCcmaConverged = new CUDAStream<int>(gpu->sim.blocks, 1, "CcmaConverged");
+    gpu->psCcmaConverged             = psCcmaConverged;
+    gpu->sim.pCcmaConverged          = psCcmaConverged->_pDevData;
    CUDAStream<float>* psCcmaReducedMass = new CUDAStream<float>(numCCMA, 1, "CcmaReducedMass");
    gpu->psCcmaReducedMass             = psCcmaReducedMass;
    gpu->sim.pCcmaReducedMass          = psCcmaReducedMass->_pDevData;
@@ -1562,8 +1559,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
        }
        (*psConstraintMatrixColumn)[i+matrix[index].size()*numCCMA] = numCCMA;
    }
-    for (unsigned int i = 0; i < psSyncCounter->_length; i++)
-        (*psSyncCounter)[i] = -1;
    for (unsigned int i = 0; i < atomConstraints.size(); i++) {
        (*psCcmaNumAtomConstraints)[i] = atomConstraints[i].size();
        for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
@@ -1576,7 +1571,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    psCcmaReducedMass->Upload();
    psCcmaAtomConstraints->Upload();
    psCcmaNumAtomConstraints->Upload();
-    psSyncCounter->Upload();
    psConstraintMatrixColumn->Upload();
    psConstraintMatrixValue->Upload();
    gpu->sim.ccma_threads_per_block = (gpu->sim.ccmaConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
@@ -1994,8 +1988,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
    gpu->psCcmaNumAtomConstraints   = NULL;
    gpu->psCcmaDelta1               = NULL;
    gpu->psCcmaDelta2               = NULL;
-    gpu->psSyncCounter              = NULL;
-    gpu->psRequiredIterations       = NULL;
+    gpu->psCcmaConverged            = NULL;
    gpu->psCcmaReducedMass          = NULL;
    gpu->psConstraintMatrixColumn   = NULL;
    gpu->psConstraintMatrixValue    = NULL;
@@ -2214,8 +2207,7 @@ void gpuShutDown(gpuContext gpu)
    delete gpu->psCcmaNumAtomConstraints;
    delete gpu->psCcmaDelta1;
    delete gpu->psCcmaDelta2;
-    delete gpu->psSyncCounter;
-    delete gpu->psRequiredIterations;
+    delete gpu->psCcmaConverged;
    delete gpu->psCcmaReducedMass;
    delete gpu->psConstraintMatrixColumn;
    delete gpu->psConstraintMatrixValue;

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -171,9 +171,8 @@ struct _gpuContext {
    CUDAStream<int>* psCcmaNumAtomConstraints; // The number of constraints involving each atom
    CUDAStream<float>* psCcmaDelta1;        // Workspace for CCMA
    CUDAStream<float>* psCcmaDelta2;        // Workspace for CCMA
-    CUDAStream<short>* psSyncCounter;       // Used for global thread synchronization
-    CUDAStream<unsigned int>* psRequiredIterations; // Used by SHAKE to communicate whether iteration has converged
-    CUDAStream<float>* psCcmaReducedMass;   // The reduced mass for each SHAKE constraint
+    CUDAStream<int>* psCcmaConverged;       // Used by CCMA to communicate whether iteration has converged
+    CUDAStream<float>* psCcmaReducedMass;   // The reduced mass for each CCMA constraint
    CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster
    CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
    CUDAStream<unsigned int>* psRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.

--- a/platforms/cuda/src/kernels/kCCMA.cu
+++ b/platforms/cuda/src/kernels/kCCMA.cu
@@ -48,25 +48,7 @@ void GetCCMASim(gpuContext gpu)
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }

-/**
- * Synchronize all threads across all blocks.
- */
-__device__ void kSyncAllThreads_kernel(short* syncCounter, short newCount)
-{
-    __syncthreads();
-    if (threadIdx.x == 0)
-        syncCounter[blockIdx.x] = newCount;
-    if (threadIdx.x < gridDim.x)
-    {
-        volatile short* counter = &syncCounter[threadIdx.x];
-        do
-        {
-        } while (*counter != newCount);
-    }
-    __syncthreads();
-}
-
-__global__ void 
+__global__ void
 #if (__CUDA_ARCH__ >= 200)
 __launch_bounds__(1024, 1)
 #elif (__CUDA_ARCH__ >= 130)
@@ -74,148 +56,188 @@ __launch_bounds__(512, 1)
 #else
 __launch_bounds__(256, 1)
 #endif
-kApplyCCMA_kernel(float4* atomPositions, bool addOldPosition)
+kComputeCCMAConstraintDirections()
 {
-    // Initialize counters used for monitoring convergence and doing global thread synchronization.
-
-    __shared__ unsigned int requiredIterations;
-    if (threadIdx.x == 0)
-    {
-        requiredIterations = 0;
-        cSim.pSyncCounter[gridDim.x+blockIdx.x] = -1;
-        cSim.pSyncCounter[2*gridDim.x+blockIdx.x] = -1;
-        cSim.pRequiredIterations[0] = 0;
-    }
-
    // Calculate the direction of each constraint.

-    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
-    while (pos < cSim.ccmaConstraints)
+    for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
    {
-        int2 atoms = cSim.pCcmaAtoms[pos];
-        float4 dir = cSim.pCcmaDistance[pos];
+        int2 atoms = cSim.pCcmaAtoms[index];
+        float4 dir = cSim.pCcmaDistance[index];
        float4 oldPos1 = cSim.pOldPosq[atoms.x];
        float4 oldPos2 = cSim.pOldPosq[atoms.y];
        dir.x = oldPos1.x-oldPos2.x;
        dir.y = oldPos1.y-oldPos2.y;
        dir.z = oldPos1.z-oldPos2.z;
-        cSim.pCcmaDistance[pos] = dir;
-        pos += blockDim.x*gridDim.x;
+        cSim.pCcmaDistance[index] = dir;
    }
-    __syncthreads();

-    // Iteratively update the atom positions.
+    // Mark that no blocks have converged yet.

-    unsigned int maxIterations = 150;
+    for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < gridDim.x; index += blockDim.x*gridDim.x)
+        cSim.pCcmaConverged[index] = false;
+}
+
+__global__ void
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(1024, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(256, 1)
+#endif
+kComputeCCMAConstraintForces(float4* atomPositions, bool addOldPosition)
+{
+    if (cSim.pCcmaConverged[blockIdx.x])
+        return; // The constraint iteration has already converged.
+    extern __shared__ int convergedBuffer[];
    float lowerTol = 1.0f-2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
    float upperTol = 1.0f+2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
-    for (unsigned int iteration = 0; iteration < maxIterations; iteration++)
-    {
-        // Calculate the constraint force for each constraint.
+    int threadConverged = 1;

-        pos = threadIdx.x + blockIdx.x * blockDim.x;
-        while (pos < cSim.ccmaConstraints)
+    // Calculate the constraint force for each constraint.
+
+    for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
+    {
+        int2 atoms = cSim.pCcmaAtoms[index];
+        float4 delta1 = atomPositions[atoms.x];
+        float4 delta2 = atomPositions[atoms.y];
+        float4 dir = cSim.pCcmaDistance[index];
+        float3 rp_ij = make_float3(delta1.x-delta2.x, delta1.y-delta2.y, delta1.z-delta2.z);
+        if (addOldPosition)
        {
-            int2 atoms = cSim.pCcmaAtoms[pos];
-            float4 delta1 = atomPositions[atoms.x];
-            float4 delta2 = atomPositions[atoms.y];
-            float4 dir = cSim.pCcmaDistance[pos];
-            float3 rp_ij = make_float3(delta1.x-delta2.x, delta1.y-delta2.y, delta1.z-delta2.z);
-            if (addOldPosition)
-            {
-                rp_ij.x += dir.x;
-                rp_ij.y += dir.y;
-                rp_ij.z += dir.z;
-            }
-            float rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
-            float dist2 = dir.w*dir.w;
-            float diff = dist2 - rp2;
-            float rrpr  = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
-            float d_ij2  = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
-            float reducedMass = cSim.pCcmaReducedMass[pos];
-            cSim.pCcmaDelta1[pos] = (rrpr > d_ij2*1e-6f ? reducedMass*diff/rrpr : 0.0f);
-            if (requiredIterations == iteration && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2))
-                requiredIterations = iteration+1;
-            pos += blockDim.x * gridDim.x;
+            rp_ij.x += dir.x;
+            rp_ij.y += dir.y;
+            rp_ij.z += dir.z;
        }
+        float rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
+        float dist2 = dir.w*dir.w;
+        float diff = dist2 - rp2;
+        float rrpr  = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
+        float d_ij2  = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
+        float reducedMass = cSim.pCcmaReducedMass[index];
+        cSim.pCcmaDelta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass*diff/rrpr : 0.0f);
+
+        // See whether it has converged.
+
+        threadConverged &= (rp2 >= lowerTol*dist2 && rp2 <= upperTol*dist2);
+    }
+
+    // Perform a parallel reduction to see if all constraints handled by this block have converged.
+
+    convergedBuffer[threadIdx.x] = threadConverged;
+    __syncthreads();
+    for (unsigned int step = 1; step < blockDim.x; step *= 2) {
+        if (threadIdx.x%(2*step) == 0)
+            convergedBuffer[threadIdx.x] &= convergedBuffer[threadIdx.x+step];
        __syncthreads();
-        if (threadIdx.x == 0 && requiredIterations > iteration)
-            cSim.pRequiredIterations[0] = requiredIterations;
-        kSyncAllThreads_kernel(cSim.pSyncCounter, iteration);
-        if (iteration == cSim.pRequiredIterations[0])
-            break; // All constraints have converged.
+    }
+    if (threadIdx.x == 0)
+        cSim.pCcmaConverged[blockIdx.x] = convergedBuffer[0];
+}

-        // Multiply by the inverse constraint matrix.
+__global__ void
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(1024, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(256, 1)
+#endif
+kMultiplyByCCMAConstraintMatrix()
+{
+    extern __shared__ int convergedBuffer[];
+    // First see whether all work groups have converged.

-        pos = threadIdx.x + blockIdx.x * blockDim.x;
-        while (pos < cSim.ccmaConstraints)
-        {
-            float sum = 0.0f;
-            for (unsigned int i = 0; ; i++)
-            {
-                unsigned int index = pos+i*cSim.ccmaConstraints;
-                unsigned int column = cSim.pConstraintMatrixColumn[index];
-                if (column >= cSim.ccmaConstraints)
-                    break;
-                sum += cSim.pCcmaDelta1[column]*cSim.pConstraintMatrixValue[index];
-            }
-            cSim.pCcmaDelta2[pos] = sum;
-            pos += blockDim.x * gridDim.x;
-        }
-        kSyncAllThreads_kernel(&cSim.pSyncCounter[gridDim.x], iteration);
+    convergedBuffer[threadIdx.x] = true;
+    for (int index = threadIdx.x; index < gridDim.x; index += blockDim.x)
+        convergedBuffer[threadIdx.x] &= cSim.pCcmaConverged[index];
+    __syncthreads();
+    for (int step = 1; step < blockDim.x; step *= 2) {
+        if (threadIdx.x%(2*step) == 0)
+            convergedBuffer[threadIdx.x] &= convergedBuffer[threadIdx.x+step];
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+        cSim.pCcmaConverged[blockIdx.x] = convergedBuffer[0];
+    if (cSim.pCcmaConverged[0])
+        return; // The constraint iteration has already converged.

-        // Update the position of each atom.
+    // Multiply by the inverse constraint matrix.

-        pos = threadIdx.x + blockIdx.x * blockDim.x;
-        float damping = (iteration < 2 ? 0.5f : 1.0f);
-        while (pos < cSim.atoms)
+    for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
+    {
+        float sum = 0.0f;
+        for (unsigned int i = 0; ; i++)
        {
-            float4 atomPos = atomPositions[pos];
-            float invMass = cSim.pVelm4[pos].w;
-            int num = cSim.pCcmaNumAtomConstraints[pos];
-            for (int i = 0; i < num; i++)
-            {
-                int index = pos+i*cSim.atoms;
-                int constraint = cSim.pCcmaAtomConstraints[index];
-                bool forward = (constraint > 0);
-                constraint = (forward ? constraint-1 : -constraint-1);
-                float constraintForce = damping*invMass*cSim.pCcmaDelta2[constraint];
-                constraintForce = (forward ? constraintForce : -constraintForce);
-                float4 dir = cSim.pCcmaDistance[constraint];
-                atomPos.x += constraintForce*dir.x;
-                atomPos.y += constraintForce*dir.y;
-                atomPos.z += constraintForce*dir.z;
-            }
-            atomPositions[pos] = atomPos;
-            pos += blockDim.x*gridDim.x;
+            unsigned int element = index+i*cSim.ccmaConstraints;
+            unsigned int column = cSim.pConstraintMatrixColumn[element];
+            if (column >= cSim.ccmaConstraints)
+                break;
+            sum += cSim.pCcmaDelta1[column]*cSim.pConstraintMatrixValue[element];
        }
-        if (threadIdx.x == 0)
-            requiredIterations = iteration+1;
-        kSyncAllThreads_kernel(&cSim.pSyncCounter[2*gridDim.x], iteration);
+        cSim.pCcmaDelta2[index] = sum;
    }
+}

-    // Reset the initial sync counter to be ready for the next call.
+__global__ void
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(1024, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(256, 1)
+#endif
+kUpdateCCMAAtomPositions(float4* atomPositions, int iteration)
+{
+    if (cSim.pCcmaConverged[blockIdx.x])
+        return; // The constraint iteration has already converged.
+    float damping = (iteration < 2 ? 0.5f : 1.0f);
+    for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.atoms; index += blockDim.x*gridDim.x)
+    {
+        float4 atomPos = atomPositions[index];
+        float invMass = cSim.pVelm4[index].w;
+        int num = cSim.pCcmaNumAtomConstraints[index];
+        for (int i = 0; i < num; i++)
+        {
+            int constraint = cSim.pCcmaAtomConstraints[index+i*cSim.atoms];
+            bool forward = (constraint > 0);
+            constraint = (forward ? constraint-1 : -constraint-1);
+            float constraintForce = damping*invMass*cSim.pCcmaDelta2[constraint];
+            constraintForce = (forward ? constraintForce : -constraintForce);
+            float4 dir = cSim.pCcmaDistance[constraint];
+            atomPos.x += constraintForce*dir.x;
+            atomPos.y += constraintForce*dir.y;
+            atomPos.z += constraintForce*dir.z;
+        }
+        atomPositions[index] = atomPos;
+    }
+}

-    if (threadIdx.x == 0)
-        cSim.pSyncCounter[blockIdx.x] = -1;
+void kApplyCCMA(gpuContext gpu, float4* posq, bool addOldPosition)
+{
+    kComputeCCMAConstraintDirections<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>();
+    LAUNCHERROR("kComputeCCMAConstraintDirections");
+    for (int i = 0; i < 150; i++) {
+        kComputeCCMAConstraintForces<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(posq, addOldPosition);
+        kMultiplyByCCMAConstraintMatrix<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>();
+        gpu->psCcmaConverged->Download();
+        if ((*gpu->psCcmaConverged)[0])
+            break;
+        kUpdateCCMAAtomPositions<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(posq, i);
+    }
 }

 void kApplyFirstCCMA(gpuContext gpu)
 {
 //    printf("kApplyFirstCCMA\n");
    if (gpu->sim.ccmaConstraints > 0)
-    {
-        kApplyCCMA_kernel<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(gpu->sim.pPosqP, true);
-        LAUNCHERROR("kApplyCCMA");
-    }
+        kApplyCCMA(gpu, gpu->sim.pPosqP, true);
 }

 void kApplySecondCCMA(gpuContext gpu)
 {
 //    printf("kApplySecondCCMA\n");
    if (gpu->sim.ccmaConstraints > 0)
-    {
-        kApplyCCMA_kernel<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(gpu->sim.pPosq, false);
-        LAUNCHERROR("kApplyCCMA");
-    }
+        kApplyCCMA(gpu, gpu->sim.pPosq, false);
 }