Optimizations to CUDA C-SHAKE implementation

043c7b6c · Peter Eastman · fe1e6ffa · 043c7b6c · 043c7b6c · 043c7b6c
Commit 043c7b6c authored May 15, 2009 by Peter Eastman
5 changed files
--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -389,7 +389,6 @@ struct cudaGmxSimulation {
    short*          pSyncCounter;                   // Used for global thread synchronization
    unsigned int*   pRequiredIterations;            // Used by SHAKE to communicate whether iteration has converged
    float*          pShakeReducedMass;              // The reduced mass for each SHAKE constraint
-    int*            pRigidClusterConstraints;       // The constraints in each rigid cluster
    float*          pRigidClusterMatrix;            // The inverse constraint matrix for each rigid cluster
    unsigned int*   pRigidClusterConstraintIndex;   // The index of each cluster in the stream containing cluster constraints.
    unsigned int*   pRigidClusterMatrixIndex;       // The index of each cluster in the stream containing cluster matrices.

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -464,7 +464,7 @@ static void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster
    }
 }
-static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, const vector<int>& secondAtom, const vector<int>& constraintIndices)
+static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, const vector<int>& secondAtom, vector<int>& constraintIndices)
 {
    vector<map<int, int> > atomConstraints(firstAtom.size());
    for (int i = 0; i < (int)constraintIndices.size(); i++) {
@@ -537,11 +537,27 @@ static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, cons
        }
    }
+    // Reorder the constraints so those in a cluster are sequential.
+    vector<int> constraintOrder(constraintIndices.size());
+    vector<int> clusterStartIndex(rigidClusters.size());
+    set<int> inCluster;
+    int nextIndex = 0;
+    for (int i = 0; i < (int) rigidClusters.size(); ++i) {
+        clusterStartIndex[i] = nextIndex;
+        for (int j = 0; j < (int) rigidClusters[i].size(); ++j) {
+            int constraint = rigidClusters[i][j];
+            constraintOrder[nextIndex++] = constraint;
+            inCluster.insert(constraint);
+        }
+    }
+    for (int i = 0; i < (int) constraintIndices.size(); ++i)
+        if (inCluster.find(constraintIndices[i]) == inCluster.end())
+            constraintOrder[nextIndex++] = constraintIndices[i];
+    constraintIndices = constraintOrder;
    // Build the CUDA streams.
-    CUDAStream<int>* psRigidClusterConstraints = new CUDAStream<int>(totalConstraints, 1, "RigidClusterConstraints");
-    gpu->psRigidClusterConstraints             = psRigidClusterConstraints;
-    gpu->sim.pRigidClusterConstraints          = psRigidClusterConstraints->_pDevData;
    CUDAStream<unsigned int>* psRigidClusterConstraintIndex = new CUDAStream<unsigned int>((int) rigidClusters.size()+1, 1, "RigidClusterConstraintIndex");
    gpu->psRigidClusterConstraintIndex                      = psRigidClusterConstraintIndex;
    gpu->sim.pRigidClusterConstraintIndex                   = psRigidClusterConstraintIndex->_pDevData;
@@ -556,8 +572,7 @@ static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, cons
    for (unsigned int i = 0; i < rigidClusters.size(); i++) {
        vector<int>& cluster = rigidClusters[i];
        (*psRigidClusterConstraintIndex)[i] = constraintIndex;
-        for (unsigned int j = 0; j < cluster.size(); j++)
+        constraintIndex += cluster.size();
-            (*psRigidClusterConstraints)[constraintIndex++] = cluster[j];
        if (cluster.size() > maxClusterSize)
            maxClusterSize = cluster.size();
    }
@@ -567,9 +582,6 @@ static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, cons
    gpu->sim.clusterShakeBlockSize = 1;
    while (gpu->sim.clusterShakeBlockSize < maxClusterSize)
        gpu->sim.clusterShakeBlockSize *= 2;
-    if (gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize != 0)
-        gpu->sim.lincs_threads_per_block += gpu->sim.clusterShakeBlockSize - gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize;
-    psRigidClusterConstraints->Upload();
    psRigidClusterConstraintIndex->Upload();
    gpu->hasInitializedRigidClusters = false;
 }
@@ -776,6 +788,13 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    for (unsigned i = 0; i < atom1.size(); i++)
        if (!isShakeAtom[atom1[i]])
            lincsConstraints.push_back(i);
+    // Identify rigid clusters of atoms.
+    findRigidClusters(gpu, atom1, atom2, lincsConstraints);
+    // Record the connections between constraints.
    int numLincs = (int) lincsConstraints.size();
    vector<vector<int> > atomConstraints(gpu->natoms);
    for (int i = 0; i < numLincs; i++) {
@@ -859,8 +878,10 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
        (*psSyncCounter)[i] = -1;
    for (unsigned int i = 0; i < atomConstraints.size(); i++) {
        (*psLincsNumAtomConstraints)[i] = atomConstraints[i].size();
-        for (unsigned int j = 0; j < atomConstraints[i].size(); j++)
+        for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
-            (*psLincsAtomConstraints)[i+j*gpu->natoms] = atomConstraints[i][j];
+            bool forward = (atom1[lincsConstraints[atomConstraints[i][j]]] == i);
+            (*psLincsAtomConstraints)[i+j*gpu->natoms] = (forward ? atomConstraints[i][j]+1 : -atomConstraints[i][j]-1);
+        }
    }
    psLincsAtoms->Upload();
    psLincsDistance->Upload();
@@ -877,10 +898,8 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
        gpu->sim.lincs_threads_per_block = gpu->sim.threads_per_block;
    if (gpu->sim.lincs_threads_per_block < gpu->sim.blocks)
        gpu->sim.lincs_threads_per_block = gpu->sim.blocks;
+    if (gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize != 0)
-    // Identify rigid clusters of atoms.
+        gpu->sim.lincs_threads_per_block += gpu->sim.clusterShakeBlockSize - gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize;
-    findRigidClusters(gpu, atom1, atom2, lincsConstraints);
    // count number of atoms w/o constraint
@@ -1250,7 +1269,6 @@ void* gpuInit(int numAtoms)
    gpu->psSyncCounter              = NULL;
    gpu->psRequiredIterations       = NULL;
    gpu->psShakeReducedMass         = NULL;
-    gpu->psRigidClusterConstraints  = NULL;
    gpu->psRigidClusterConstraintIndex = NULL;
    gpu->psRigidClusterMatrix       = NULL;
    gpu->psRigidClusterMatrixIndex  = NULL;
@@ -1406,7 +1424,6 @@ void gpuShutDown(gpuContext gpu)
    delete gpu->psSyncCounter;
    delete gpu->psRequiredIterations;
    delete gpu->psShakeReducedMass;
-    delete gpu->psRigidClusterConstraints;
    delete gpu->psRigidClusterConstraintIndex;
    delete gpu->psRigidClusterMatrix;
    delete gpu->psRigidClusterMatrixIndex;

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -139,7 +139,6 @@ struct _gpuContext {
    CUDAStream<short>* psSyncCounter;       // Used for global thread synchronization
    CUDAStream<unsigned int>* psRequiredIterations; // Used by SHAKE to communicate whether iteration has converged
    CUDAStream<float>* psShakeReducedMass;  // The reduced mass for each SHAKE constraint
-    CUDAStream<int>* psRigidClusterConstraints; // The constraints in each rigid cluster
    CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster
    CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
    CUDAStream<unsigned int>* psRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.

--- a/platforms/cuda/src/kernels/kCShake.cu
+++ b/platforms/cuda/src/kernels/kCShake.cu
@@ -148,14 +148,13 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
            unsigned int indexInBlock = pos-block*cSim.clusterShakeBlockSize;
            while (block < cSim.rigidClusters)
            {
-                unsigned int firstIndex = cSim.pRigidClusterConstraintIndex[block];
+                unsigned int firstConstraint = cSim.pRigidClusterConstraintIndex[block];
-                unsigned int blockSize = cSim.pRigidClusterConstraintIndex[block+1]-firstIndex;
+                unsigned int blockSize = cSim.pRigidClusterConstraintIndex[block+1]-firstConstraint;
                if (indexInBlock < blockSize)
                {
                    // Load the constraint forces and matrix.
-                    unsigned int constraint = cSim.pRigidClusterConstraints[firstIndex+indexInBlock];
+                    temp[threadIdx.x] = cSim.pLincsSolution[firstConstraint+indexInBlock];
-                    temp[threadIdx.x] = cSim.pLincsSolution[constraint];
                    unsigned int firstMatrixIndex = cSim.pRigidClusterMatrixIndex[block];
                    // Multiply by the matrix.
@@ -163,7 +162,7 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
                    float sum = 0.0f;
                    for (unsigned int i = 0; i < blockSize; i++)
                        sum += temp[threadIdx.x-indexInBlock+i]*cSim.pRigidClusterMatrix[firstMatrixIndex+i*blockSize+indexInBlock];
-                    cSim.pLincsSolution[constraint] = sum;
+                    cSim.pLincsSolution[firstConstraint+indexInBlock] = sum;
                }
                block += (blockDim.x*gridDim.x)/cSim.clusterShakeBlockSize;
            }
@@ -173,6 +172,7 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
        // Update the position of each atom.
        pos = threadIdx.x + blockIdx.x * blockDim.x;
+        float damping = (iteration < 2 ? 0.5f : 1.0f);
        while (pos < cSim.atoms)
        {
            float4 atomPos = atomPositions[pos];
@@ -182,8 +182,10 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
            {
                int index = pos+i*cSim.atoms;
                int constraint = cSim.pLincsAtomConstraints[index];
-                float constraintForce = invMass*cSim.pLincsSolution[constraint];
+                bool forward = (constraint > 0);
-                constraintForce = (cSim.pLincsAtoms[constraint].x == pos ? constraintForce : -constraintForce);
+                constraint = (forward ? constraint-1 : -constraint-1);
+                float constraintForce = damping*invMass*cSim.pLincsSolution[constraint];
+                constraintForce = (forward ? constraintForce : -constraintForce);
                float4 dir = cSim.pLincsDistance[constraint];
                atomPos.x += constraintForce*dir.x;
                atomPos.y += constraintForce*dir.y;
@@ -202,12 +204,14 @@ __global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
        cSim.pSyncCounter[blockIdx.x] = -1;
 }
-static void initInverseMatrices(gpuContext gpu)
+static void initInverseMatrices(gpuContext gpu, bool useNewPositions)
 {
    // Build the inverse constraint matrix for each cluster.
    gpu->psPosq4->Download();
    gpu->psVelm4->Download();
+    if (useNewPositions)
+        gpu->psPosqP4->Download();
    unsigned int elementIndex = 0;
    for (unsigned int i = 0; i < gpu->sim.rigidClusters; i++) {
        // Compute the constraint coupling matrix for this cluster.
@@ -217,9 +221,24 @@ static void initInverseMatrices(gpuContext gpu)
        unsigned int size = endIndex-startIndex;
        vector<float3> r(size);
        for (unsigned int j = 0; j < size; j++) {
-            int2 atoms = (*gpu->psLincsAtoms)[(*gpu->psRigidClusterConstraints)[startIndex+j]];
+            int2 atoms = (*gpu->psLincsAtoms)[startIndex+j];
-            float4 pos1 = (*gpu->psPosq4)[atoms.x];
+            float4 pos1, pos2;
-            float4 pos2 = (*gpu->psPosq4)[atoms.y];
+            if (useNewPositions) {
+                float4 oldpos1 = (*gpu->psPosq4)[atoms.x];
+                float4 oldpos2 = (*gpu->psPosq4)[atoms.y];
+                pos1 = (*gpu->psPosqP4)[atoms.x];
+                pos2 = (*gpu->psPosqP4)[atoms.y];
+                pos1.x += oldpos1.x;
+                pos1.y += oldpos1.y;
+                pos1.z += oldpos1.z;
+                pos2.x += oldpos2.x;
+                pos2.y += oldpos2.y;
+                pos2.z += oldpos2.z;
+            }
+            else {
+                pos1 = (*gpu->psPosq4)[atoms.x];
+                pos2 = (*gpu->psPosq4)[atoms.y];
+            }
            r[j] = make_float3(pos1.x-pos2.x, pos1.y-pos2.y, pos1.z-pos2.z);
            float invLength = 1.0f/sqrt(r[j].x*r[j].x + r[j].y*r[j].y + r[j].z*r[j].z);
            r[j].x *= invLength;
@@ -228,11 +247,9 @@ static void initInverseMatrices(gpuContext gpu)
        }
        Array2D<double> matrix(size, size);
        for (int j = 0; j < (int)size; j++) {
-            int constraintj = (*gpu->psRigidClusterConstraints)[startIndex+j];
+            int2 atomsj = (*gpu->psLincsAtoms)[startIndex+j];
-            int2 atomsj = (*gpu->psLincsAtoms)[constraintj];
            for (int k = 0; k < (int)size; k++) {
-                int constraintk = (*gpu->psRigidClusterConstraints)[startIndex+k];
+                int2 atomsk = (*gpu->psLincsAtoms)[startIndex+k];
-                int2 atomsk = (*gpu->psLincsAtoms)[constraintk];
                float invMassj0 = (*gpu->psVelm4)[atomsj.x].w;
                float invMassj1 = (*gpu->psVelm4)[atomsj.y].w;
                double dot = r[j].x*r[k].x + r[j].y*r[k].y + r[j].z*r[k].z;
@@ -275,10 +292,10 @@ static void initInverseMatrices(gpuContext gpu)
        (*gpu->psRigidClusterMatrixIndex)[i] = elementIndex;
        for (int j = 0; j < (int)size; j++)
        {
-            float distance1 = (*gpu->psLincsDistance)[(*gpu->psRigidClusterConstraints)[startIndex+j]].w;
+            float distance1 = (*gpu->psLincsDistance)[startIndex+j].w;
            for (int k = 0; k < (int)size; k++)
            {
-                float distance2 = (*gpu->psLincsDistance)[(*gpu->psRigidClusterConstraints)[startIndex+k]].w;
+                float distance2 = (*gpu->psLincsDistance)[startIndex+k].w;
                (*gpu->psRigidClusterMatrix)[elementIndex++] = (float)(matrix[k][j]*distance1/distance2);
            }
        }
@@ -286,7 +303,6 @@ static void initInverseMatrices(gpuContext gpu)
    (*gpu->psRigidClusterMatrixIndex)[gpu->sim.rigidClusters] = elementIndex;
    gpu->psRigidClusterMatrix->Upload();
    gpu->psRigidClusterMatrixIndex->Upload();
-    gpu->hasInitializedRigidClusters = true;
 }
 void kApplyFirstCShake(gpuContext gpu)
@@ -295,9 +311,20 @@ void kApplyFirstCShake(gpuContext gpu)
    if (gpu->sim.lincsConstraints > 0)
    {
        if (!gpu->hasInitializedRigidClusters)
-            initInverseMatrices(gpu);
+        {
+            // Build preliminary constraint matrices for use on this call.
+            initInverseMatrices(gpu, false);
+        }
        kApplyCShake_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block, 4*gpu->sim.lincs_threads_per_block>>>(gpu->sim.pPosqP, true);
        LAUNCHERROR("kApplyCShake");
+        if (!gpu->hasInitializedRigidClusters)
+        {
+            // Rebuild the constraint matrices, now that we know all constraints are really satisfied.
+            initInverseMatrices(gpu, true);
+            gpu->hasInitializedRigidClusters = true;
+        }
    }
 }

--- a/platforms/cuda/src/kernels/kLincs.cu
+++ b/platforms/cuda/src/kernels/kLincs.cu
@@ -101,9 +101,11 @@ __global__ void kSolveLincsMatrix_kernel(float4* atomPositions)
        {
            int index = pos+i*cSim.atoms;
            int constraint = cSim.pLincsAtomConstraints[index];
+            bool forward = (constraint > 0);
+            constraint = (forward ? constraint-1 : -constraint-1);
            float4 dir = cSim.pLincsDistance[constraint];
            float c = invMass*cSim.pLincsS[constraint]*cSim.pLincsSolution[constraint];
-            c = (cSim.pLincsAtoms[constraint].x == pos ? -c : c);
+            c = (forward ? -c : c);
            atomPos.x += c*dir.x;
            atomPos.y += c*dir.y;
            atomPos.z += c*dir.z;