Completed Cuda implementation of CSHAKE algorithm

f329a047 · Peter Eastman · 4ef9536e · f329a047 · f329a047 · f329a047
Commit f329a047 authored May 13, 2009 by Peter Eastman
7 changed files
--- a/platforms/cuda/sharedTarget/CMakeLists.txt
+++ b/platforms/cuda/sharedTarget/CMakeLists.txt
@@ -13,6 +13,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
    CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/include)
    CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src)
 ENDFOREACH(subdir)
+CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/jama/include)
 CUDA_ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})

--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -356,6 +356,9 @@ struct cudaGmxSimulation {
    unsigned int    ShakeConstraints;               // Total number of Shake constraints
    unsigned int    settleConstraints;              // Total number of Settle constraints
    unsigned int    lincsConstraints;               // Total number of LINCS constraints.
+    unsigned int    rigidClusters;                  // Total number of rigid clusters
+    unsigned int    maxRigidClusterSize;            // The size of the largest rigid cluster
+    unsigned int    clusterShakeBlockSize;          // The number of threads to process each rigid cluster
    unsigned int    NonShakeConstraints;            // Total number of NonShake atoms
    unsigned int    maxShakeIterations;             // Maximum shake iterations
    unsigned int    degreesOfFreedom;               // Number of degrees of freedom in system
@@ -391,6 +394,10 @@ struct cudaGmxSimulation {
    short*          pSyncCounter;                   // Used for global thread synchronization
    unsigned int*   pRequiredIterations;            // Used by SHAKE to communicate whether iteration has converged
    float*          pShakeReducedMass;              // The reduced mass for each SHAKE constraint
+    int*            pRigidClusterConstraints;       // The constraints in each rigid cluster
+    float*          pRigidClusterMatrix;            // The inverse constraint matrix for each rigid cluster
+    unsigned int*   pRigidClusterConstraintIndex;   // The index of each cluster in the stream containing cluster constraints.
+    unsigned int*   pRigidClusterMatrixIndex;       // The index of each cluster in the stream containing cluster matrices.
    // Mutable stuff
    float4*         pPosq;                          // Pointer to atom positions and charges

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -40,6 +40,7 @@
 #include <sstream>
 #include <cmath>
 #include <map>
+#include <set>
 #include <algorithm>
 #ifdef WIN32
  #include <windows.h>
@@ -121,8 +122,6 @@ static const float BOLTZ                    =    (RGAS / KILO);            // (k
 #define DUMP_PARAMETERS 0
-#define DeltaShake
 extern "C"
 void gpuSetBondParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& length, const vector<float>& k)
 {
@@ -458,7 +457,7 @@ void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDie
    gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
 }
-void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster>& allClusters, vector<bool>& invalidForShake)
+static void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster>& allClusters, vector<bool>& invalidForShake)
 {
    cluster.valid = false;
    invalidForShake[cluster.centralID] = true;
@@ -470,6 +469,116 @@ void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster>& allC
    }
 }
+static void findRigidClusters(gpuContext gpu, const vector<int>& firstAtom, const vector<int>& secondAtom, const vector<int>& constraintIndices)
+{
+    vector<map<int, int> > atomConstraints(firstAtom.size());
+    for (int i = 0; i < constraintIndices.size(); i++) {
+        atomConstraints[firstAtom[i]][secondAtom[i]] = constraintIndices[i];
+        atomConstraints[secondAtom[i]][firstAtom[i]] = constraintIndices[i];
+    }
+    vector<vector<int> > rigidClusters;
+    int totalConstraints = 0;
+    int totalMatrixElements = 0;
+    for (int i = 0; i < firstAtom.size(); i++) {
+        if (atomConstraints[i].size() < 2)
+            continue;
+        // Begin by looking for a triangle this atom is part of.
+        set<int> atoms;
+        atoms.insert(i);
+        for (map<int, int>::const_iterator atom1 = atomConstraints[i].begin(); atom1 != atomConstraints[i].end() && atoms.size() == 1; ++atom1) {
+            for (map<int, int>::const_iterator atom2 = atomConstraints[atom1->first].begin(); atom2 != atomConstraints[atom1->first].end(); ++atom2) {
+                if (atomConstraints[i].count(atom2->first) != 0) {
+                    atoms.insert(atom1->first);
+                    atoms.insert(atom2->first);
+                    break;
+                }
+            }
+        }
+        if (atoms.size() == 1)
+            continue;
+        // We have three atoms that are part of a cluster, so look for other atoms we can add.
+        bool done = false;
+        while (!done) {
+           done = true;
+            for (set<int>::const_iterator atom1 = atoms.begin(); atom1 != atoms.end(); ++atom1) {
+                for (map<int, int>::const_iterator atom2 = atomConstraints[*atom1].begin(); atom2 != atomConstraints[*atom1].end(); ++atom2) {
+                    if (atoms.find(atom2->first) != atoms.end())
+                        continue; // This atom is already in the cluster.
+                    // See if this atom is linked to three other atoms in the cluster.
+                    int linkCount = 0;
+                    for (map<int, int>::const_iterator atom3 = atomConstraints[atom2->first].begin(); atom3 != atomConstraints[atom2->first].end(); ++atom3)
+                        if (atoms.find(atom3->first) != atoms.end())
+                            linkCount++;
+                    if (linkCount > 2) {
+                        atoms.insert(atom2->first);
+                        done = false;
+                    }
+                }
+            }
+        }
+        // Record the cluster.
+        vector<int> constraints;
+        for (set<int>::const_iterator atom1 = atoms.begin(); atom1 != atoms.end(); ++atom1) {
+            for (map<int, int>::const_iterator atom2 = atomConstraints[*atom1].begin(); atom2 != atomConstraints[*atom1].end(); ++atom2) {
+                if (*atom1 < atom2->first && atoms.find(atom2->first) != atoms.end())
+                    constraints.push_back(atom2->second);
+            }
+        }
+        rigidClusters.push_back(constraints);
+        totalConstraints += constraints.size();
+        totalMatrixElements += constraints.size()*constraints.size();
+        for (set<int>::const_iterator atom1 = atoms.begin(); atom1 != atoms.end(); ++atom1) {
+            for (map<int, int>::const_iterator atom2 = atomConstraints[*atom1].begin(); atom2 != atomConstraints[*atom1].end(); ++atom2)
+                atomConstraints[atom2->first].erase(*atom1);
+            atomConstraints[*atom1].clear();
+        }
+    }
+    // Build the CUDA streams.
+    CUDAStream<int>* psRigidClusterConstraints = new CUDAStream<int>(totalConstraints, 1, "RigidClusterConstraints");
+    gpu->psRigidClusterConstraints             = psRigidClusterConstraints;
+    gpu->sim.pRigidClusterConstraints          = psRigidClusterConstraints->_pDevData;
+    CUDAStream<unsigned int>* psRigidClusterConstraintIndex = new CUDAStream<unsigned int>((int) rigidClusters.size()+1, 1, "RigidClusterConstraintIndex");
+    gpu->psRigidClusterConstraintIndex                      = psRigidClusterConstraintIndex;
+    gpu->sim.pRigidClusterConstraintIndex                   = psRigidClusterConstraintIndex->_pDevData;
+    CUDAStream<float>* psRigidClusterMatrix = new CUDAStream<float>(totalMatrixElements, 1, "RigidClusterMatrix");
+    gpu->psRigidClusterMatrix       = psRigidClusterMatrix;
+    gpu->sim.pRigidClusterMatrix    = psRigidClusterMatrix->_pDevData;
+    CUDAStream<unsigned int>* psRigidClusterMatrixIndex = new CUDAStream<unsigned int>((int) rigidClusters.size()+1, 1, "RigidClusterMatrixIndex");
+    gpu->psRigidClusterMatrixIndex                      = psRigidClusterMatrixIndex;
+    gpu->sim.pRigidClusterMatrixIndex                   = psRigidClusterMatrixIndex->_pDevData;
+    unsigned int constraintIndex = 0;
+    unsigned int maxClusterSize = 0;
+    for (unsigned int i = 0; i < rigidClusters.size(); i++) {
+        vector<int>& cluster = rigidClusters[i];
+        (*psRigidClusterConstraintIndex)[i] = constraintIndex;
+        for (unsigned int j = 0; j < cluster.size(); j++)
+            (*psRigidClusterConstraints)[constraintIndex++] = cluster[j];
+        if (cluster.size() > maxClusterSize)
+            maxClusterSize = cluster.size();
+    }
+    (*psRigidClusterConstraintIndex)[rigidClusters.size()] = constraintIndex;
+    gpu->sim.rigidClusters = rigidClusters.size();
+    gpu->sim.maxRigidClusterSize = maxClusterSize;
+    gpu->sim.clusterShakeBlockSize = 1;
+    while (gpu->sim.clusterShakeBlockSize < maxClusterSize)
+        gpu->sim.clusterShakeBlockSize *= 2;
+    if (gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize != 0)
+        gpu->sim.lincs_threads_per_block += gpu->sim.clusterShakeBlockSize - gpu->sim.lincs_threads_per_block%gpu->sim.clusterShakeBlockSize;
+    psRigidClusterConstraints->Upload();
+    psRigidClusterConstraintIndex->Upload();
+    gpu->hasInitializedRigidClusters = false;
+}
 extern "C"
 void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& distance,
        const vector<float>& invMass1, const vector<float>& invMass2, float shakeTolerance, unsigned int lincsTerms)
@@ -730,7 +839,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    CUDAStream<float>* psLincsSolution = new CUDAStream<float>(numLincs, 1, "LincsSolution");
    gpu->psLincsSolution             = psLincsSolution;
    gpu->sim.pLincsSolution          = psLincsSolution->_pDevData;
-    CUDAStream<short>* psSyncCounter = new CUDAStream<short>(2*gpu->sim.blocks, 1, "SyncCounter");
+    CUDAStream<short>* psSyncCounter = new CUDAStream<short>(3*gpu->sim.blocks, 1, "SyncCounter");
    gpu->psSyncCounter               = psSyncCounter;
    gpu->sim.pSyncCounter            = psSyncCounter->_pDevData;
    CUDAStream<unsigned int>* psRequiredIterations = new CUDAStream<unsigned int>(1, 1, "RequiredIterations");
@@ -774,14 +883,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    if (gpu->sim.lincs_threads_per_block < gpu->sim.blocks)
        gpu->sim.lincs_threads_per_block = gpu->sim.blocks;
+    // Identify rigid clusters of atoms.
-    gpu->psLincsNumConnections->Download();
+    findRigidClusters(gpu, atom1, atom2, lincsConstraints);
-    gpu->psLincsConnections->Download();
-    gpu->psLincsCoupling->Download();
-#ifdef DeltaShake
    // count number of atoms w/o constraint
@@ -820,7 +924,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    } else {
       gpu->sim.nonshake_threads_per_block           = 0;
    }
-#endif
 }
 extern "C"
@@ -1152,6 +1255,10 @@ void* gpuInit(int numAtoms)
    gpu->psSyncCounter              = NULL;
    gpu->psRequiredIterations       = NULL;
    gpu->psShakeReducedMass         = NULL;
+    gpu->psRigidClusterConstraints  = NULL;
+    gpu->psRigidClusterConstraintIndex = NULL;
+    gpu->psRigidClusterMatrix       = NULL;
+    gpu->psRigidClusterMatrixIndex  = NULL;
    // Initialize output buffer before reading parameters
    gpu->pOutputBufferCounter       = new unsigned int[gpu->sim.paddedNumberOfAtoms];
@@ -1304,6 +1411,10 @@ void gpuShutDown(gpuContext gpu)
    delete gpu->psSyncCounter;
    delete gpu->psRequiredIterations;
    delete gpu->psShakeReducedMass;
+    delete gpu->psRigidClusterConstraints;
+    delete gpu->psRigidClusterConstraintIndex;
+    delete gpu->psRigidClusterMatrix;
+    delete gpu->psRigidClusterMatrixIndex;
    if (gpu->cudpp != 0)
        cudppDestroyPlan(gpu->cudpp);

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -80,6 +80,7 @@ struct _gpuContext {
    bool bRecalculateBornRadii;
    bool bOutputBufferPerWarp;
    bool bIncludeGBSA;
+    bool hasInitializedRigidClusters;
    unsigned long seed;
    SM_VERSION sm_version;
    CUDPPHandle cudpp;
@@ -143,6 +144,10 @@ struct _gpuContext {
    CUDAStream<short>* psSyncCounter;       // Used for global thread synchronization
    CUDAStream<unsigned int>* psRequiredIterations; // Used by SHAKE to communicate whether iteration has converged
    CUDAStream<float>* psShakeReducedMass;  // The reduced mass for each SHAKE constraint
+    CUDAStream<int>* psRigidClusterConstraints; // The constraints in each rigid cluster
+    CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster
+    CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
+    CUDAStream<unsigned int>* psRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.
 };
 typedef struct _gpuContext *gpuContext;

--- a/platforms/cuda/src/kernels/kCShake.cu
+++ b/platforms/cuda/src/kernels/kCShake.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include <cuda.h>
+#include <vector_functions.h>
+#include <vector>
+#include "jama_svd.h"
+#include "gputypes.h"
+using namespace std;
+using TNT::Array2D;
+using JAMA::SVD;
+static __constant__ cudaGmxSimulation cSim;
+void SetCShakeSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+void GetCShakeSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+/**
+ * Synchronize all threads across all blocks.
+ */
+__device__ void kSyncAllThreads_kernel(short* syncCounter, short newCount)
+{
+    __syncthreads();
+    if (threadIdx.x == 0)
+        syncCounter[blockIdx.x] = newCount;
+    if (threadIdx.x < gridDim.x)
+    {
+        volatile short* counter = &syncCounter[threadIdx.x];
+        do
+        {
+        } while (*counter != newCount);
+    }
+    __syncthreads();
+}
+__global__ void kApplyCShake_kernel(float4* atomPositions, bool addOldPosition)
+{
+    extern __shared__ float temp[];
+    // Initialize counters used for monitoring convergence and doing global thread synchronization.
+    __shared__ unsigned int requiredIterations;
+    if (threadIdx.x == 0)
+    {
+        requiredIterations = 0;
+        cSim.pSyncCounter[gridDim.x+blockIdx.x] = -1;
+        cSim.pSyncCounter[2*gridDim.x+blockIdx.x] = -1;
+        cSim.pRequiredIterations[0] = 0;
+    }
+    // Calculate the direction of each constraint.
+    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
+    while (pos < cSim.lincsConstraints)
+    {
+        int2 atoms = cSim.pLincsAtoms[pos];
+        float4 dir = cSim.pLincsDistance[pos];
+        float4 oldPos1 = cSim.pOldPosq[atoms.x];
+        float4 oldPos2 = cSim.pOldPosq[atoms.y];
+        dir.x = oldPos1.x-oldPos2.x;
+        dir.y = oldPos1.y-oldPos2.y;
+        dir.z = oldPos1.z-oldPos2.z;
+        cSim.pLincsDistance[pos] = dir;
+        pos += blockDim.x*gridDim.x;
+    }
+    __syncthreads();
+    // Iteratively update the atom positions.
+    unsigned int maxIterations = 150;
+    float lowerTol = 1.0f-2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
+    float upperTol = 1.0f+2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
+    for (unsigned int iteration = 0; iteration < maxIterations && iteration == requiredIterations; iteration++)
+    {
+        // Calculate the constraint force for each constraint.
+        pos = threadIdx.x + blockIdx.x * blockDim.x;
+        while (pos < cSim.lincsConstraints)
+        {
+            int2 atoms = cSim.pLincsAtoms[pos];
+            float4 delta1 = atomPositions[atoms.x];
+            float4 delta2 = atomPositions[atoms.y];
+            float4 dir = cSim.pLincsDistance[pos];
+            float3 rp_ij = make_float3(delta1.x-delta2.x, delta1.y-delta2.y, delta1.z-delta2.z);
+            if (addOldPosition)
+            {
+                rp_ij.x += dir.x;
+                rp_ij.y += dir.y;
+                rp_ij.z += dir.z;
+            }
+            float rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
+            float dist2 = dir.w*dir.w;
+            float diff = dist2 - rp2;
+            float rrpr  = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
+            float d_ij2  = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
+            float reducedMass = cSim.pShakeReducedMass[pos];
+            cSim.pLincsSolution[pos] = (rrpr > d_ij2*1e-6f ? reducedMass*diff/rrpr : 0.0f);
+            if (requiredIterations == iteration && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2))
+                requiredIterations = iteration+1;
+            pos += blockDim.x * gridDim.x;
+        }
+        kSyncAllThreads_kernel(cSim.pSyncCounter, iteration);
+        if (threadIdx.x == 0 && requiredIterations > iteration)
+            cSim.pRequiredIterations[0] = requiredIterations;
+        // Multiply by the inverse constraint matrix for each rigid cluster.
+        if (cSim.rigidClusters > 0)
+        {
+            pos = threadIdx.x + blockIdx.x * blockDim.x;
+            unsigned int block = pos/cSim.clusterShakeBlockSize;
+            unsigned int indexInBlock = pos-block*cSim.clusterShakeBlockSize;
+            while (block < cSim.rigidClusters)
+            {
+                unsigned int firstIndex = cSim.pRigidClusterConstraintIndex[block];
+                unsigned int blockSize = cSim.pRigidClusterConstraintIndex[block+1]-firstIndex;
+                if (indexInBlock < blockSize)
+                {
+                    // Load the constraint forces and matrix.
+                    unsigned int constraint = cSim.pRigidClusterConstraints[firstIndex+indexInBlock];
+                    temp[threadIdx.x] = cSim.pLincsSolution[constraint];
+                    unsigned int firstMatrixIndex = cSim.pRigidClusterMatrixIndex[block];
+                    // Multiply by the matrix.
+                    float sum = 0.0f;
+                    for (unsigned int i = 0; i < blockSize; i++)
+                        sum += temp[threadIdx.x-indexInBlock+i]*cSim.pRigidClusterMatrix[firstMatrixIndex+i*blockSize+indexInBlock];
+                    cSim.pLincsSolution[constraint] = sum;
+                }
+                block += (blockDim.x*gridDim.x)/cSim.clusterShakeBlockSize;
+            }
+            kSyncAllThreads_kernel(&cSim.pSyncCounter[gridDim.x], iteration);
+        }
+        // Update the position of each atom.
+        pos = threadIdx.x + blockIdx.x * blockDim.x;
+        while (pos < cSim.atoms)
+        {
+            float4 atomPos = atomPositions[pos];
+            float invMass = cSim.pVelm4[pos].w;
+            int num = cSim.pLincsNumAtomConstraints[pos];
+            for (int i = 0; i < num; i++)
+            {
+                int index = pos+i*cSim.atoms;
+                int constraint = cSim.pLincsAtomConstraints[index];
+                float constraintForce = invMass*cSim.pLincsSolution[constraint];
+                constraintForce = (cSim.pLincsAtoms[constraint].x == pos ? constraintForce : -constraintForce);
+                float4 dir = cSim.pLincsDistance[constraint];
+                atomPos.x += constraintForce*dir.x;
+                atomPos.y += constraintForce*dir.y;
+                atomPos.z += constraintForce*dir.z;
+            }
+            atomPositions[pos] = atomPos;
+            pos += blockDim.x*gridDim.x;
+        }
+        kSyncAllThreads_kernel(&cSim.pSyncCounter[2*gridDim.x], iteration);
+        requiredIterations = cSim.pRequiredIterations[0];
+    }
+    // Reset the initial sync counter to be ready for the next call.
+    if (threadIdx.x == 0)
+        cSim.pSyncCounter[blockIdx.x] = -1;
+}
+static void initInverseMatrices(gpuContext gpu)
+{
+    // Build the inverse constraint matrix for each cluster.
+    gpu->psPosq4->Download();
+    gpu->psVelm4->Download();
+    unsigned int elementIndex = 0;
+    for (unsigned int i = 0; i < gpu->sim.rigidClusters; i++) {
+        // Compute the constraint coupling matrix for this cluster.
+        unsigned int startIndex = (*gpu->psRigidClusterConstraintIndex)[i];
+        unsigned int endIndex = (*gpu->psRigidClusterConstraintIndex)[i+1];
+        unsigned int size = endIndex-startIndex;
+        vector<float3> r(size);
+        for (unsigned int j = 0; j < size; j++) {
+            int2 atoms = (*gpu->psLincsAtoms)[(*gpu->psRigidClusterConstraints)[startIndex+j]];
+            float4 pos1 = (*gpu->psPosq4)[atoms.x];
+            float4 pos2 = (*gpu->psPosq4)[atoms.y];
+            r[j] = make_float3(pos1.x-pos2.x, pos1.y-pos2.y, pos1.z-pos2.z);
+            float invLength = 1.0f/sqrt(r[j].x*r[j].x + r[j].y*r[j].y + r[j].z*r[j].z);
+            r[j].x *= invLength;
+            r[j].y *= invLength;
+            r[j].z *= invLength;
+        }
+        Array2D<double> matrix(size, size);
+        for (unsigned int j = 0; j < size; j++) {
+            int constraintj = (*gpu->psRigidClusterConstraints)[startIndex+j];
+            int2 atomsj = (*gpu->psLincsAtoms)[constraintj];
+            for (unsigned int k = 0; k < size; k++) {
+                int constraintk = (*gpu->psRigidClusterConstraints)[startIndex+k];
+                int2 atomsk = (*gpu->psLincsAtoms)[constraintk];
+                float invMassj0 = (*gpu->psVelm4)[atomsj.x].w;
+                float invMassj1 = (*gpu->psVelm4)[atomsj.y].w;
+                double dot = r[j].x*r[k].x + r[j].y*r[k].y + r[j].z*r[k].z;
+                if (atomsj.x == atomsk.x)
+                    dot *= invMassj0/(invMassj0+invMassj1);
+                else if (atomsj.y == atomsk.y)
+                    dot *= invMassj1/(invMassj0+invMassj1);
+                else if (atomsj.x == atomsk.y)
+                    dot *= -invMassj0/(invMassj0+invMassj1);
+                else if (atomsj.y == atomsk.x)
+                    dot *= -invMassj1/(invMassj0+invMassj1);
+                else
+                    dot = 0.0;
+                matrix[j][k] = dot;
+            }
+            matrix[j][j] = 1.0;
+        }
+        // Invert it using SVD.
+        Array2D<double> u, v;
+        Array1D<double> w;
+        SVD<double> svd(matrix);
+        svd.getU(u);
+        svd.getV(v);
+        svd.getSingularValues(w);
+        double singularValueCutoff = 0.01*w[0];
+        for (unsigned int j = 0; j < size; j++)
+            w[j] = (w[j] < singularValueCutoff ? 0.0 : 1.0/w[j]);
+        for (unsigned int j = 0; j < size; j++) {
+            for (unsigned int k = 0; k < size; k++) {
+                matrix[j][k] = 0.0;
+                for (unsigned int m = 0; m < size; m++)
+                    matrix[j][k] += v[j][m]*w[m]*u[k][m];
+            }
+        }
+        // Record the inverted matrix.
+        (*gpu->psRigidClusterMatrixIndex)[i] = elementIndex;
+        for (unsigned int j = 0; j < size; j++)
+        {
+            float distance1 = (*gpu->psLincsDistance)[(*gpu->psRigidClusterConstraints)[startIndex+j]].w;
+            for (unsigned int k = 0; k < size; k++)
+            {
+                float distance2 = (*gpu->psLincsDistance)[(*gpu->psRigidClusterConstraints)[startIndex+k]].w;
+                (*gpu->psRigidClusterMatrix)[elementIndex++] = matrix[k][j]*distance1/distance2;
+            }
+        }
+    }
+    (*gpu->psRigidClusterMatrixIndex)[gpu->sim.rigidClusters] = elementIndex;
+    gpu->psRigidClusterMatrix->Upload();
+    gpu->psRigidClusterMatrixIndex->Upload();
+    gpu->hasInitializedRigidClusters = true;
+}
+void kApplyFirstCShake(gpuContext gpu)
+{
+//    printf("kApplyFirstCShake\n");
+    if (gpu->sim.lincsConstraints > 0)
+    {
+        if (!gpu->hasInitializedRigidClusters)
+            initInverseMatrices(gpu);
+        kApplyCShake_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block, 4*gpu->sim.lincs_threads_per_block>>>(gpu->sim.pPosqP, true);
+        LAUNCHERROR("kApplyCShake");
+    }
+}
+void kApplySecondCShake(gpuContext gpu)
+{
+//    printf("kApplySecondCShake\n");
+    if (gpu->sim.lincsConstraints > 0)
+    {
+        kApplyCShake_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block, 4*gpu->sim.lincs_threads_per_block>>>(gpu->sim.pPosq, false);
+        LAUNCHERROR("kApplyCShake");
+    }
+}
--- a/platforms/cuda/staticTarget/CMakeLists.txt
+++ b/platforms/cuda/staticTarget/CMakeLists.txt
@@ -13,6 +13,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
    CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/include)
    CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src)
 ENDFOREACH(subdir)
+CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/jama/include)
 CUDA_ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})

--- a/platforms/reference/src/SimTKReference/ReferenceRigidShakeAlgorithm.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceRigidShakeAlgorithm.cpp
@@ -317,8 +317,6 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
   static const RealOpenMM zero        =  0.0;
   static const RealOpenMM one         =  1.0;
   static const RealOpenMM two         =  2.0;
-   static const RealOpenMM three       =  3.0;
-   static const RealOpenMM oneM        = -1.0;
   static const RealOpenMM half        =  0.5;
   static const RealOpenMM epsilon6    = (RealOpenMM) 1.0e-06;
@@ -327,8 +325,6 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
   // ---------------------------------------------------------------------------------------
-   int numberOfConstraints = getNumberOfConstraints();
   // temp arrays
   RealOpenMM** r_ij                   = _r_ij;
@@ -345,7 +341,6 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
         int atomJ          = _atomIndices[ii][1];
         reducedMasses[ii]  = half/( inverseMasses[atomI] + inverseMasses[atomJ] );
      }
-      vector<double> temp;
      for (unsigned int i = 0; i < _rigidClusters.size(); i++) {
          // Compute the constraint coupling matrix for this cluster.
@@ -364,8 +359,8 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
              r[j][2] *= invLength;
          }
          Array2D<double> matrix(size, size);
-          for (int j = 0; j < (int)size; j++) {
+          for (unsigned int j = 0; j < size; j++) {
-              for (int k = 0; k < (int)size; k++) {
+              for (unsigned int k = 0; k < size; k++) {
                  double dot;
                  int atomj0 = _atomIndices[cluster[j]][0];
                  int atomj1 = _atomIndices[cluster[j]][1];
@@ -395,23 +390,21 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
          svd.getV(v);
          svd.getSingularValues(w);
          double singularValueCutoff = 0.01*w[0];
-          for (int j = 0; j < (int)size; j++)
+          for (unsigned int j = 0; j < size; j++)
              w[j] = (w[j] < singularValueCutoff ? 0.0 : 1.0/w[j]);
-          if (temp.size() < size)
+          for (unsigned int j = 0; j < size; j++) {
-              temp.resize(size);
+              for (unsigned int k = 0; k < size; k++) {
-          for (int j = 0; j < (int)size; j++) {
-              for (int k = 0; k < (int)size; k++) {
                  matrix[j][k] = 0.0;
-                  for (int m = 0; m < (int)size; m++)
+                  for (unsigned int m = 0; m < size; m++)
                      matrix[j][k] += v[j][m]*w[m]*u[k][m];
              }
          }
          // Record the inverted matrix.
-          for (int j = 0; j < (int)size; j++)
+          for (unsigned int j = 0; j < size; j++)
-              for (int k = 0; k < (int)size; k++)
+              for (unsigned int k = 0; k < size; k++)
-                  _matrices[i][j][k] = (RealOpenMM)matrix[j][k];
+                  _matrices[i][j][k] = (RealOpenMM)matrix[j][k]*_distance[cluster[k]]/_distance[cluster[j]];
      }
   }
@@ -478,12 +471,12 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
          unsigned int size = cluster.size();
          if (size > tempForce.size())
              tempForce.resize(size);
-          for (int j = 0; j < (int)size; j++) {
+          for (unsigned int j = 0; j < size; j++) {
              tempForce[j] = zero;
-              for (int k = 0; k < (int)size; k++)
+              for (unsigned int k = 0; k < size; k++)
-                  tempForce[j] += matrix[j][k]*constraintForce[cluster[k]]*_distance[cluster[k]]/_distance[cluster[j]];
+                  tempForce[j] += matrix[j][k]*constraintForce[cluster[k]];
          }
-          for (int j = 0; j < (int)size; j++)
+          for (unsigned int j = 0; j < size; j++)
              constraintForce[cluster[j]] = tempForce[j];
      }
@@ -499,15 +492,6 @@ RealOpenMM damping = one;//(RealOpenMM) (iterations%2 == 0 ? 0.5 : 1.0);
         }
      }
   }
-//   static int sum = 0;
-//   static int count = 0;
-//   sum += iterations;
-//   count++;
-//   if (count == 100) {
-//       printf("%d iterations\n", sum);
-//       sum = 0;
-//       count = 0;
-//   }
   // diagnostics