Continuing implementation of LINCS for Cuda platform

968cb132 · Peter Eastman · 1b9ab82b · 968cb132 · 968cb132 · 968cb132
Commit 968cb132 authored Apr 08, 2009 by Peter Eastman
12 changed files
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -372,7 +372,7 @@ static void initializeIntegration(const System& system, CudaPlatform::PlatformDa
        invMass1[i] = 1.0f/mass[particle1Index];
        invMass2[i] = 1.0f/mass[particle2Index];
    }
-    gpuSetShakeParameters(gpu, particle1, particle2, distance, invMass1, invMass2, integrator.getConstraintTolerance());
+    gpuSetConstraintParameters(gpu, particle1, particle2, distance, invMass1, invMass2, integrator.getConstraintTolerance(), 4);
    // Initialize any terms that haven't already been handled by a Force.

--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -352,6 +352,7 @@ struct cudaGmxSimulation {
    unsigned int    maxShakeIterations;             // Maximum shake iterations
    unsigned int    degreesOfFreedom;               // Number of degrees of freedom in system
    float           shakeTolerance;                 // Shake tolerance
+    unsigned int    lincsTerms;                     // Number of terms in the matrix expansion for LINCS
    float           InvMassJ;                       // Shake inverse mass for hydrogens
    int*            pNonShakeID;                    // Not Shaking atoms
    int4*           pShakeID;                       // Shake atoms and phase

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -59,22 +59,21 @@ struct ShakeCluster {
    int centralID;
    int peripheralID[3];
    int size;
+    bool valid;
    float distance;
    float centralInvMass, peripheralInvMass;
-    ShakeCluster() {
+    ShakeCluster() : valid(true) {
    }
-    ShakeCluster(int centralID, float invMass) : centralID(centralID), centralInvMass(invMass), size(0) {
+    ShakeCluster(int centralID, float invMass) : centralID(centralID), centralInvMass(invMass), size(0), valid(true) {
    }
    void addAtom(int id, float dist, float invMass) {
-        if (size == 3)
+        if (size == 3 || (size > 0 && dist != distance) || (size > 0 && invMass != peripheralInvMass))
-            throw OpenMMException("A single atom may only have three constraints");
+            valid = false;
-        if (size > 0 && dist != distance)
+        else {
-            throw OpenMMException("All constraints for a central atom must have the same distance");
+            peripheralID[size++] = id;
-        if (size > 0 && invMass != peripheralInvMass)
+            distance = dist;
-            throw OpenMMException("All constraints for a central atom must have the same mass");
+            peripheralInvMass = invMass;
-        peripheralID[size++] = id;
+        }
-        distance = dist;
-        peripheralInvMass = invMass;
    }
 };
@@ -457,10 +456,26 @@ void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDie
    gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
 }
+void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster>& allClusters, vector<bool>& invalidForShake)
+{
+    cluster.valid = false;
+    invalidForShake[cluster.centralID] = true;
+    for (int i = 0; i < cluster.size; i++) {
+        invalidForShake[cluster.peripheralID[i]] = true;
+        map<int, ShakeCluster>::iterator otherCluster = allClusters.find(cluster.peripheralID[i]);
+        if (otherCluster != allClusters.end() && otherCluster->second.valid)
+            markShakeClusterInvalid(otherCluster->second, allClusters, invalidForShake);
+    }
+}
 extern "C"
-void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& distance,
+void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& distance,
-        const vector<float>& invMass1, const vector<float>& invMass2, float tolerance)
+        const vector<float>& invMass1, const vector<float>& invMass2, float shakeTolerance, unsigned int lincsTerms)
 {
+    // Create a vector for recording which atoms are handled by SHAKE (or SETTLE).
+    vector<bool> isShakeAtom(gpu->natoms, false);
    // Find how many constraints each atom is involved in.
    vector<int> constraintCount(gpu->natoms, 0);
@@ -507,7 +522,7 @@ void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vecto
    gpu->psSettleParameter                = psSettleParameter;
    gpu->sim.pSettleParameter             = psSettleParameter->_pDevStream[0];
    gpu->sim.settleConstraints            = settleClusters.size();
-    for (int i = 0; i < settleClusters.size(); i++) {
+      for (int i = 0; i < settleClusters.size(); i++) {
        int atom1 = settleClusters[i];
        int atom2 = settleConstraints[atom1].begin()->first;
        int atom3 = (++settleConstraints[atom1].begin())->first;
@@ -537,6 +552,9 @@ void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vecto
        }
        else
            throw OpenMMException("Two of the three distances constrained with SETTLE must be the same.");
+        isShakeAtom[atom1] = true;
+        isShakeAtom[atom2] = true;
+        isShakeAtom[atom3] = true;
    }
    psSettleID->Upload();
    psSettleParameter->Upload();
@@ -546,11 +564,111 @@ void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vecto
    if (gpu->sim.settle_threads_per_block < 1)
        gpu->sim.settle_threads_per_block = 1;
+    // Find clusters consisting of a central atom with up to three peripheral atoms.
+    map<int, ShakeCluster> clusters;
+    vector<bool> invalidForShake(gpu->natoms, false);
+    for (int i = 0; i < atom1.size(); i++) {
+        if (isShakeAtom[atom1[i]])
+            continue; // This is being taken care of with SETTLE.
+        // Determine which is the central atom.
+        bool firstIsCentral;
+        if (constraintCount[atom1[i]] > 1)
+            firstIsCentral = true;
+        else if (constraintCount[atom2[i]] > 1)
+            firstIsCentral = false;
+        else if (atom1[i] < atom2[i])
+            firstIsCentral = true;
+        else
+            firstIsCentral = false;
+        int centralID, peripheralID;
+        float centralInvMass, peripheralInvMass;
+        if (firstIsCentral) {
+            centralID = atom1[i];
+            peripheralID = atom2[i];
+            centralInvMass = invMass1[i];
+            peripheralInvMass = invMass2[i];
+        }
+        else {
+            centralID = atom2[i];
+            peripheralID = atom1[i];
+            centralInvMass = invMass2[i];
+            peripheralInvMass = invMass1[i];
+        }
+        // Add it to the cluster.
+        if (clusters.find(centralID) == clusters.end()) {
+            clusters[centralID] = ShakeCluster(centralID, centralInvMass);
+        }
+        ShakeCluster& cluster = clusters[centralID];
+        cluster.addAtom(peripheralID, distance[i], peripheralInvMass);
+        if (constraintCount[peripheralID] != 1 || invalidForShake[atom1[i]] || invalidForShake[atom2[i]]) {
+            markShakeClusterInvalid(cluster, clusters, invalidForShake);
+            map<int, ShakeCluster>::iterator otherCluster = clusters.find(peripheralID);
+            if (otherCluster != clusters.end() && otherCluster->second.valid)
+                markShakeClusterInvalid(otherCluster->second, clusters, invalidForShake);
+        }
+    }
+    int validShakeClusters = 0;
+    for (map<int, ShakeCluster>::iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
+        ShakeCluster& cluster = iter->second;
+        if (cluster.valid) {
+            cluster.valid = !invalidForShake[cluster.centralID];
+            for (int i = 0; i < cluster.size; i++)
+                if (invalidForShake[cluster.peripheralID[i]])
+                    cluster.valid = false;
+            if (cluster.valid)
+                ++validShakeClusters;
+        }
+    }
+    // Fill in the Cuda streams.
+    CUDAStream<int4>* psShakeID             = new CUDAStream<int4>(validShakeClusters, 1);
+    gpu->psShakeID                          = psShakeID;
+    gpu->sim.pShakeID                       = psShakeID->_pDevStream[0];
+    CUDAStream<float4>* psShakeParameter    = new CUDAStream<float4>(validShakeClusters, 1);
+    gpu->psShakeParameter                   = psShakeParameter;
+    gpu->sim.pShakeParameter                = psShakeParameter->_pDevStream[0];
+    gpu->sim.ShakeConstraints               = validShakeClusters;
+    int index = 0;
+    for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
+        const ShakeCluster& cluster = iter->second;
+        if (!cluster.valid)
+            continue;
+        psShakeID->_pSysStream[0][index].x = cluster.centralID;
+        psShakeID->_pSysStream[0][index].y = cluster.peripheralID[0];
+        psShakeID->_pSysStream[0][index].z = cluster.size > 1 ? cluster.peripheralID[1] : -1;
+        psShakeID->_pSysStream[0][index].w = cluster.size > 2 ? cluster.peripheralID[2] : -1;
+        psShakeParameter->_pSysStream[0][index].x = cluster.centralInvMass;
+        psShakeParameter->_pSysStream[0][index].y = 0.5f/(cluster.centralInvMass+cluster.peripheralInvMass);
+        psShakeParameter->_pSysStream[0][index].z = cluster.distance*cluster.distance;
+        psShakeParameter->_pSysStream[0][index].w = cluster.peripheralInvMass;
+        isShakeAtom[cluster.centralID] = true;
+        isShakeAtom[cluster.peripheralID[0]] = true;
+        if (cluster.size > 1)
+            isShakeAtom[cluster.peripheralID[1]] = true;
+        if (cluster.size > 2)
+            isShakeAtom[cluster.peripheralID[2]] = true;
+        ++index;
+    }
+    psShakeID->Upload();
+    psShakeParameter->Upload();
+    gpu->sim.shakeTolerance = shakeTolerance;
+    gpu->sim.shake_threads_per_block     = (gpu->sim.ShakeConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
+    if (gpu->sim.shake_threads_per_block > gpu->sim.max_shake_threads_per_block)
+        gpu->sim.shake_threads_per_block = gpu->sim.max_shake_threads_per_block;
+    if (gpu->sim.shake_threads_per_block < 1)
+        gpu->sim.shake_threads_per_block = 1;
    // Find connected constraints for LINCS.
    vector<int> lincsConstraints;
    for (unsigned int i = 0; i < atom1.size(); i++)
-        if (settleConstraints[atom1[i]].size() != 2)
+        if (!isShakeAtom[atom1[i]])
            lincsConstraints.push_back(i);
    vector<vector<int> > atomConstraints(gpu->natoms);
    for (unsigned int i = 0; i < lincsConstraints.size(); i++) {
@@ -606,11 +724,11 @@ void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vecto
    CUDAStream<float>* psLincsSolution = new CUDAStream<float>((int) lincsConstraints.size(), 1);
    gpu->psLincsSolution             = psLincsSolution;
    gpu->sim.pLincsSolution          = psLincsSolution->_pDevData;
-    CUDAStream<unsigned int>* psSyncCounter = new CUDAStream<unsigned int>(100, 1);
+    CUDAStream<unsigned int>* psSyncCounter = new CUDAStream<unsigned int>(2*lincsTerms+2, 1);
    gpu->psSyncCounter                      = psSyncCounter;
    gpu->sim.pSyncCounter                   = psSyncCounter->_pDevData;
    gpu->sim.lincsConstraints = lincsConstraints.size();
-    int index = 0;
+    index = 0;
    for (unsigned int i = 0; i < lincsConstraints.size(); i++) {
        int c = lincsConstraints[i];
        psLincsAtoms->_pSysData[i].x = atom1[c];
@@ -639,92 +757,19 @@ void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vecto
    psLincsAtomConstraints->Upload();
    psLincsAtomConstraintsIndex->Upload();
    psSyncCounter->Upload();
+    gpu->sim.lincsTerms = lincsTerms;
    gpu->sim.lincs_threads_per_block = (gpu->sim.lincsConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
    if (gpu->sim.lincs_threads_per_block > gpu->sim.max_shake_threads_per_block)
        gpu->sim.lincs_threads_per_block = gpu->sim.max_shake_threads_per_block;
    if (gpu->sim.lincs_threads_per_block < 1)
        gpu->sim.lincs_threads_per_block = 1;
-/*
-    // Find clusters consisting of a central atom with up to three peripheral atoms.
-    map<int, ShakeCluster> clusters;
-    for (int i = 0; i < atom1.size(); i++) {
-        if (settleConstraints[atom1[i]].size() == 2)
-            continue; // This is being taken care of with SETTLE.
-        // Determine which is the central atom.
+    gpu->psLincsConnectionsIndex->Download();
+    gpu->psLincsConnections->Download();
-        bool firstIsCentral;
+    gpu->psLincsCoupling->Download();
-        if (constraintCount[atom1[i]] > 1)
-            firstIsCentral = true;
-        else if (constraintCount[atom2[i]] > 1)
-            firstIsCentral = false;
-        else if (atom1[i] < atom2[i])
-            firstIsCentral = true;
-        else
-            firstIsCentral = false;
-        int centralID, peripheralID;
-        float centralInvMass, peripheralInvMass;
-        if (firstIsCentral) {
-            centralID = atom1[i];
-            peripheralID = atom2[i];
-            centralInvMass = invMass1[i];
-            peripheralInvMass = invMass2[i];
-        }
-        else {
-            centralID = atom2[i];
-            peripheralID = atom1[i];
-            centralInvMass = invMass2[i];
-            peripheralInvMass = invMass1[i];
-        }
-        if (constraintCount[peripheralID] != 1)
-            throw OpenMMException("Only bonds to hydrogens may be constrained");
-        // Add it to the cluster.
-        if (clusters.find(centralID) == clusters.end()) {
-            clusters[centralID] = ShakeCluster(centralID, centralInvMass);
-        }
-        clusters[centralID].addAtom(peripheralID, distance[i], peripheralInvMass);
-    }
-    // Fill in the Cuda streams.
-    CUDAStream<int4>* psShakeID             = new CUDAStream<int4>((int) clusters.size(), 1);
-    gpu->psShakeID                          = psShakeID;
-    gpu->sim.pShakeID                       = psShakeID->_pDevStream[0]; 
-    CUDAStream<float4>* psShakeParameter    = new CUDAStream<float4>((int) clusters.size(), 1);
-    gpu->psShakeParameter                   = psShakeParameter;
-    gpu->sim.pShakeParameter                = psShakeParameter->_pDevStream[0];
-    gpu->sim.ShakeConstraints               = clusters.size();
-    int index = 0;
-    for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
-        const ShakeCluster& cluster = iter->second;
-        psShakeID->_pSysStream[0][index].x = cluster.centralID;
-        psShakeID->_pSysStream[0][index].y = cluster.peripheralID[0];
-        psShakeID->_pSysStream[0][index].z = cluster.size > 1 ? cluster.peripheralID[1] : -1;
-        psShakeID->_pSysStream[0][index].w = cluster.size > 2 ? cluster.peripheralID[2] : -1;
-        psShakeParameter->_pSysStream[0][index].x = cluster.centralInvMass;
-        psShakeParameter->_pSysStream[0][index].y = 0.5f/(cluster.centralInvMass+cluster.peripheralInvMass);
-        psShakeParameter->_pSysStream[0][index].z = cluster.distance*cluster.distance;
-        psShakeParameter->_pSysStream[0][index].w = cluster.peripheralInvMass;
-        ++index;
-    }
-    psShakeID->Upload();
-    psShakeParameter->Upload();
-    gpu->sim.shakeTolerance = tolerance;
-*/
-    gpu->sim.ShakeConstraints = 0;
-    gpu->sim.shake_threads_per_block     = (gpu->sim.ShakeConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks; 
-    if (gpu->sim.shake_threads_per_block > gpu->sim.max_shake_threads_per_block)
-        gpu->sim.shake_threads_per_block = gpu->sim.max_shake_threads_per_block;
-    if (gpu->sim.shake_threads_per_block < 1)
-        gpu->sim.shake_threads_per_block = 1;
 #ifdef DeltaShake
@@ -732,7 +777,7 @@ void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vecto
    int count = 0;
    for (int i = 0; i < gpu->natoms; i++)
-//       if (constraintCount[i] == 0)
+       if (!isShakeAtom[i])
          count++;
    // Allocate NonShake parameters
@@ -756,9 +801,9 @@ void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vecto
       count = 0;
       for (int i = 0; i < gpu->natoms; i++){
-//          if (constraintCount[i] == 0){
+          if (!isShakeAtom[i]){
             psNonShakeID->_pSysStream[0][count++] = i;
-//          }
+          }
       }
       psNonShakeID->Upload();

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -183,8 +183,8 @@ extern "C"
 void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale);
 extern "C"
-void gpuSetShakeParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& distance,
+void gpuSetConstraintParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& distance,
-        const std::vector<float>& invMass1, const std::vector<float>& invMass2, float tolerance);
+        const std::vector<float>& invMass1, const std::vector<float>& invMass2, float shakeTolerance, unsigned int lincsTerms);
 extern "C"
 int gpuAllocateInitialBuffers(gpuContext gpu);

--- a/platforms/cuda/src/kernels/kLincs.cu
+++ b/platforms/cuda/src/kernels/kLincs.cu
@@ -73,9 +73,9 @@ __device__ void kSyncAllThreads_kernel(unsigned int* syncCounter)
    } while (counterValue > 0);
 }
-__device__ void kSolveMatrix_kernel(unsigned int numTerms, unsigned int* syncCounter)
+__device__ void kSolveMatrix_kernel(unsigned int* syncCounter)
 {
-    for (unsigned int iteration = 0; iteration < numTerms; iteration++) {
+    for (unsigned int iteration = 0; iteration < cSim.lincsTerms; iteration++) {
        float* rhs1 = (iteration%2 == 0 ? cSim.pLincsRhs1 : cSim.pLincsRhs2);
        float* rhs2 = (iteration%2 == 0 ? cSim.pLincsRhs2 : cSim.pLincsRhs1);
        unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
@@ -121,7 +121,7 @@ __device__ void kUpdateAtomPositions_kernel(float4* atomPositions)
    }
 }
-__global__ void kApplyLincs_kernel(unsigned int numTerms, float4* atomPositions, bool addOldPosition)
+__global__ void kApplyLincs_kernel(float4* atomPositions, bool addOldPosition)
 {
   // Calculate the direction of each constraint, along with the initial RHS and solution vectors.
@@ -174,15 +174,15 @@ __global__ void kApplyLincs_kernel(unsigned int numTerms, float4* atomPositions,
            int otherConstraint = cSim.pLincsConnections[i];
            float4 dir2 = cSim.pLincsDistance[otherConstraint];
            int2 atoms2 = cSim.pLincsAtoms[otherConstraint];
-            float sign = (atoms1.x == atoms2.x || atoms1.y == atoms2.y ? -1.0f : 1.0f);
+            float signedMass = (atoms1.x == atoms2.x || atoms1.y == atoms2.y ? -invMass : cSim.pVelm4[atoms1.y].w);
-            cSim.pLincsCoupling[i] = sign*invMass*s*(dir1.x*dir2.x+dir1.y*dir2.y+dir1.z*dir2.z)*cSim.pLincsS[otherConstraint]; // ***** Is this the correct mass? *****
+            cSim.pLincsCoupling[i] = signedMass*s*(dir1.x*dir2.x+dir1.y*dir2.y+dir1.z*dir2.z)*cSim.pLincsS[otherConstraint];
        }
        pos += blockDim.x * gridDim.x;
    }
    // Solve the matrix equation and update the atom positions.
-    kSolveMatrix_kernel(numTerms, cSim.pSyncCounter+1);
+    kSolveMatrix_kernel(cSim.pSyncCounter+1);
    kUpdateAtomPositions_kernel(atomPositions);
    // Correct for rotational lengthening.
@@ -217,7 +217,7 @@ __global__ void kApplyLincs_kernel(unsigned int numTerms, float4* atomPositions,
    // Solve the matrix equation and update the atom positions.
-    kSolveMatrix_kernel(numTerms, cSim.pSyncCounter+numTerms+1);
+    kSolveMatrix_kernel(cSim.pSyncCounter+cSim.lincsTerms+1);
    kUpdateAtomPositions_kernel(atomPositions);
 }
@@ -226,7 +226,7 @@ void kApplyFirstLincs(gpuContext gpu)
 //    printf("kApplyFirstLincs\n");
    if (gpu->sim.lincsConstraints > 0)
    {
-        kApplyLincs_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(4, gpu->sim.pPosqP, true);
+        kApplyLincs_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(gpu->sim.pPosqP, true);
        LAUNCHERROR("kApplyFirstLincs");
    }
 }
@@ -236,7 +236,7 @@ void kApplySecondLincs(gpuContext gpu)
 //    printf("kApplySecondLincs\n");
    if (gpu->sim.lincsConstraints > 0)
    {
-        kApplyLincs_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(4, gpu->sim.pPosq, false);
+        kApplyLincs_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(gpu->sim.pPosq, false);
        LAUNCHERROR("kApplySecondLincs");
    }
 }
--- a/platforms/cuda/tests/TestCudaBrownianIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaBrownianIntegrator.cpp
@@ -126,7 +126,7 @@ void testTemperature() {
 void testConstraints() {
    const int numParticles = 8;
-    const int numConstraints = numParticles/2;
+    const int numConstraints = 5;
    const double temp = 20.0;
    CudaPlatform platform;
    System system(numParticles, numConstraints);
@@ -137,8 +137,11 @@ void testConstraints() {
        system.setParticleMass(i, 10.0);
        forceField->setParticleParameters(i, (i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
    }
-    for (int i = 0; i < numConstraints; ++i)
+    system.setConstraintParameters(0, 0, 1, 1.0);
-        system.setConstraintParameters(i, 2*i, 2*i+1, 1.0);
+    system.setConstraintParameters(1, 1, 2, 1.0);
+    system.setConstraintParameters(2, 2, 3, 1.0);
+    system.setConstraintParameters(3, 4, 5, 1.0);
+    system.setConstraintParameters(4, 6, 7, 1.0);
    system.addForce(forceField);
    OpenMMContext context(system, integrator, platform);
    vector<Vec3> positions(numParticles);

--- a/platforms/cuda/tests/TestCudaLangevinIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaLangevinIntegrator.cpp
@@ -131,7 +131,7 @@ void testTemperature() {
 void testConstraints() {
    const int numParticles = 8;
-    const int numConstraints = 7;
+    const int numConstraints = 5;
    const double temp = 100.0;
    CudaPlatform platform;
    System system(numParticles, numConstraints);
@@ -142,8 +142,11 @@ void testConstraints() {
        system.setParticleMass(i, 10.0);
        forceField->setParticleParameters(i, (i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
    }
-    for (int i = 0; i < numConstraints; ++i)
+    system.setConstraintParameters(0, 0, 1, 1.0);
-        system.setConstraintParameters(i, i, i+1, 1.0);
+    system.setConstraintParameters(1, 1, 2, 1.0);
+    system.setConstraintParameters(2, 2, 3, 1.0);
+    system.setConstraintParameters(3, 4, 5, 1.0);
+    system.setConstraintParameters(4, 6, 7, 1.0);
    system.addForce(forceField);
    OpenMMContext context(system, integrator, platform);
    vector<Vec3> positions(numParticles);
@@ -241,7 +244,7 @@ int main() {
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;
-//        return 1;
+        return 1;
    }
    cout << "Done" << endl;
    return 0;

--- a/platforms/cuda/tests/TestCudaVerletIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaVerletIntegrator.cpp
@@ -87,7 +87,7 @@ void testSingleBond() {
 void testConstraints() {
    const int numParticles = 8;
-    const int numConstraints = numParticles/2;
+    const int numConstraints = 5;
    const double temp = 100.0;
    CudaPlatform platform;
    System system(numParticles, numConstraints);
@@ -98,8 +98,11 @@ void testConstraints() {
        system.setParticleMass(i, 10.0);
        forceField->setParticleParameters(i, (i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
    }
-    for (int i = 0; i < numConstraints; ++i)
+    system.setConstraintParameters(0, 0, 1, 1.0);
-        system.setConstraintParameters(i, 2*i, 2*i+1, 1.0);
+    system.setConstraintParameters(1, 1, 2, 1.0);
+    system.setConstraintParameters(2, 2, 3, 1.0);
+    system.setConstraintParameters(3, 4, 5, 1.0);
+    system.setConstraintParameters(4, 6, 7, 1.0);
    system.addForce(forceField);
    OpenMMContext context(system, integrator, platform);
    vector<Vec3> positions(numParticles);

--- a/platforms/reference/src/ReferenceKernels.cpp
+++ b/platforms/reference/src/ReferenceKernels.cpp
@@ -517,11 +517,10 @@ void ReferenceIntegrateVerletStepKernel::execute(OpenMMContextImpl& context, con
            delete constraints;
        }
        dynamics = new ReferenceVerletDynamics(context.getSystem().getNumParticles(), static_cast<RealOpenMM>(stepSize) );
-        constraints = new ReferenceShakeAlgorithm(numConstraints, constraintIndices, constraintDistances);
+        constraints = new ReferenceShakeAlgorithm(numConstraints, constraintIndices, constraintDistances, integrator.getConstraintTolerance());
        dynamics->setReferenceConstraintAlgorithm(constraints);
        prevStepSize = stepSize;
    }
-//    shake->setTolerance(integrator.getConstraintTolerance());
    dynamics->update(context.getSystem().getNumParticles(), posData, velData, forceData, masses);
 }
@@ -577,14 +576,12 @@ void ReferenceIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, c
 				static_cast<RealOpenMM>(stepSize), 
 				static_cast<RealOpenMM>(tau), 
 				static_cast<RealOpenMM>(temperature) );
-        constraints = new ReferenceLincsAlgorithm(numConstraints, constraintIndices, constraintDistances);
+        constraints = new ReferenceShakeAlgorithm(numConstraints, constraintIndices, constraintDistances, integrator.getConstraintTolerance());
-//        shake = new ReferenceShakeAlgorithm(numConstraints, constraintIndices, shakeParameters);
        dynamics->setReferenceConstraintAlgorithm(constraints);
        prevTemp = temperature;
        prevFriction = friction;
        prevStepSize = stepSize;
    }
-//    shake->setTolerance(integrator.getConstraintTolerance());
    dynamics->update(context.getSystem().getNumParticles(), posData, velData, forceData, masses);
 }
@@ -639,13 +636,12 @@ void ReferenceIntegrateBrownianStepKernel::execute(OpenMMContextImpl& context, c
 				static_cast<RealOpenMM>(stepSize), 
 				static_cast<RealOpenMM>(friction), 
 				static_cast<RealOpenMM>(temperature) );
-        constraints = new ReferenceShakeAlgorithm(numConstraints, constraintIndices, constraintDistances);
+        constraints = new ReferenceShakeAlgorithm(numConstraints, constraintIndices, constraintDistances, integrator.getConstraintTolerance());
        dynamics->setReferenceConstraintAlgorithm(constraints);
        prevTemp = temperature;
        prevFriction = friction;
        prevStepSize = stepSize;
    }
-//    shake->setTolerance(integrator.getConstraintTolerance());
    dynamics->update(context.getSystem().getNumParticles(), posData, velData, forceData, masses);
 }

--- a/platforms/reference/src/SimTKReference/ReferenceLincsAlgorithm.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceLincsAlgorithm.cpp
@@ -57,7 +57,7 @@ ReferenceLincsAlgorithm::ReferenceLincsAlgorithm( int numberOfConstraints,
   _atomIndices                = atomIndices;
   _distance                   = distance;
-   _numTerms                   = 8;
+   _numTerms                   = 4;
   _hasInitialized             = false;
 }

--- a/platforms/reference/src/SimTKReference/ReferenceShakeAlgorithm.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceShakeAlgorithm.cpp
-/* Portions copyright (c) 2006 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2009 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -38,12 +38,14 @@
   @param numberOfConstraints      number of constraints
   @param atomIndices              block of atom indices
   @param shakeParameters          Shake parameters
+   @param tolerance                constraint tolerance
   --------------------------------------------------------------------------------------- */
 ReferenceShakeAlgorithm::ReferenceShakeAlgorithm( int numberOfConstraints,
                                                  int** atomIndices,
-                                                  RealOpenMM* distance ){
+                                                  RealOpenMM* distance,
+                                                  RealOpenMM tolerance){
   // ---------------------------------------------------------------------------------------
@@ -64,7 +66,7 @@ ReferenceShakeAlgorithm::ReferenceShakeAlgorithm( int numberOfConstraints,
   _distance                   = distance;
   _maximumNumberOfIterations  = 15;
-   _tolerance                  = (RealOpenMM) 1.0e-04;
+   _tolerance                  = tolerance;
   _hasInitializedMasses       = false;
   // work arrays

--- a/platforms/reference/src/SimTKReference/ReferenceShakeAlgorithm.h
+++ b/platforms/reference/src/SimTKReference/ReferenceShakeAlgorithm.h
-/* Portions copyright (c) 2006 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2009 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -55,10 +55,11 @@ class ReferenceShakeAlgorithm : public ReferenceConstraintAlgorithm {
         @param numberOfConstraints      number of constraints 
         @param atomIndices              atom indices for contraints
         @param distance                 distances for constraints
+         @param tolerance                constraint tolerance
         --------------------------------------------------------------------------------------- */
-      ReferenceShakeAlgorithm( int numberOfConstraints, int** atomIndices, RealOpenMM* distance );
+      ReferenceShakeAlgorithm( int numberOfConstraints, int** atomIndices, RealOpenMM* distance, RealOpenMM tolerance );
      /**---------------------------------------------------------------------------------------