CCMA with a small number of constraints uses a single kernel (#2818)

* CCMA with a small number of constraints uses a single kernel * Fixed compilation errors in kernel * Fixed compilation errors in kernel * Further optimizations to CCMA with few constraints

CCMA with a small number of constraints uses a single kernel (#2818)
* CCMA with a small number of constraints uses a single kernel * Fixed compilation errors in kernel * Fixed compilation errors in kernel * Further optimizations to CCMA with few constraints
b3d98469 · peastman · GitHub · 6d20ff07 · b3d98469 · b3d98469
Unverified Commit b3d98469 authored Sep 03, 2020 by peastman Committed by GitHub Sep 03, 2020
6 changed files
--- a/platforms/common/include/openmm/common/IntegrationUtilities.h
+++ b/platforms/common/include/openmm/common/IntegrationUtilities.h
@@ -136,7 +136,7 @@ protected:
    ComputeKernel settlePosKernel, settleVelKernel;
    ComputeKernel shakePosKernel, shakeVelKernel;
    ComputeKernel ccmaDirectionsKernel, ccmaPosForceKernel, ccmaVelForceKernel;
-    ComputeKernel ccmaMultiplyKernel, ccmaUpdateKernel;
+    ComputeKernel ccmaMultiplyKernel, ccmaUpdateKernel, ccmaFullKernel;
    ComputeKernel vsitePositionKernel, vsiteForceKernel, vsiteSaveForcesKernel;
    ComputeKernel randomKernel, timeShiftKernel;
    ComputeArray posDelta;
@@ -148,6 +148,7 @@ protected:
    ComputeArray randomSeed;
    ComputeArray stepSize;
    ComputeArray ccmaAtoms;
+    ComputeArray ccmaConstraintAtoms;
    ComputeArray ccmaDistance;
    ComputeArray ccmaReducedMass;
    ComputeArray ccmaAtomConstraints;

--- a/platforms/common/src/IntegrationUtilities.cpp
+++ b/platforms/common/src/IntegrationUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -36,6 +36,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <map>
+#include <set>
 using namespace OpenMM;
 using namespace std;
@@ -292,6 +293,7 @@ IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System
    // Record the connections between constraints.
    int numCCMA = (int) ccmaConstraints.size();
+    int numCCMAAtoms = 0;
    if (numCCMA > 0) {
        // Record information needed by ReferenceCCMAAlgorithm.
@@ -354,14 +356,26 @@ IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System
            for (int j = 0; j < (int)matrix[i].size(); ++j)
                matrix[i][j].first = inverseOrder[matrix[i][j].first];
+        // Make a list of all atoms that involve a CCMA constraint.
+        set<int> ccmaAtomsSet;
+        for (int i = 0; i < numCCMA; i++) {
+            ccmaAtomsSet.insert(atom1[ccmaConstraints[i]]);
+            ccmaAtomsSet.insert(atom2[ccmaConstraints[i]]);
+        }
+        vector<int> ccmaAtomsVec(ccmaAtomsSet.begin(), ccmaAtomsSet.end());
+        sort(ccmaAtomsVec.begin(), ccmaAtomsVec.end());
+        numCCMAAtoms = ccmaAtomsVec.size();
        // Record the CCMA data structures.
-        ccmaAtoms.initialize<mm_int2>(context, numCCMA, "CcmaAtoms");
+        ccmaAtoms.initialize<int>(context, numCCMAAtoms, "ccmaAtoms");
+        ccmaConstraintAtoms.initialize<mm_int2>(context, numCCMA, "ccmaConstraintAtoms");
        ccmaAtomConstraints.initialize<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
        ccmaNumAtomConstraints.initialize<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
        ccmaConstraintMatrixColumn.initialize<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
        ccmaConverged.initialize<int>(context, 2, "ccmaConverged");
-        vector<mm_int2> atomsVec(ccmaAtoms.getSize());
+        vector<mm_int2> atomsVec(ccmaConstraintAtoms.getSize());
        vector<int> atomConstraintsVec(ccmaAtomConstraints.getSize());
        vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints.getSize());
        vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn.getSize());
@@ -397,7 +411,8 @@ IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System
                atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
            }
        }
-        ccmaAtoms.upload(atomsVec);
+        ccmaAtoms.upload(ccmaAtomsVec);
+        ccmaConstraintAtoms.upload(atomsVec);
        ccmaAtomConstraints.upload(atomConstraintsVec);
        ccmaNumAtomConstraints.upload(numAtomConstraintsVec);
        ccmaConstraintMatrixColumn.upload(constraintMatrixColumnVec);
@@ -518,6 +533,7 @@ IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System
    // Create the kernels used by this class.
    map<string, string> defines;
+    defines["NUM_CCMA_ATOMS"] = context.intToString(numCCMAAtoms);
    defines["NUM_CCMA_CONSTRAINTS"] = context.intToString(numCCMA);
    defines["NUM_ATOMS"] = context.intToString(numAtoms);
    defines["NUM_2_AVERAGE"] = context.intToString(num2Avg);
@@ -532,11 +548,12 @@ IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System
    settleVelKernel = program->createKernel("applySettleToVelocities");
    shakePosKernel = program->createKernel("applyShakeToPositions");
    shakeVelKernel = program->createKernel("applyShakeToVelocities");
-    ccmaDirectionsKernel = program->createKernel("computeCCMAConstraintDirections");
+    ccmaDirectionsKernel = program->createKernel("computeCCMAConstraintDirectionsKernel");
-    ccmaPosForceKernel = program->createKernel("computeCCMAPositionConstraintForce");
+    ccmaPosForceKernel = program->createKernel("computeCCMAPositionConstraintForceKernel");
-    ccmaVelForceKernel = program->createKernel("computeCCMAVelocityConstraintForce");
+    ccmaVelForceKernel = program->createKernel("computeCCMAVelocityConstraintForceKernel");
-    ccmaMultiplyKernel = program->createKernel("multiplyByCCMAConstraintMatrix");
+    ccmaMultiplyKernel = program->createKernel("multiplyByCCMAConstraintMatrixKernel");
-    ccmaUpdateKernel = program->createKernel("updateCCMAAtomPositions");
+    ccmaUpdateKernel = program->createKernel("updateCCMAAtomPositionsKernel");
+    ccmaFullKernel = program->createKernel("runCCMA");
    vsitePositionKernel = program->createKernel("computeVirtualSites");
    vsiteForceKernel = program->createKernel("distributeVirtualSiteForces");
    vsiteSaveForcesKernel = program->createKernel("saveDistributedForces");
@@ -621,14 +638,14 @@ IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System
        if (context.getUseMixedPrecision())
            shakeVelKernel->addArg(context.getPosqCorrection());
    }
-    if (ccmaAtoms.isInitialized()) {
+    if (ccmaConstraintAtoms.isInitialized()) {
-        ccmaDirectionsKernel->addArg(ccmaAtoms);
+        ccmaDirectionsKernel->addArg(ccmaConstraintAtoms);
        ccmaDirectionsKernel->addArg(ccmaDistance);
        ccmaDirectionsKernel->addArg(context.getPosq());
        ccmaDirectionsKernel->addArg(ccmaConverged);
        if (context.getUseMixedPrecision())
            ccmaDirectionsKernel->addArg(context.getPosqCorrection());
-        ccmaPosForceKernel->addArg(ccmaAtoms);
+        ccmaPosForceKernel->addArg(ccmaConstraintAtoms);
        ccmaPosForceKernel->addArg(ccmaDistance);
        ccmaPosForceKernel->addArg(posDelta);
        ccmaPosForceKernel->addArg(ccmaReducedMass);
@@ -637,7 +654,7 @@ IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System
        ccmaPosForceKernel->addArg();
        ccmaPosForceKernel->addArg();
        ccmaPosForceKernel->addArg();
-        ccmaVelForceKernel->addArg(ccmaAtoms);
+        ccmaVelForceKernel->addArg(ccmaConstraintAtoms);
        ccmaVelForceKernel->addArg(ccmaDistance);
        ccmaVelForceKernel->addArg(context.getVelm());
        ccmaVelForceKernel->addArg(ccmaReducedMass);
@@ -652,6 +669,7 @@ IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System
        ccmaMultiplyKernel->addArg(ccmaConstraintMatrixValue);
        ccmaMultiplyKernel->addArg(ccmaConverged);
        ccmaMultiplyKernel->addArg();
+        ccmaUpdateKernel->addArg(ccmaAtoms);
        ccmaUpdateKernel->addArg(ccmaNumAtomConstraints);
        ccmaUpdateKernel->addArg(ccmaAtomConstraints);
        ccmaUpdateKernel->addArg(ccmaDistance);
@@ -661,6 +679,23 @@ IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System
        ccmaUpdateKernel->addArg(ccmaDelta2);
        ccmaUpdateKernel->addArg(ccmaConverged);
        ccmaUpdateKernel->addArg();
+        ccmaFullKernel->addArg();
+        ccmaFullKernel->addArg(ccmaAtoms);
+        ccmaFullKernel->addArg(ccmaNumAtomConstraints);
+        ccmaFullKernel->addArg(ccmaAtomConstraints);
+        ccmaFullKernel->addArg(ccmaConstraintAtoms);
+        ccmaFullKernel->addArg(ccmaDistance);
+        ccmaFullKernel->addArg(context.getPosq());
+        ccmaFullKernel->addArg(context.getVelm());
+        ccmaFullKernel->addArg(posDelta);
+        ccmaFullKernel->addArg(ccmaReducedMass);
+        ccmaFullKernel->addArg(ccmaDelta1);
+        ccmaFullKernel->addArg(ccmaDelta2);
+        ccmaFullKernel->addArg(ccmaConstraintMatrixColumn);
+        ccmaFullKernel->addArg(ccmaConstraintMatrixValue);
+        ccmaFullKernel->addArg();
+        if (context.getUseMixedPrecision())
+            ccmaFullKernel->addArg(context.getPosqCorrection());
    }
    // Arguments for time shift kernel will be set later.

--- a/platforms/common/src/kernels/integrationUtilities.cc
+++ b/platforms/common/src/kernels/integrationUtilities.cc
@@ -556,8 +556,8 @@ KERNEL void applySettleToVelocities(int numClusters, mixed tol, GLOBAL const rea
 /**
 * Compute the direction each CCMA constraint is pointing in.  This is called once at the beginning of constraint evaluation.
 */
-KERNEL void computeCCMAConstraintDirections(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL mixed4* RESTRICT constraintDistance,
+DEVICE void computeCCMAConstraintDirections(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL mixed4* RESTRICT constraintDistance,
-        GLOBAL const real4* RESTRICT atomPositions, GLOBAL int* RESTRICT converged
+        GLOBAL const real4* RESTRICT atomPositions
 #ifdef USE_MIXED_PRECISION
        , GLOBAL const real4* RESTRICT posqCorrection
 #endif
@@ -577,6 +577,19 @@ KERNEL void computeCCMAConstraintDirections(GLOBAL const int2* RESTRICT constrai
        dir.z = oldPos1.z-oldPos2.z;
        constraintDistance[index] = dir;
    }
+}
+KERNEL void computeCCMAConstraintDirectionsKernel(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL mixed4* RESTRICT constraintDistance,
+        GLOBAL const real4* RESTRICT atomPositions, GLOBAL int* RESTRICT converged
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL const real4* RESTRICT posqCorrection
+#endif
+        ) {
+#ifdef USE_MIXED_PRECISION
+    computeCCMAConstraintDirections(constraintAtoms, constraintDistance, atomPositions, posqCorrection);
+#else
+    computeCCMAConstraintDirections(constraintAtoms, constraintDistance, atomPositions);
+#endif
    if (GLOBAL_ID == 0) {
        converged[0] = 1;
        converged[1] = 0;
@@ -586,19 +599,11 @@ KERNEL void computeCCMAConstraintDirections(GLOBAL const int2* RESTRICT constrai
 /**
 * Compute the force applied by each CCMA position constraint.
 */
-KERNEL void computeCCMAPositionConstraintForce(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL const mixed4* RESTRICT constraintDistance,
+DEVICE void computeCCMAPositionConstraintForce(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL const mixed4* RESTRICT constraintDistance,
        GLOBAL const mixed4* RESTRICT atomPositions, GLOBAL const mixed* RESTRICT reducedMass, GLOBAL mixed* RESTRICT delta1,
-        GLOBAL int* RESTRICT converged, GLOBAL int* RESTRICT hostConvergedFlag, mixed tol, int iteration) {
+        mixed tol, int iteration, LOCAL_ARG int* groupConverged) {
-    LOCAL int groupConverged;
-    if (converged[1-iteration%2]) {
-        if (GLOBAL_ID == 0) {
-            converged[iteration%2] = 1;
-            hostConvergedFlag[0] = 1;
-        }
-        return; // The constraint iteration has already converged.
-    }
    if (LOCAL_ID == 0)
-        groupConverged = 1;
+        *groupConverged = 1;
    SYNC_THREADS;
    mixed lowerTol = 1-2*tol+tol*tol;
    mixed upperTol = 1+2*tol+tol*tol;
@@ -620,30 +625,38 @@ KERNEL void computeCCMAPositionConstraintForce(GLOBAL const int2* RESTRICT const
        delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f);
        threadConverged &= (rp2 > lowerTol*dist2 && rp2 < upperTol*dist2);
    }
-    if (groupConverged && !threadConverged)
+    if (*groupConverged && !threadConverged)
-        groupConverged = 0;
+        *groupConverged = 0;
-    SYNC_THREADS;
-    if (LOCAL_ID == 0 && !groupConverged)
-        converged[iteration%2] = 0;
 }
-/**
+KERNEL void computeCCMAPositionConstraintForceKernel(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL const mixed4* RESTRICT constraintDistance,
- * Compute the force applied by each CCMA velocity constraint.
- */
-KERNEL void computeCCMAVelocityConstraintForce(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL const mixed4* RESTRICT constraintDistance,
        GLOBAL const mixed4* RESTRICT atomPositions, GLOBAL const mixed* RESTRICT reducedMass, GLOBAL mixed* RESTRICT delta1,
        GLOBAL int* RESTRICT converged, GLOBAL int* RESTRICT hostConvergedFlag, mixed tol, int iteration) {
    LOCAL int groupConverged;
    if (converged[1-iteration%2]) {
-        if (GROUP_ID == 0 && LOCAL_ID == 0) {
+        if (GLOBAL_ID == 0) {
            converged[iteration%2] = 1;
            hostConvergedFlag[0] = 1;
        }
        return; // The constraint iteration has already converged.
    }
+    computeCCMAPositionConstraintForce(constraintAtoms, constraintDistance, atomPositions, reducedMass,
+            delta1, tol, iteration, &groupConverged);
+    SYNC_THREADS;
+    if (LOCAL_ID == 0 && !groupConverged)
+        converged[iteration%2] = 0;
+}
+/**
+ * Compute the force applied by each CCMA velocity constraint.
+ */
+DEVICE void computeCCMAVelocityConstraintForce(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL const mixed4* RESTRICT constraintDistance,
+        GLOBAL const mixed4* RESTRICT atomPositions, GLOBAL const mixed* RESTRICT reducedMass, GLOBAL mixed* RESTRICT delta1,
+        mixed tol, int iteration, LOCAL_ARG int* groupConverged) {
    if (LOCAL_ID == 0)
-        groupConverged = 1;
+        *groupConverged = 1;
    SYNC_THREADS;
+    bool threadConverged = true;
    for (int index = GLOBAL_ID; index < NUM_CCMA_CONSTRAINTS; index += GLOBAL_SIZE) {
        // Compute the force due to this constraint.
@@ -653,24 +666,34 @@ KERNEL void computeCCMAVelocityConstraintForce(GLOBAL const int2* RESTRICT const
        mixed rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
        mixed d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
        delta1[index] = -2*reducedMass[index]*rrpr/d_ij2;
+        threadConverged &= (fabs(delta1[index]) <= tol);
+    }
+    if (*groupConverged && !threadConverged)
+        *groupConverged = 0;
+}
-        // See whether it has converged.
+KERNEL void computeCCMAVelocityConstraintForceKernel(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL const mixed4* RESTRICT constraintDistance,
+        GLOBAL const mixed4* RESTRICT atomPositions, GLOBAL const mixed* RESTRICT reducedMass, GLOBAL mixed* RESTRICT delta1,
-        if (groupConverged && fabs(delta1[index]) > tol) {
+        GLOBAL int* RESTRICT converged, GLOBAL int* RESTRICT hostConvergedFlag, mixed tol, int iteration) {
-            groupConverged = 0;
+    LOCAL int groupConverged;
-            converged[iteration%2] = 0;
+    if (converged[1-iteration%2]) {
+        if (GROUP_ID == 0 && LOCAL_ID == 0) {
+            converged[iteration%2] = 1;
+            hostConvergedFlag[0] = 1;
        }
+        return; // The constraint iteration has already converged.
    }
+    computeCCMAVelocityConstraintForce(constraintAtoms, constraintDistance, atomPositions, reducedMass,
+            delta1, tol, iteration, &groupConverged);
+    if (LOCAL_ID == 0 && !groupConverged)
+        converged[iteration%2] = 0;
 }
 /**
 * Multiply the vector of CCMA constraint forces by the constraint matrix.
 */
-KERNEL void multiplyByCCMAConstraintMatrix(GLOBAL const mixed* RESTRICT delta1, GLOBAL mixed* RESTRICT delta2, GLOBAL const int* RESTRICT constraintMatrixColumn,
+DEVICE void multiplyByCCMAConstraintMatrix(GLOBAL const mixed* RESTRICT delta1, GLOBAL mixed* RESTRICT delta2, GLOBAL const int* RESTRICT constraintMatrixColumn,
-        GLOBAL const mixed* RESTRICT constraintMatrixValue, GLOBAL const int* RESTRICT converged, int iteration) {
+        GLOBAL const mixed* RESTRICT constraintMatrixValue, int iteration) {
-    if (converged[iteration%2])
-        return; // The constraint iteration has already converged.
    // Multiply by the inverse constraint matrix.
    for (int index = GLOBAL_ID; index < NUM_CCMA_CONSTRAINTS; index += GLOBAL_SIZE) {
@@ -686,20 +709,24 @@ KERNEL void multiplyByCCMAConstraintMatrix(GLOBAL const mixed* RESTRICT delta1,
    }
 }
+KERNEL void multiplyByCCMAConstraintMatrixKernel(GLOBAL const mixed* RESTRICT delta1, GLOBAL mixed* RESTRICT delta2, GLOBAL const int* RESTRICT constraintMatrixColumn,
+        GLOBAL const mixed* RESTRICT constraintMatrixValue, GLOBAL const int* RESTRICT converged, int iteration) {
+    if (converged[iteration%2])
+        return; // The constraint iteration has already converged.
+    multiplyByCCMAConstraintMatrix(delta1, delta2, constraintMatrixColumn, constraintMatrixValue, iteration);
+}
 /**
 * Update the atom positions based on CCMA constraint forces.
 */
-KERNEL void updateCCMAAtomPositions(GLOBAL const int* RESTRICT numAtomConstraints, GLOBAL const int* RESTRICT atomConstraints,
+DEVICE void updateCCMAAtomPositions(GLOBAL const int* RESTRICT atoms, GLOBAL const int* RESTRICT numAtomConstraints, GLOBAL const int* RESTRICT atomConstraints,
        GLOBAL const mixed4* RESTRICT constraintDistance, GLOBAL mixed4* RESTRICT atomPositions, GLOBAL const mixed4* RESTRICT velm,
-        GLOBAL const mixed* RESTRICT delta1, GLOBAL const mixed* RESTRICT delta2, GLOBAL int* RESTRICT converged, int iteration) {
+        GLOBAL const mixed* RESTRICT delta1, GLOBAL const mixed* RESTRICT delta2, int iteration) {
-    if (GROUP_ID == 0 && LOCAL_ID == 0)
-        converged[1-iteration%2] = 1;
-    if (converged[iteration%2])
-        return; // The constraint iteration has already converged.
    mixed damping = (iteration < 2 ? 0.5f : 1.0f);
-    for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
+    for (int i = GLOBAL_ID; i < NUM_CCMA_ATOMS; i += GLOBAL_SIZE) {
        // Compute the new position of this atom.
+        int index = atoms[i];
        mixed4 atomPos = atomPositions[index];
        mixed invMass = velm[index].w;
        int num = numAtomConstraints[index];
@@ -718,6 +745,60 @@ KERNEL void updateCCMAAtomPositions(GLOBAL const int* RESTRICT numAtomConstraint
    }
 }
+KERNEL void updateCCMAAtomPositionsKernel(GLOBAL const int* RESTRICT atoms, GLOBAL const int* RESTRICT numAtomConstraints, GLOBAL const int* RESTRICT atomConstraints,
+        GLOBAL const mixed4* RESTRICT constraintDistance, GLOBAL mixed4* RESTRICT atomPositions, GLOBAL const mixed4* RESTRICT velm,
+        GLOBAL const mixed* RESTRICT delta1, GLOBAL const mixed* RESTRICT delta2, GLOBAL int* RESTRICT converged, int iteration) {
+    if (GROUP_ID == 0 && LOCAL_ID == 0)
+        converged[1-iteration%2] = 1;
+    if (converged[iteration%2])
+        return; // The constraint iteration has already converged.
+    updateCCMAAtomPositions(atoms, numAtomConstraints, atomConstraints, constraintDistance, atomPositions, velm,
+            delta1, delta2, iteration);
+}
+/**
+ * Run the entire CCMA iteration within a single kernel.  This has far less overhead than
+ * using multiple kernels, but requires the calculation to use only a single workgroup.
+ * That makes it faster for small numbers of constraints, but slower for large numbers.
+ */
+KERNEL void runCCMA(int constrainVelocities, GLOBAL const int* RESTRICT atoms, GLOBAL const int* RESTRICT numAtomConstraints, GLOBAL const int* RESTRICT atomConstraints,
+        GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL mixed4* RESTRICT constraintDistance, GLOBAL const real4* RESTRICT atomPositions,
+        GLOBAL mixed4* RESTRICT velm, GLOBAL mixed4* RESTRICT posDelta, GLOBAL const mixed* RESTRICT reducedMass,
+        GLOBAL mixed* RESTRICT delta1, GLOBAL mixed* RESTRICT delta2, GLOBAL const int* RESTRICT constraintMatrixColumn,
+        GLOBAL const mixed* RESTRICT constraintMatrixValue, mixed tol
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL const real4* RESTRICT posqCorrection
+#endif
+        ) {
+    LOCAL int groupConverged;
+#ifdef USE_MIXED_PRECISION
+    computeCCMAConstraintDirections(constraintAtoms, constraintDistance, atomPositions, posqCorrection);
+#else
+    computeCCMAConstraintDirections(constraintAtoms, constraintDistance, atomPositions);
+#endif
+    for (int iteration = 0; iteration < 150; iteration++) {
+        SYNC_THREADS
+        if (constrainVelocities)
+            computeCCMAVelocityConstraintForce(constraintAtoms, constraintDistance, velm, reducedMass,
+                    delta1, tol, iteration, &groupConverged);
+        else
+            computeCCMAPositionConstraintForce(constraintAtoms, constraintDistance, posDelta, reducedMass,
+                    delta1, tol, iteration, &groupConverged);
+        SYNC_THREADS
+        multiplyByCCMAConstraintMatrix(delta1, delta2, constraintMatrixColumn, constraintMatrixValue, iteration);
+        SYNC_THREADS
+        if (constrainVelocities)
+            updateCCMAAtomPositions(atoms, numAtomConstraints, atomConstraints, constraintDistance, velm, velm,
+                    delta1, delta2, iteration);
+        else
+            updateCCMAAtomPositions(atoms, numAtomConstraints, atomConstraints, constraintDistance, posDelta, velm,
+                    delta1, delta2, iteration);
+        SYNC_THREADS
+        if (groupConverged)
+            return;
+    }
+}
 /**
 * Compute the positions of virtual sites
 */

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -91,29 +91,40 @@ void CudaIntegrationUtilities::applyConstraintsImpl(bool constrainVelocities, do
            shakeKernel->setArg(1, (float) tol);
        shakeKernel->execute(shakeAtoms.getSize());
    }
-    if (ccmaAtoms.isInitialized()) {
+    if (ccmaConstraintAtoms.isInitialized()) {
-        ccmaForceKernel->setArg(6, ccmaConvergedDeviceMemory);
+        if (ccmaConstraintAtoms.getSize() <= 1024) {
-        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+            // Use the version of CCMA that runs in a single kernel with one workgroup.
-            ccmaForceKernel->setArg(7, tol);
+            ccmaFullKernel->setArg(0, (int) constrainVelocities);
-        else
+            if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
-            ccmaForceKernel->setArg(7, (float) tol);
+                ccmaFullKernel->setArg(14, tol);
-        ccmaDirectionsKernel->execute(ccmaAtoms.getSize());
+            else
-        const int checkInterval = 4;
+                ccmaFullKernel->setArg(14, (float) tol);
-        ccmaConvergedMemory[0] = 0;
+            ccmaFullKernel->execute(128, 128);
-        ccmaUpdateKernel->setArg(3, constrainVelocities ? context.getVelm() : posDelta);
+        }
-        for (int i = 0; i < 150; i++) {
+        else {
-            ccmaForceKernel->setArg(8, i);
+            ccmaForceKernel->setArg(6, ccmaConvergedDeviceMemory);
-            ccmaForceKernel->execute(ccmaAtoms.getSize());
+            if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
-            if ((i+1)%checkInterval == 0)
+                ccmaForceKernel->setArg(7, tol);
-                CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
+            else
-            ccmaMultiplyKernel->setArg(5, i);
+                ccmaForceKernel->setArg(7, (float) tol);
-            ccmaMultiplyKernel->execute(ccmaAtoms.getSize());
+            ccmaDirectionsKernel->execute(ccmaConstraintAtoms.getSize());
-            ccmaUpdateKernel->setArg(8, i);
+            const int checkInterval = 4;
-            ccmaUpdateKernel->execute(context.getNumAtoms());
+            ccmaConvergedMemory[0] = 0;
-            if ((i+1)%checkInterval == 0) {
+            ccmaUpdateKernel->setArg(4, constrainVelocities ? context.getVelm() : posDelta);
-                CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
+            for (int i = 0; i < 150; i++) {
-                if (ccmaConvergedMemory[0])
+                ccmaForceKernel->setArg(8, i);
-                    break;
+                ccmaForceKernel->execute(ccmaConstraintAtoms.getSize());
+                if ((i+1)%checkInterval == 0)
+                    CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
+                ccmaMultiplyKernel->setArg(5, i);
+                ccmaMultiplyKernel->execute(ccmaConstraintAtoms.getSize());
+                ccmaUpdateKernel->setArg(9, i);
+                ccmaUpdateKernel->execute(context.getNumAtoms());
+                if ((i+1)%checkInterval == 0) {
+                    CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
+                    if (ccmaConvergedMemory[0])
+                        break;
+                }
            }
        }
    }

--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -76,44 +76,56 @@ void OpenCLIntegrationUtilities::applyConstraintsImpl(bool constrainVelocities,
            shakeKernel->setArg(1, (float) tol);
        shakeKernel->execute(shakeAtoms.getSize());
    }
-    if (ccmaAtoms.isInitialized()) {
+    if (ccmaConstraintAtoms.isInitialized()) {
-        ccmaForceKernel->setArg(6, ccmaConvergedHostBuffer);
+        if (ccmaConstraintAtoms.getSize() <= 1024) {
-        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+            // Use the version of CCMA that runs in a single kernel with one workgroup.
-            ccmaForceKernel->setArg(7, tol);
+            ccmaFullKernel->setArg(0, (int) constrainVelocities);
-        else
+            if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
-            ccmaForceKernel->setArg(7, (float) tol);
+                ccmaFullKernel->setArg(14, tol);
-        ccmaDirectionsKernel->execute(ccmaAtoms.getSize());
+            else
-        const int checkInterval = 4;
+                ccmaFullKernel->setArg(14, (float) tol);
-        OpenCLContext& cl = dynamic_cast<OpenCLContext&>(context);
+            ccmaFullKernel->execute(128, 128);
-        cl::CommandQueue queue = cl.getQueue();
+        }
-        int* converged = (int*) context.getPinnedBuffer();
+        else {
-        int* ccmaConvergedHostMemory = (int*) queue.enqueueMapBuffer(ccmaConvergedHostBuffer.getDeviceBuffer(), CL_TRUE, CL_MAP_WRITE, 0, sizeof(cl_int));
+            // Use the version of CCMA that uses multiple kernels.
-        ccmaConvergedHostMemory[0] = 0;
+            ccmaForceKernel->setArg(6, ccmaConvergedHostBuffer);
-        queue.enqueueUnmapMemObject(ccmaConvergedHostBuffer.getDeviceBuffer(), ccmaConvergedHostMemory);
+            if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
-        ccmaUpdateKernel->setArg(3, constrainVelocities ? context.getVelm() : posDelta);
+                ccmaForceKernel->setArg(7, tol);
-        for (int i = 0; i < 150; i++) {
+            else
-            ccmaForceKernel->setArg(8, i);
+                ccmaForceKernel->setArg(7, (float) tol);
-            ccmaForceKernel->execute(ccmaAtoms.getSize());
+            ccmaDirectionsKernel->execute(ccmaConstraintAtoms.getSize());
-            cl::Event event;
+            const int checkInterval = 4;
-            if ((i+1)%checkInterval == 0 && !ccmaUseDirectBuffer)
+            OpenCLContext& cl = dynamic_cast<OpenCLContext&>(context);
-                queue.enqueueReadBuffer(cl.unwrap(ccmaConverged).getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(int), converged, NULL, &event);
+            cl::CommandQueue queue = cl.getQueue();
-            ccmaMultiplyKernel->setArg(5, i);
+            int* converged = (int*) context.getPinnedBuffer();
-            ccmaMultiplyKernel->execute(ccmaAtoms.getSize());
+            int* ccmaConvergedHostMemory = (int*) queue.enqueueMapBuffer(ccmaConvergedHostBuffer.getDeviceBuffer(), CL_TRUE, CL_MAP_WRITE, 0, sizeof(cl_int));
-            ccmaUpdateKernel->setArg(8, i);
+            ccmaConvergedHostMemory[0] = 0;
-            ccmaUpdateKernel->execute(context.getNumAtoms());
+            queue.enqueueUnmapMemObject(ccmaConvergedHostBuffer.getDeviceBuffer(), ccmaConvergedHostMemory);
-            if ((i+1)%checkInterval == 0) {
+            ccmaUpdateKernel->setArg(4, constrainVelocities ? context.getVelm() : posDelta);
-                if (ccmaUseDirectBuffer) {
+            for (int i = 0; i < 150; i++) {
-                    ccmaConvergedHostMemory = (int*) queue.enqueueMapBuffer(ccmaConvergedHostBuffer.getDeviceBuffer(), CL_FALSE, CL_MAP_READ, 0, sizeof(cl_int), NULL, &event);
+                ccmaForceKernel->setArg(8, i);
-                    queue.flush();
+                ccmaForceKernel->execute(ccmaConstraintAtoms.getSize());
-                    while (event.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>() != CL_COMPLETE)
+                cl::Event event;
-                        ;
+                if ((i+1)%checkInterval == 0 && !ccmaUseDirectBuffer)
-                    converged[i%2] = ccmaConvergedHostMemory[0];
+                    queue.enqueueReadBuffer(cl.unwrap(ccmaConverged).getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(int), converged, NULL, &event);
-                    queue.enqueueUnmapMemObject(ccmaConvergedHostBuffer.getDeviceBuffer(), ccmaConvergedHostMemory);
+                ccmaMultiplyKernel->setArg(5, i);
+                ccmaMultiplyKernel->execute(ccmaConstraintAtoms.getSize());
+                ccmaUpdateKernel->setArg(9, i);
+                ccmaUpdateKernel->execute(context.getNumAtoms());
+                if ((i+1)%checkInterval == 0) {
+                    if (ccmaUseDirectBuffer) {
+                        ccmaConvergedHostMemory = (int*) queue.enqueueMapBuffer(ccmaConvergedHostBuffer.getDeviceBuffer(), CL_FALSE, CL_MAP_READ, 0, sizeof(cl_int), NULL, &event);
+                        queue.flush();
+                        while (event.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>() != CL_COMPLETE)
+                            ;
+                        converged[i%2] = ccmaConvergedHostMemory[0];
+                        queue.enqueueUnmapMemObject(ccmaConvergedHostBuffer.getDeviceBuffer(), ccmaConvergedHostMemory);
+                    }
+                    else
+                        event.wait();
+                    if (converged[i%2])
+                        break;
                }
-                else
-                    event.wait();
-                if (converged[i%2])
-                    break;
            }
        }
    }

--- a/tests/TestVerletIntegrator.h
+++ b/tests/TestVerletIntegrator.h
@@ -227,6 +227,52 @@ void testConstrainedMasslessParticles() {
    ASSERT_EQUAL(0.0, state.getVelocities()[0][0]);
 }
+void testConstrainedChain(int numParticles) {
+    // Create a linear chain of particles with all distances constrained.
+    System system;
+    vector<Vec3> positions(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.0);
+        positions[i] = Vec3(i, 0, 0);
+        if (i > 0) {
+            system.addConstraint(i-1, i, 1.0);
+            Vec3 delta(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
+            delta /= sqrt(delta.dot(delta));
+            positions[i] = positions[i-1]+delta;
+        }
+    }
+    VerletIntegrator integrator(0.001);
+    integrator.setConstraintTolerance(1e-5);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    context.setVelocitiesToTemperature(300.0);
+    // Simulate it and see whether the constraints remain satisfied.
+    double initialEnergy = 0.0;
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Positions | State::Energy | State::Velocities | State::Forces);
+        for (int j = 0; j < system.getNumConstraints(); ++j) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(j, particle1, particle2, distance);
+            Vec3 p1 = state.getPositions()[particle1];
+            Vec3 p2 = state.getPositions()[particle2];
+            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
+            ASSERT_EQUAL_TOL(distance, dist, 2e-5);
+        }
+        double energy = state.getPotentialEnergy()+state.getKineticEnergy();
+        if (i == 1)
+            initialEnergy = energy;
+        else if (i > 1)
+            ASSERT_EQUAL_TOL(initialEnergy, energy, 0.01);
+        integrator.step(1);
+    }
+}
 void testInitialTemperature() {
    // Check temperature initialization for a collection of randomly placed particles
    const int numParticles = 50000;
@@ -289,6 +335,8 @@ int main(int argc, char* argv[]) {
        testConstraints();
        testConstrainedClusters();
        testConstrainedMasslessParticles();
+        testConstrainedChain(10);
+        testConstrainedChain(1500);
        testInitialTemperature();
        testForceGroups();
        runPlatformTests();