Began overhaul of CUDA CustomIntegrator in preparation for supporting flow control

59c809c0 · peastman · 44b96f0c · 59c809c0 · 59c809c0 · 59c809c0
Commit 59c809c0 authored Jul 16, 2015 by peastman
8 changed files
--- a/platforms/cpu/include/CompiledExpressionSet.h
+++ b/platforms/cpu/include/CompiledExpressionSet.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2014 Stanford University and the Authors.           *
+ * Portions copyright (c) 2014-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -33,7 +33,7 @@
 * -------------------------------------------------------------------------- */
 #include "lepton/CompiledExpression.h"
-#include "windowsExportCpu.h"
+#include "windowsExport.h"
 #include <string>
 #include <vector>
@@ -42,7 +42,7 @@ namespace OpenMM {
 /**
 * This class simplifies the management of a set of related CompiledExpressions that share variables.
 */
-class OPENMM_EXPORT_CPU CompiledExpressionSet {
+class OPENMM_EXPORT CompiledExpressionSet {
 public:
    CompiledExpressionSet();
    /**
@@ -60,6 +60,10 @@ public:
     * @param value    the value to set it to
     */
    void setVariable(int index, double value);
+    /**
+     * Get the total number of variables for which indices have been allocated.
+     */
+    int getNumVariables() const;
 private:
    std::vector<Lepton::CompiledExpression*> expressions;
    std::vector<std::string> variables;

--- a/platforms/cpu/src/CompiledExpressionSet.cpp
+++ b/platforms/cpu/src/CompiledExpressionSet.cpp
-/* Portions copyright (c) 2014 Stanford University and Simbios.
+/* Portions copyright (c) 2014-2015 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -21,7 +21,7 @@
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
-#include "CompiledExpressionSet.h"
+#include "openmm/internal/CompiledExpressionSet.h"
 using namespace OpenMM;
 using namespace Lepton;
@@ -54,3 +54,7 @@ void CompiledExpressionSet::setVariable(int index, double value) {
    for (int i = 0; i < (int) variableReferences[index].size(); i++)
        *variableReferences[index][i] = value;
 }
+int CompiledExpressionSet::getNumVariables() const {
+    return variables.size();
+}
--- a/platforms/cpu/include/CpuCustomGBForce.h
+++ b/platforms/cpu/include/CpuCustomGBForce.h
@@ -25,10 +25,10 @@
 #ifndef OPENMM_CPU_CUSTOM_GB_FORCE_H__
 #define OPENMM_CPU_CUSTOM_GB_FORCE_H__
-#include "CompiledExpressionSet.h"
 #include "CpuNeighborList.h"
 #include "lepton/CompiledExpression.h"
 #include "openmm/CustomGBForce.h"
+#include "openmm/internal/CompiledExpressionSet.h"
 #include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
 #include <map>

--- a/platforms/cpu/include/CpuCustomManyParticleForce.h
+++ b/platforms/cpu/include/CpuCustomManyParticleForce.h
@@ -27,9 +27,9 @@
 #include "ReferenceForce.h"
 #include "ReferenceBondIxn.h"
-#include "CompiledExpressionSet.h"
 #include "CpuNeighborList.h"
 #include "openmm/CustomManyParticleForce.h"
+#include "openmm/internal/CompiledExpressionSet.h"
 #include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
 #include "lepton/CompiledExpression.h"

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -35,6 +35,9 @@
 #include "CudaSort.h"
 #include "openmm/kernels.h"
 #include "openmm/System.h"
+#include "openmm/internal/CompiledExpressionSet.h"
+#include "openmm/internal/CustomIntegratorUtilities.h"
+#include "lepton/CompiledExpression.h"
 #include <cufft.h>
 namespace OpenMM {
@@ -1213,6 +1216,7 @@ private:
 */
 class CudaIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
 public:
+    enum GlobalTargetType {DT, VARIABLE, PARAMETER};
    CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateCustomStepKernel(name, platform), cu(cu),
            hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), contextParameterValues(NULL), sumBuffer(NULL), potentialEnergy(NULL),
            kineticEnergy(NULL), uniformRandoms(NULL), randomSeed(NULL), perDofValues(NULL) {
@@ -1279,15 +1283,17 @@ public:
    void setPerDofVariable(ContextImpl& context, int variable, const std::vector<Vec3>& values);
 private:
    class ReorderListener;
+    class GlobalTarget;
    std::string createGlobalComputation(const std::string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator, const std::string& energyName);
    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
    void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
+    void recordGlobalValue(double value, GlobalTarget target);
    void recordChangedParameters(ContextImpl& context);
    CudaContext& cu;
    double prevStepSize, energy;
    float energyFloat;
    int numGlobalVariables;
-    bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters, keNeedsForce;
+    bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce;
    mutable bool localValuesAreCurrent;
    CudaArray* globalValues;
    CudaArray* contextParameterValues;
@@ -1303,19 +1309,43 @@ private:
    mutable std::vector<std::vector<double> > localPerDofValuesDouble;
    std::vector<float> contextValuesFloat;
    std::vector<double> contextValuesDouble;
+    std::vector<float> globalValuesFloat;
+    std::vector<double> globalValuesDouble;
+    std::vector<double> initialGlobalVariables;
    std::vector<std::vector<CUfunction> > kernels;
    std::vector<std::vector<std::vector<void*> > > kernelArgs;
    std::vector<void*> kineticEnergyArgs;
    CUfunction randomKernel, kineticEnergyKernel, sumKineticEnergyKernel;
    std::vector<CustomIntegrator::ComputationType> stepType;
+    std::vector<CustomIntegratorUtilities::Comparison> comparisons;
+    std::vector<std::vector<Lepton::CompiledExpression> > globalExpressions;
+    CompiledExpressionSet expressionSet;
+    std::vector<bool> needsGlobals;
    std::vector<bool> needsForces;
    std::vector<bool> needsEnergy;
+    std::vector<bool> computeBothForceAndEnergy;
    std::vector<bool> invalidatesForces;
    std::vector<bool> merged;
-    std::vector<int> forceGroup;
+    std::vector<int> forceGroupFlags;
+    std::vector<int> blockEnd;
    std::vector<int> requiredGaussian;
    std::vector<int> requiredUniform;
+    std::vector<int> stepEnergyVariableIndex;
+    std::vector<int> globalVariableIndex;
+    std::vector<int> parameterVariableIndex;
+    int gaussianVariableIndex, uniformVariableIndex, dtVariableIndex;
    std::vector<std::string> parameterNames;
+    std::vector<GlobalTarget> stepTarget;
+};
+class CudaIntegrateCustomStepKernel::GlobalTarget {
+public:
+    CudaIntegrateCustomStepKernel::GlobalTargetType type;
+    int variableIndex;
+    GlobalTarget() {
+    }
+    GlobalTarget(CudaIntegrateCustomStepKernel::GlobalTargetType type, int variableIndex) : type(type), variableIndex(variableIndex) {
+    }
 };
 /**

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -5696,7 +5696,6 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo
    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    numGlobalVariables = integrator.getNumGlobalVariables();
    int elementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
-    globalValues = new CudaArray(cu, max(1, numGlobalVariables), elementSize, "globalVariables");
    sumBuffer = new CudaArray(cu, ((3*system.getNumParticles()+3)/4)*4, elementSize, "sumBuffer");
    potentialEnergy = new CudaArray(cu, 1, cu.getEnergyBuffer().getElementSize(), "potentialEnergy");
    kineticEnergy = new CudaArray(cu, 1, elementSize, "kineticEnergy");
@@ -5764,11 +5763,11 @@ string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& vari
    if (energyName != "")
        variables[energyName] = "energy";
    for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
-        variables[integrator.getGlobalVariableName(i)] = "globals["+cu.intToString(i)+"]";
+        variables[integrator.getGlobalVariableName(i)] = "globals["+cu.intToString(globalVariableIndex[i])+"]";
    for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
        variables[integrator.getPerDofVariableName(i)] = "perDof"+suffix.substr(1)+perDofValues->getParameterSuffix(i);
    for (int i = 0; i < (int) parameterNames.size(); i++)
-        variables[parameterNames[i]] = "params["+cu.intToString(i)+"]";
+        variables[parameterNames[i]] = "globals["+cu.intToString(parameterVariableIndex[i])+"]";
    vector<const TabulatedFunction*> functions;
    vector<pair<string, string> > functionNames;
    return cu.getExpressionUtilities().createExpressions(expressions, variables, functions, functionNames, "temp"+cu.intToString(component)+"_", "double");
@@ -5808,10 +5807,10 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        kernelArgs.resize(integrator.getNumComputations());
        requiredGaussian.resize(integrator.getNumComputations(), 0);
        requiredUniform.resize(integrator.getNumComputations(), 0);
-        needsForces.resize(numSteps, false);
+        needsGlobals.resize(numSteps, false);
-        needsEnergy.resize(numSteps, false);
+        globalExpressions.resize(numSteps);
-        forceGroup.resize(numSteps, -2);
+        stepType.resize(numSteps);
-        invalidatesForces.resize(numSteps, false);
+        stepTarget.resize(numSteps);
        merged.resize(numSteps, false);
        modifiesParameters = false;
        map<string, string> defines;
@@ -5819,24 +5818,40 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
        defines["WORK_GROUP_SIZE"] = cu.intToString(CudaContext::ThreadBlockSize);
        defines["SUM_BUFFER_SIZE"] = "0";
-        defines["SUM_OUTPUT_INDEX"] = "0";
-        // Build a list of all variables that affect the forces, so we can tell which
+        // Record information about all the computation steps.
-        // steps invalidate them.
+        vector<string> variable(numSteps);
-        set<string> affectsForce;
+        vector<int> forceGroup;
-        affectsForce.insert("x");
+        vector<vector<Lepton::ParsedExpression> > expression;
-        for (vector<ForceImpl*>::const_iterator iter = context.getForceImpls().begin(); iter != context.getForceImpls().end(); ++iter) {
+        CustomIntegratorUtilities::analyzeComputations(context, integrator, expression, comparisons, blockEnd, invalidatesForces, needsForces, needsEnergy, computeBothForceAndEnergy, forceGroup);
-            const map<string, double> params = (*iter)->getDefaultParameters();
+        for (int step = 0; step < numSteps; step++) {
-            for (map<string, double>::const_iterator param = params.begin(); param != params.end(); ++param)
+            string expr;
-                affectsForce.insert(param->first);
+            integrator.getComputationStep(step, stepType[step], variable[step], expr);
+            if (stepType[step] == CustomIntegrator::BeginWhileBlock)
+                blockEnd[blockEnd[step]] = step; // Record where to branch back to.
+            if (stepType[step] == CustomIntegrator::ComputeGlobal || stepType[step] == CustomIntegrator::BeginIfBlock || stepType[step] == CustomIntegrator::BeginWhileBlock)
+                for (int i = 0; i < (int) expression[step].size(); i++)
+                    globalExpressions[step].push_back(expression[step][i].createCompiledExpression());
+        }
+        for (int step = 0; step < numSteps; step++) {
+            for (int i = 0; i < (int) globalExpressions[step].size(); i++)
+                expressionSet.registerExpression(globalExpressions[step][i]);
        }
-        // Record information about all the computation steps.
+        // Record the indices for variables in the CompiledExpressionSet.
-        stepType.resize(numSteps);
+        gaussianVariableIndex = expressionSet.getVariableIndex("gaussian");
-        vector<string> variable(numSteps);
+        uniformVariableIndex = expressionSet.getVariableIndex("uniform");
-        vector<Lepton::ParsedExpression> expression(numSteps);
+        dtVariableIndex = expressionSet.getVariableIndex("dt");
+        for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
+            globalVariableIndex.push_back(expressionSet.getVariableIndex(integrator.getGlobalVariableName(i)));
+        for (int i = 0; i < (int) parameterNames.size(); i++)
+            parameterVariableIndex.push_back(expressionSet.getVariableIndex(parameterNames[i]));
+        // Record the variable names and flags for the force and energy in each step.
+        forceGroupFlags.resize(numSteps, -1);
        vector<string> forceGroupName;
        vector<string> energyGroupName;
        for (int i = 0; i < 32; i++) {
@@ -5849,41 +5864,65 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        }
        vector<string> forceName(numSteps, "f");
        vector<string> energyName(numSteps, "energy");
+        stepEnergyVariableIndex.resize(numSteps, expressionSet.getVariableIndex("energy"));
        for (int step = 0; step < numSteps; step++) {
-            string expr;
+            if (needsForces[step] && forceGroup[step] > -1)
-            integrator.getComputationStep(step, stepType[step], variable[step], expr);
+                forceName[step] = forceGroupName[forceGroup[step]];
-            if (expr.size() > 0) {
+            if (needsEnergy[step] && forceGroup[step] > -1) {
-                expression[step] = Lepton::Parser::parse(expr).optimize();
+                energyName[step] = energyGroupName[forceGroup[step]];
-                if (usesVariable(expression[step], "f")) {
+                stepEnergyVariableIndex[step] = expressionSet.getVariableIndex(energyName[step]);
-                    needsForces[step] = true;
+            }
-                    forceGroup[step] = -1;
+            if (forceGroup[step] > -1)
-                }
+                forceGroupFlags[step] = 1<<forceGroup[step];
-                if (usesVariable(expression[step], "energy")) {
+            if (forceGroupFlags[step] == -2 && step > 0)
-                    needsEnergy[step] = true;
+                forceGroupFlags[step] = forceGroupFlags[step-1];
-                    forceGroup[step] = -1;
+            if (forceGroupFlags[step] != -2 && savedForces.find(forceGroupFlags[step]) == savedForces.end())
-                }
+                savedForces[forceGroupFlags[step]] = new CudaArray(cu, cu.getForce().getSize(), cu.getForce().getElementSize(), "savedForces");
-                for (int i = 0; i < 32; i++) {
+        }
-                    if (usesVariable(expression[step], forceGroupName[i])) {
-                        if (forceGroup[step] != -2)
+        // Allocate space for storing global values, both on the host and the device.
-                            throw OpenMMException("A single computation step cannot depend on multiple force groups");
-                        needsForces[step] = true;
+        globalValuesFloat.resize(expressionSet.getNumVariables());
-                        forceGroup[step] = 1<<i;
+        globalValuesDouble.resize(expressionSet.getNumVariables());
-                        forceName[step] = forceGroupName[i];
+        int elementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
-                    }
+        globalValues = new CudaArray(cu, expressionSet.getNumVariables(), elementSize, "globalValues");
-                    if (usesVariable(expression[step], energyGroupName[i])) {
+        for (int i = 0; i < integrator.getNumGlobalVariables(); i++) {
-                        if (forceGroup[step] != -2)
+            globalValuesDouble[globalVariableIndex[i]] = initialGlobalVariables[i];
-                            throw OpenMMException("A single computation step cannot depend on multiple force groups");
+            expressionSet.setVariable(globalVariableIndex[i], initialGlobalVariables[i]);
-                        needsEnergy[step] = true;
+        }
-                        forceGroup[step] = 1<<i;
+        for (int i = 0; i < (int) parameterVariableIndex.size(); i++) {
-                        energyName[step] = energyGroupName[i];
+            double value = context.getParameter(parameterNames[i]);
-                    }
+            globalValuesDouble[parameterVariableIndex[i]] = value;
-                }
+            expressionSet.setVariable(parameterVariableIndex[i], value);
+        }
+        // Record information about the targets of steps that will be stored in global variables.
+        for (int step = 0; step < numSteps; step++) {
+            if (stepType[step] == CustomIntegrator::ComputeGlobal || stepType[step] == CustomIntegrator::ComputeSum) {
+                if (variable[step] == "dt")
+                    stepTarget[step].type = DT;
+                for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
+                    if (variable[step] == integrator.getGlobalVariableName(i))
+                        stepTarget[step].type = VARIABLE;
+                for (int i = 0; i < (int) parameterNames.size(); i++)
+                    if (variable[step] == parameterNames[i])
+                        stepTarget[step].type = PARAMETER;
+                stepTarget[step].variableIndex = expressionSet.getVariableIndex(variable[step]);
+            }
+        }
+        // Identify which per-DOF steps are going to require global variables or context parameters.
+        for (int step = 0; step < numSteps; step++) {
+            if (stepType[step] == CustomIntegrator::ComputePerDof || stepType[step] == CustomIntegrator::ComputeSum) {
+                for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
+                    if (usesVariable(expression[step][0], integrator.getGlobalVariableName(i)))
+                        needsGlobals[step] = true;
+                for (int i = 0; i < (int) parameterNames.size(); i++)
+                    if (usesVariable(expression[step][0], parameterNames[i]))
+                        needsGlobals[step] = true;
            }
-            invalidatesForces[step] = (stepType[step] == CustomIntegrator::ConstrainPositions || affectsForce.find(variable[step]) != affectsForce.end());
-            if (forceGroup[step] == -2 && step > 0)
-                forceGroup[step] = forceGroup[step-1];
-            if (forceGroup[step] != -2 && savedForces.find(forceGroup[step]) == savedForces.end())
-                savedForces[forceGroup[step]] = new CudaArray(cu, cu.getForce().getSize(), cu.getForce().getElementSize(), "savedForces");
        }
        // Determine how each step will represent the position (as just a value, or a value plus a delta).
@@ -5911,9 +5950,6 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        for (int step = 1; step < numSteps; step++) {
            if (needsForces[step] || needsEnergy[step])
                continue;
-            if (stepType[step-1] == CustomIntegrator::ComputeGlobal && stepType[step] == CustomIntegrator::ComputeGlobal &&
-                    !usesVariable(expression[step], "uniform") && !usesVariable(expression[step], "gaussian"))
-                merged[step] = true;
            if (stepType[step-1] == CustomIntegrator::ComputePerDof && stepType[step] == CustomIntegrator::ComputePerDof)
                merged[step] = true;
        }
@@ -5933,15 +5969,15 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
                }
                int numGaussian = 0, numUniform = 0;
                for (int j = step; j < numSteps && (j == step || merged[j]); j++) {
-                    numGaussian += numAtoms*usesVariable(expression[j], "gaussian");
+                    numGaussian += numAtoms*usesVariable(expression[j][0], "gaussian");
-                    numUniform += numAtoms*usesVariable(expression[j], "uniform");
+                    numUniform += numAtoms*usesVariable(expression[j][0], "uniform");
                    compute << "{\n";
                    if (numGaussian > 0)
                        compute << "float4 gaussian = gaussianValues[gaussianIndex+index];\n";
                    if (numUniform > 0)
                        compute << "float4 uniform = uniformValues[uniformIndex+index];\n";
                    for (int i = 0; i < 3; i++)
-                        compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j], i, integrator, forceName[j], energyName[j]);
+                        compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j][0], i, integrator, forceName[j], energyName[j]);
                    if (variable[j] == "x") {
                        if (storePosAsDelta[j])
                            compute << "posDelta[index] = convertFromDouble4(position-convertToDouble4(loadPos(posq, posqCorrection, index)));\n";
@@ -6010,14 +6046,12 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
                    bool found = false;
                    for (int j = 0; j < integrator.getNumGlobalVariables() && !found; j++)
                        if (variable[step] == integrator.getGlobalVariableName(j)) {
-                            args2.push_back(&globalValues->getDevicePointer());
+                            args2.push_back(&kineticEnergy->getDevicePointer());
-                            defines["SUM_OUTPUT_INDEX"] = cu.intToString(j);
                            found = true;
                        }
                    for (int j = 0; j < (int) parameterNames.size() && !found; j++)
                        if (variable[step] == parameterNames[j]) {
-                            args2.push_back(&contextParameterValues->getDevicePointer());
+                            args2.push_back(&kineticEnergy->getDevicePointer());
-                            defines["SUM_OUTPUT_INDEX"] = cu.intToString(j);
                            found = true;
                            modifiesParameters = true;
                        }
@@ -6035,7 +6069,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
                stringstream compute;
                for (int i = step; i < numSteps && (i == step || merged[i]); i++)
-                    compute << "{\n" << createGlobalComputation(variable[i], expression[i], integrator, energyName[i]) << "}\n";
+                    compute << "{\n" << createGlobalComputation(variable[i], expression[i][0], integrator, energyName[i]) << "}\n";
                map<string, string> replacements;
                replacements["COMPUTE_STEP"] = compute.str();
                CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customIntegratorGlobal, replacements), defines);
@@ -6111,7 +6145,6 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
            args << ", " << buffer.getType() << "* __restrict__ " << valueName;
        }
        replacements["PARAMETER_ARGUMENTS"] = args.str();
-        defines["SUM_OUTPUT_INDEX"] = "0";
        defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms);
        if (defines.find("LOAD_POS_AS_DELTA") != defines.end())
            defines.erase("LOAD_POS_AS_DELTA");
@@ -6144,7 +6177,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        sumKineticEnergyKernel = cu.getKernel(module, useDouble ? "computeDoubleSum" : "computeFloatSum");
    }
-    // Make sure all values (variables, parameters, etc.) stored on the device are up to date.
+    // Make sure all values (variables, parameters, etc.) are up to date.
    if (!deviceValuesAreCurrent) {
        if (useDouble)
@@ -6156,38 +6189,14 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
    localValuesAreCurrent = false;
    double stepSize = integrator.getStepSize();
    if (stepSize != prevStepSize) {
-        if (useDouble) {
+        recordGlobalValue(stepSize, GlobalTarget(DT, dtVariableIndex));
-            double size[] = {0, stepSize};
-            integration.getStepSize().upload(size);
-        }
-        else {
-            float size[] = {0, (float) stepSize};
-            integration.getStepSize().upload(size);
-        }
-        prevStepSize = stepSize;
    }
-    bool paramsChanged = false;
+    for (int i = 0; i < (int) parameterNames.size(); i++) {
-    if (useDouble) {
+        double value = context.getParameter(parameterNames[i]);
-        for (int i = 0; i < (int) parameterNames.size(); i++) {
+        if (value != globalValuesDouble[parameterVariableIndex[i]]) {
-            double value = context.getParameter(parameterNames[i]);
+            globalValuesDouble[parameterVariableIndex[i]] = value;
-            if (value != contextValuesDouble[i]) {
+            deviceGlobalsAreCurrent = false;
-                contextValuesDouble[i] = value;
-                paramsChanged = true;
-            }
        }
-        if (paramsChanged)
-            contextParameterValues->upload(contextValuesDouble);
-    }
-    else {
-        for (int i = 0; i < (int) parameterNames.size(); i++) {
-            float value = (float) context.getParameter(parameterNames[i]);
-            if (value != contextValuesFloat[i]) {
-                contextValuesFloat[i] = value;
-                paramsChanged = true;
-            }
-        }
-        if (paramsChanged)
-            contextParameterValues->upload(contextValuesFloat);
    }
 }
@@ -6204,7 +6213,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
    for (int i = 0; i < numSteps; i++) {
        int lastForceGroups = context.getLastForceGroups();
-        if ((needsForces[i] || needsEnergy[i]) && (!forcesAreValid || lastForceGroups != forceGroup[i])) {
+        if ((needsForces[i] || needsEnergy[i]) && (!forcesAreValid || lastForceGroups != forceGroupFlags[i])) {
            if (forcesAreValid && savedForces.find(lastForceGroups) != savedForces.end()) {
                // The forces are still valid.  We just need a different force group right now.  Save the old
                // forces in case we need them again.
@@ -6218,31 +6227,31 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
            // Recompute forces and/or energy.  Figure out what is actually needed
            // between now and the next time they get invalidated again.
-            bool computeForce = false, computeEnergy = false;
+            bool computeForce = (needsForces[i] || computeBothForceAndEnergy[i]);
-            for (int j = i; ; j++) {
+            bool computeEnergy = (needsEnergy[i] || computeBothForceAndEnergy[i]);
-                if (needsForces[j])
+            if (!computeEnergy && validSavedForces.find(forceGroupFlags[i]) != validSavedForces.end()) {
-                    computeForce = true;
-                if (needsEnergy[j])
-                    computeEnergy = true;
-                if (invalidatesForces[j])
-                    break;
-                if (j == numSteps-1)
-                    j = -1;
-                if (j == i-1)
-                    break;
-            }
-            if (!computeEnergy && validSavedForces.find(forceGroup[i]) != validSavedForces.end()) {
                // We can just restore the forces we saved earlier.
-                savedForces[forceGroup[i]]->copyTo(cu.getForce());
+                savedForces[forceGroupFlags[i]]->copyTo(cu.getForce());
            }
            else {
                recordChangedParameters(context);
-                energy = context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroup[i]);
+                energy = context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroupFlags[i]);
                energyFloat = (float) energy;
            }
            forcesAreValid = true;
        }
+        if (needsGlobals[i] && !deviceGlobalsAreCurrent) {
+            // Upload the global values to the device.
+            if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision())
+                globalValues->upload(globalValuesDouble);
+            else {
+                for (int j = 0; j < (int) globalValuesDouble.size(); j++)
+                    globalValuesFloat[j] = (float) globalValuesDouble[j];
+                globalValues->upload(globalValuesFloat);
+            }
+        }
        if (stepType[i] == CustomIntegrator::ComputePerDof && !merged[i]) {
            int randomIndex = integration.prepareRandomNumbers(requiredGaussian[i]);
            kernelArgs[i][0][1] = &posCorrection;
@@ -6253,12 +6262,11 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
                cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
            cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms);
        }
-        else if (stepType[i] == CustomIntegrator::ComputeGlobal && !merged[i]) {
+        else if (stepType[i] == CustomIntegrator::ComputeGlobal) {
-            float uniform = SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber();
+            expressionSet.setVariable(uniformVariableIndex, SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber());
-            float gauss = SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
+            expressionSet.setVariable(gaussianVariableIndex, SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
-            kernelArgs[i][0][3] = &uniform;
+            expressionSet.setVariable(stepEnergyVariableIndex[i], energy);
-            kernelArgs[i][0][4] = &gauss;
+            recordGlobalValue(globalExpressions[i][0].evaluate(), stepTarget[i]);
-            cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], 1, 1);
        }
        else if (stepType[i] == CustomIntegrator::ComputeSum) {
            int randomIndex = integration.prepareRandomNumbers(requiredGaussian[i]);
@@ -6271,6 +6279,16 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
            cu.clearBuffer(*sumBuffer);
            cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms);
            cu.executeKernel(kernels[i][1], &kernelArgs[i][1][0], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
+            if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
+                double value;
+                kineticEnergy->download(&value);
+                globalValuesDouble[stepTarget[i].variableIndex] = value;
+            }
+            else {
+                float value;
+                kineticEnergy->download(&value);
+                globalValuesDouble[stepTarget[i].variableIndex] = value;
+            }
        }
        else if (stepType[i] == CustomIntegrator::UpdateContextState) {
            recordChangedParameters(context);
@@ -6335,52 +6353,63 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
    }
 }
+void CudaIntegrateCustomStepKernel::recordGlobalValue(double value, GlobalTarget target) {
+    switch (target.type) {
+        case DT:
+            globalValuesDouble[dtVariableIndex] = value;
+            deviceGlobalsAreCurrent = false;
+            if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
+                double size[] = {0, value};
+                cu.getIntegrationUtilities().getStepSize().upload(size);
+            }
+            else {
+                float size[] = {0, (float) value};
+                cu.getIntegrationUtilities().getStepSize().upload(size);
+            }
+            prevStepSize = value;
+            break;
+        case VARIABLE:
+        case PARAMETER:
+            expressionSet.setVariable(target.variableIndex, value);
+            globalValuesDouble[target.variableIndex] = value;
+            deviceGlobalsAreCurrent = false;
+            break;
+    }
+}
 void CudaIntegrateCustomStepKernel::recordChangedParameters(ContextImpl& context) {
    if (!modifiesParameters)
        return;
-    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
+    for (int i = 0; i < (int) parameterNames.size(); i++) {
-        contextParameterValues->download(contextValuesDouble);
+        double value = context.getParameter(parameterNames[i]);
-        for (int i = 0; i < (int) parameterNames.size(); i++) {
+        if (value != globalValuesDouble[parameterVariableIndex[i]])
-            double value = context.getParameter(parameterNames[i]);
+            context.setParameter(parameterNames[i], globalValuesDouble[parameterVariableIndex[i]]);
-            if (value != contextValuesDouble[i])
-                context.setParameter(parameterNames[i], contextValuesDouble[i]);
-        }
-    }
-    else {
-        contextParameterValues->download(contextValuesFloat);
-        for (int i = 0; i < (int) parameterNames.size(); i++) {
-            float value = (float) context.getParameter(parameterNames[i]);
-            if (value != contextValuesFloat[i])
-                context.setParameter(parameterNames[i], contextValuesFloat[i]);
-        }
    }
 }
 void CudaIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, vector<double>& values) const {
-    values.resize(numGlobalVariables);
+    if (globalValues == NULL) {
-    if (numGlobalVariables == 0)
+        // The data structures haven't been created yet, so just return the list of values that was given earlier.
-        return;
-    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision())
+        values = initialGlobalVariables;
-        globalValues->download(values);
-    else {
-        vector<float> buffer;
-        globalValues->download(buffer);
-        for (int i = 0; i < numGlobalVariables; i++)
-            values[i] = buffer[i];
    }
+    values.resize(numGlobalVariables);
+    for (int i = 0; i < numGlobalVariables; i++)
+        values[i] = globalValuesDouble[globalVariableIndex[i]];
 }
 void CudaIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, const vector<double>& values) {
    if (numGlobalVariables == 0)
        return;
-    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision())
+    if (globalValues == NULL) {
-        globalValues->upload(values);
+        // The data structures haven't been created yet, so just store the list of values.
-    else {
-        vector<float> buffer(numGlobalVariables);
+        initialGlobalVariables = values;
-        for (int i = 0; i < numGlobalVariables; i++)
+        return;
-            buffer[i] = (float) values[i];
-        globalValues->upload(buffer);
    }
+    for (int i = 0; i < numGlobalVariables; i++)
+        globalValuesDouble[globalVariableIndex[i]] = values[i];
+    deviceGlobalsAreCurrent = false;
 }
 void CudaIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int variable, vector<Vec3>& values) const {

--- a/platforms/cuda/src/kernels/customIntegrator.cu
+++ b/platforms/cuda/src/kernels/customIntegrator.cu
@@ -11,7 +11,7 @@ extern "C" __global__ void computeFloatSum(const float* __restrict__ sumBuffer,
            tempBuffer[thread] += tempBuffer[thread+i];
    }
    if (thread == 0)
-        result[SUM_OUTPUT_INDEX] = tempBuffer[0];
+        *result = tempBuffer[0];
 }
 extern "C" __global__ void computeDoubleSum(const double* __restrict__ sumBuffer, double* result) {
@@ -27,7 +27,7 @@ extern "C" __global__ void computeDoubleSum(const double* __restrict__ sumBuffer
            tempBuffer[thread] += tempBuffer[thread+i];
    }
    if (thread == 0)
-        result[SUM_OUTPUT_INDEX] = tempBuffer[0];
+        *result = tempBuffer[0];
 }
 extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {

--- a/platforms/cuda/src/kernels/integrationUtilities.cu
+++ b/platforms/cuda/src/kernels/integrationUtilities.cu
@@ -224,7 +224,6 @@ extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, co
        mixed4 xpj2 = make_mixed4(0);
        float invMassCentral = params.x;
        float avgMass = params.y;
-        float d2 = params.z;
        float invMassPeripheral = params.w;
        if (atoms.z != -1) {
            pos2 = loadPos(oldPos, posCorrection, atoms.z);
@@ -245,9 +244,6 @@ extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, co
        mixed rij1sq = rij1.x*rij1.x + rij1.y*rij1.y + rij1.z*rij1.z;
        mixed rij2sq = rij2.x*rij2.x + rij2.y*rij2.y + rij2.z*rij2.z;
        mixed rij3sq = rij3.x*rij3.x + rij3.y*rij3.y + rij3.z*rij3.z;
-        mixed ld1 = d2-rij1sq;
-        mixed ld2 = d2-rij2sq;
-        mixed ld3 = d2-rij3sq;
        // Iterate until convergence.
@@ -605,8 +601,6 @@ extern "C" __global__ void computeCCMAVelocityConstraintForce(const int2* __rest
    if (threadIdx.x == 0)
        groupConverged = 1;
    __syncthreads();
-    mixed lowerTol = 1-2*tol+tol*tol;
-    mixed upperTol = 1+2*tol+tol*tol;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
        // Compute the force due to this constraint.