Converted more code to common platform (#3073)

* Converted more code to common platform * Converted more code to common platform

Converted more code to common platform (#3073)
* Converted more code to common platform * Converted more code to common platform
98d81730 · Peter Eastman · GitHub · 72c70cfe · 98d81730 · 98d81730
Unverified Commit 98d81730 authored Mar 19, 2021 by Peter Eastman Committed by GitHub Mar 19, 2021
20 changed files
--- a/platforms/common/include/openmm/common/CommonKernels.h
+++ b/platforms/common/include/openmm/common/CommonKernels.h
@@ -35,10 +35,68 @@
 #include "openmm/internal/CompiledExpressionSet.h"
 #include "openmm/internal/CustomIntegratorUtilities.h"
 #include "lepton/CompiledExpression.h"
+#include "lepton/ExpressionProgram.h"

 namespace OpenMM {


+/**
+ * This kernel modifies the positions of particles to enforce distance constraints.
+ */
+class CommonApplyConstraintsKernel : public ApplyConstraintsKernel {
+public:
+    CommonApplyConstraintsKernel(std::string name, const Platform& platform, ComputeContext& cc) : ApplyConstraintsKernel(name, platform),
+            cc(cc), hasInitializedKernel(false) {
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     */
+    void initialize(const System& system);
+    /**
+     * Update particle positions to enforce constraints.
+     *
+     * @param context    the context in which to execute this kernel
+     * @param tol        the distance tolerance within which constraints must be satisfied.
+     */
+    void apply(ContextImpl& context, double tol);
+    /**
+     * Update particle velocities to enforce constraints.
+     *
+     * @param context    the context in which to execute this kernel
+     * @param tol        the velocity tolerance within which constraints must be satisfied.
+     */
+    void applyToVelocities(ContextImpl& context, double tol);
+private:
+    ComputeContext& cc;
+    bool hasInitializedKernel;
+    ComputeKernel applyDeltasKernel;
+};
+
+/**
+ * This kernel recomputes the positions of virtual sites.
+ */
+class CommonVirtualSitesKernel : public VirtualSitesKernel {
+public:
+    CommonVirtualSitesKernel(std::string name, const Platform& platform, ComputeContext& cc) : VirtualSitesKernel(name, platform), cc(cc) {
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     */
+    void initialize(const System& system);
+    /**
+     * Compute the virtual site locations.
+     *
+     * @param context    the context in which to execute this kernel
+     */
+    void computePositions(ContextImpl& context);
+private:
+    ComputeContext& cc;
+};
+
 /**
 * This kernel is invoked by HarmonicBondForce to calculate the forces acting on the system and the energy of the system.
 */
@@ -832,6 +890,65 @@ private:
    ComputeEvent event;
 };

+/**
+ * This kernel is invoked by CustomCVForce to calculate the forces acting on the system and the energy of the system.
+ */
+class CommonCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
+public:
+    CommonCalcCustomCVForceKernel(std::string name, const Platform& platform, ComputeContext& cc) : CalcCustomCVForceKernel(name, platform),
+            cc(cc), hasInitializedListeners(false) {
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomCVForce this kernel will be used for
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     */
+    void initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy);
+    /**
+     * Copy state information to the inner context.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     */
+    void copyState(ContextImpl& context, ContextImpl& innerContext);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the CustomCVForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const CustomCVForce& force);
+    /**
+     * Get the ComputeContext corresponding to the inner Context.
+     */
+    virtual ComputeContext& getInnerComputeContext(ContextImpl& innerContext) = 0;
+private:
+    class ForceInfo;
+    class ReorderListener;
+    ComputeContext& cc;
+    bool hasInitializedListeners;
+    Lepton::ExpressionProgram energyExpression;
+    std::vector<std::string> variableNames, paramDerivNames, globalParameterNames;
+    std::vector<Lepton::ExpressionProgram> variableDerivExpressions;
+    std::vector<Lepton::ExpressionProgram> paramDerivExpressions;
+    std::vector<ComputeArray> cvForces;
+    ComputeArray invAtomOrder;
+    ComputeArray innerInvAtomOrder;
+    ComputeKernel copyStateKernel, copyForcesKernel, addForcesKernel;
+};
+
 /**
 * This kernel is invoked by VerletIntegrator to take one time step.
 */
@@ -1399,6 +1516,52 @@ private:
    ComputeKernel kernel;
 };

+/**
+ * This kernel is invoked by MonteCarloBarostat to adjust the periodic box volume
+ */
+class CommonApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
+public:
+    CommonApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, ComputeContext& cc) : ApplyMonteCarloBarostatKernel(name, platform), cc(cc),
+            hasInitializedKernels(false) {
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param barostat   the MonteCarloBarostat this kernel will be used for
+     */
+    void initialize(const System& system, const Force& barostat);
+    /**
+     * Attempt a Monte Carlo step, scaling particle positions (or cluster centers) by a specified value.
+     * This version scales the x, y, and z positions independently.
+     * This is called BEFORE the periodic box size is modified.  It should begin by translating each particle
+     * or cluster into the first periodic box, so that coordinates will still be correct after the box size
+     * is changed.
+     *
+     * @param context    the context in which to execute this kernel
+     * @param scaleX     the scale factor by which to multiply particle x-coordinate
+     * @param scaleY     the scale factor by which to multiply particle y-coordinate
+     * @param scaleZ     the scale factor by which to multiply particle z-coordinate
+     */
+    void scaleCoordinates(ContextImpl& context, double scaleX, double scaleY, double scaleZ);
+    /**
+     * Reject the most recent Monte Carlo step, restoring the particle positions to where they were before
+     * scaleCoordinates() was last called.
+     *
+     * @param context    the context in which to execute this kernel
+     */
+    void restoreCoordinates(ContextImpl& context);
+private:
+    ComputeContext& cc;
+    bool hasInitializedKernels;
+    int numMolecules;
+    ComputeArray savedPositions, savedFloatForces, savedLongForces;
+    ComputeArray moleculeAtoms;
+    ComputeArray moleculeStartIndex;
+    ComputeKernel kernel;
+    std::vector<int> lastAtomOrder;
+};
+
 } // namespace OpenMM

 #endif /*OPENMM_COMMONKERNELS_H_*/
--- a/platforms/common/include/openmm/common/ComputeContext.h
+++ b/platforms/common/include/openmm/common/ComputeContext.h
@@ -300,10 +300,14 @@ public:
    virtual ArrayInterface& getVelm() = 0;
    /**
     * On devices that do not support 64 bit atomics, this returns an array containing buffers of type real4 in which
-     * forces can be accumulated.  Do not call this if getSupports64BitGlobalAtomics() returns true.  The returned value
-     * in that case is undefined, and it may throw an exception.
+     * forces can be accumulated.  On platforms that do not use floating point force buffers, this will throw an exception.
     */
    virtual ArrayInterface& getForceBuffers() = 0;
+    /**
+     * Get the array which contains a contribution to each force represented as a real4.  On platforms that do not use
+     * floating point force buffers, this will throw an exception.
+     */
+    virtual ArrayInterface& getFloatForceBuffer() = 0;
    /**
     * Get the array which contains a contribution to each force represented as 64 bit fixed point.
     */

--- a/platforms/common/src/CommonKernels.cpp
+++ b/platforms/common/src/CommonKernels.cpp
@@ -98,6 +98,49 @@ static pair<ExpressionTreeNode, string> makeVariable(const string& name, const s
    return make_pair(ExpressionTreeNode(new Operation::Variable(name)), value);
 }

+static void replaceFunctionsInExpression(map<string, CustomFunction*>& functions, ExpressionProgram& expression) {
+    for (int i = 0; i < expression.getNumOperations(); i++) {
+        if (expression.getOperation(i).getId() == Operation::CUSTOM) {
+            const Operation::Custom& op = dynamic_cast<const Operation::Custom&>(expression.getOperation(i));
+            expression.setOperation(i, new Operation::Custom(op.getName(), functions[op.getName()]->clone(), op.getDerivOrder()));
+        }
+    }
+}
+
+void CommonApplyConstraintsKernel::initialize(const System& system) {
+}
+
+void CommonApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
+    cc.setAsCurrent();
+    if (!hasInitializedKernel) {
+        hasInitializedKernel = true;
+        map<string, string> defines;
+        ComputeProgram program = cc.compileProgram(CommonKernelSources::constraints, defines);
+        applyDeltasKernel = program->createKernel("applyPositionDeltas");
+        applyDeltasKernel->addArg(cc.getNumAtoms());
+        applyDeltasKernel->addArg(cc.getPosq());
+        applyDeltasKernel->addArg(cc.getIntegrationUtilities().getPosDelta());
+        if (cc.getUseMixedPrecision())
+            applyDeltasKernel->addArg(cc.getPosqCorrection());
+    }
+    IntegrationUtilities& integration = cc.getIntegrationUtilities();
+    cc.clearBuffer(integration.getPosDelta());
+    integration.applyConstraints(tol);
+    applyDeltasKernel->execute(cc.getNumAtoms());
+    integration.computeVirtualSites();
+}
+
+void CommonApplyConstraintsKernel::applyToVelocities(ContextImpl& context, double tol) {
+    cc.getIntegrationUtilities().applyVelocityConstraints(tol);
+}
+
+void CommonVirtualSitesKernel::initialize(const System& system) {
+}
+
+void CommonVirtualSitesKernel::computePositions(ContextImpl& context) {
+    cc.getIntegrationUtilities().computeVirtualSites();
+}
+
 class CommonCalcHarmonicBondForceKernel::ForceInfo : public ComputeForceInfo {
 public:
    ForceInfo(const HarmonicBondForce& force) : force(force) {
@@ -4993,6 +5036,225 @@ void CommonCalcGayBerneForceKernel::sortAtoms() {
    exclusionStartIndex.upload(startIndexVec);
 }

+class CommonCalcCustomCVForceKernel::ForceInfo : public ComputeForceInfo {
+public:
+    ForceInfo(ComputeForceInfo& force) : force(force) {
+    }
+    bool areParticlesIdentical(int particle1, int particle2) {
+        return force.areParticlesIdentical(particle1, particle2);
+    }
+    int getNumParticleGroups() {
+        return force.getNumParticleGroups();
+    }
+    void getParticlesInGroup(int index, std::vector<int>& particles) {
+        force.getParticlesInGroup(index, particles);
+    }
+    bool areGroupsIdentical(int group1, int group2) {
+        return force.areGroupsIdentical(group1, group2);
+    }
+private:
+    ComputeForceInfo& force;
+};
+
+class CommonCalcCustomCVForceKernel::ReorderListener : public ComputeContext::ReorderListener {
+public:
+    ReorderListener(ComputeContext& cc, ArrayInterface& invAtomOrder) : cc(cc), invAtomOrder(invAtomOrder) {
+    }
+    void execute() {
+        vector<int> invOrder(cc.getPaddedNumAtoms());
+        const vector<int>& order = cc.getAtomIndex();
+        for (int i = 0; i < order.size(); i++)
+            invOrder[order[i]] = i;
+        invAtomOrder.upload(invOrder);
+    }
+private:
+    ComputeContext& cc;
+    ArrayInterface& invAtomOrder;
+};
+
+void CommonCalcCustomCVForceKernel::initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext) {
+    int numCVs = force.getNumCollectiveVariables();
+    for (int i = 0; i < force.getNumGlobalParameters(); i++)
+        globalParameterNames.push_back(force.getGlobalParameterName(i));
+    for (int i = 0; i < numCVs; i++)
+        variableNames.push_back(force.getCollectiveVariableName(i));
+    for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
+        string name = force.getEnergyParameterDerivativeName(i);
+        paramDerivNames.push_back(name);
+        cc.addEnergyParameterDerivative(name);
+    }
+
+    // Create custom functions for the tabulated functions.
+
+    map<string, Lepton::CustomFunction*> functions;
+    for (int i = 0; i < (int) force.getNumTabulatedFunctions(); i++)
+        functions[force.getTabulatedFunctionName(i)] = createReferenceTabulatedFunction(force.getTabulatedFunction(i));
+
+    // Create the expressions.
+
+    Lepton::ParsedExpression energyExpr = Lepton::Parser::parse(force.getEnergyFunction(), functions);
+    energyExpression = energyExpr.createProgram();
+    variableDerivExpressions.clear();
+    for (auto& name : variableNames)
+        variableDerivExpressions.push_back(energyExpr.differentiate(name).optimize().createProgram());
+    paramDerivExpressions.clear();
+    for (auto& name : paramDerivNames)
+        paramDerivExpressions.push_back(energyExpr.differentiate(name).optimize().createProgram());
+
+    // Delete the custom functions.
+
+    for (auto& function : functions)
+        delete function.second;
+
+    // Copy parameter derivatives from the inner context.
+
+    ComputeContext& cc2 = getInnerComputeContext(innerContext);
+    for (auto& param : cc2.getEnergyParamDerivNames())
+        cc.addEnergyParameterDerivative(param);
+    
+    // Create arrays for storing information.
+    
+    cvForces.resize(numCVs);
+    for (int i = 0; i < numCVs; i++)
+        cvForces[i].initialize<long long>(cc, 3*cc.getPaddedNumAtoms(), "cvForce");
+    invAtomOrder.initialize<int>(cc, cc.getPaddedNumAtoms(), "invAtomOrder");
+    innerInvAtomOrder.initialize<int>(cc, cc.getPaddedNumAtoms(), "innerInvAtomOrder");
+    
+    // Create the kernels.
+    
+    stringstream args, add;
+    for (int i = 0; i < numCVs; i++) {
+        args << ", GLOBAL mm_long * RESTRICT force" << i << ", real dEdV" << i;
+        add << "forces[i] += (mm_long) (force" << i << "[i]*dEdV" << i << ");\n";
+    }
+    map<string, string> replacements;
+    replacements["PARAMETER_ARGUMENTS"] = args.str();
+    replacements["ADD_FORCES"] = add.str();
+    ComputeProgram program = cc.compileProgram(cc.replaceStrings(CommonKernelSources::customCVForce, replacements));
+    copyStateKernel = program->createKernel("copyState");
+    copyStateKernel->addArg(cc.getPosq());
+    copyStateKernel->addArg(cc2.getPosq());
+    if (cc.getUseMixedPrecision()) {
+        copyStateKernel->addArg(cc.getPosqCorrection());
+        copyStateKernel->addArg(cc2.getPosqCorrection());
+    }
+    copyStateKernel->addArg(cc.getVelm());
+    copyStateKernel->addArg(cc2.getVelm());
+    copyStateKernel->addArg(cc.getAtomIndexArray());
+    copyStateKernel->addArg(innerInvAtomOrder);
+    copyStateKernel->addArg(cc.getNumAtoms());
+    copyForcesKernel = program->createKernel("copyForces");
+    copyForcesKernel->addArg();
+    copyForcesKernel->addArg(invAtomOrder);
+    copyForcesKernel->addArg(cc2.getLongForceBuffer());
+    copyForcesKernel->addArg(cc2.getAtomIndexArray());
+    copyForcesKernel->addArg(cc.getNumAtoms());
+    copyForcesKernel->addArg(cc.getPaddedNumAtoms());
+    addForcesKernel = program->createKernel("addForces");
+    addForcesKernel->addArg(cc.getLongForceBuffer());
+    addForcesKernel->addArg(cc.getLongForceBuffer().getSize());
+    for (int i = 0; i < numCVs; i++) {
+        addForcesKernel->addArg();
+        addForcesKernel->addArg();
+    }
+
+    // This context needs to respect all forces in the inner context when reordering atoms.
+
+    for (auto* info : cc2.getForceInfos())
+        cc.addForce(new ForceInfo(*info));
+}
+
+double CommonCalcCustomCVForceKernel::execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy) {
+    copyState(context, innerContext);
+    int numCVs = variableNames.size();
+    int numAtoms = cc.getNumAtoms();
+    int paddedNumAtoms = cc.getPaddedNumAtoms();
+    vector<double> cvValues;
+    vector<map<string, double> > cvDerivs(numCVs);
+    for (int i = 0; i < numCVs; i++) {
+        cvValues.push_back(innerContext.calcForcesAndEnergy(true, true, 1<<i));
+        copyForcesKernel->setArg(0, cvForces[i]);
+        copyForcesKernel->execute(numAtoms);
+        innerContext.getEnergyParameterDerivatives(cvDerivs[i]);
+    }
+    
+    // Compute the energy and forces.
+    
+    map<string, double> variables;
+    for (auto& name : globalParameterNames)
+        variables[name] = context.getParameter(name);
+    for (int i = 0; i < numCVs; i++)
+        variables[variableNames[i]] = cvValues[i];
+    double energy = energyExpression.evaluate(variables);
+    for (int i = 0; i < numCVs; i++) {
+        double dEdV = variableDerivExpressions[i].evaluate(variables);
+        addForcesKernel->setArg(2*i+2, cvForces[i]);
+        if (cc.getUseDoublePrecision())
+            addForcesKernel->setArg(2*i+3, dEdV);
+        else
+            addForcesKernel->setArg(2*i+3, (float) dEdV);
+    }
+    addForcesKernel->execute(numAtoms);
+    
+    // Compute the energy parameter derivatives.
+    
+    map<string, double>& energyParamDerivs = cc.getEnergyParamDerivWorkspace();
+    for (int i = 0; i < paramDerivExpressions.size(); i++)
+        energyParamDerivs[paramDerivNames[i]] += paramDerivExpressions[i].evaluate(variables);
+    for (int i = 0; i < numCVs; i++) {
+        double dEdV = variableDerivExpressions[i].evaluate(variables);
+        for (auto& deriv : cvDerivs[i])
+            energyParamDerivs[deriv.first] += dEdV*deriv.second;
+    }
+    return energy;
+}
+
+void CommonCalcCustomCVForceKernel::copyState(ContextImpl& context, ContextImpl& innerContext) {
+    int numAtoms = cc.getNumAtoms();
+    ComputeContext& cc2 = getInnerComputeContext(innerContext);
+    if (!hasInitializedListeners) {
+        hasInitializedListeners = true;
+        
+        // Initialize the listeners.
+        
+        ReorderListener* listener1 = new ReorderListener(cc, invAtomOrder);
+        ReorderListener* listener2 = new ReorderListener(cc2, innerInvAtomOrder);
+        cc.addReorderListener(listener1);
+        cc2.addReorderListener(listener2);
+        listener1->execute();
+        listener2->execute();
+    }
+    copyStateKernel->execute(numAtoms);
+    Vec3 a, b, c;
+    context.getPeriodicBoxVectors(a, b, c);
+    innerContext.setPeriodicBoxVectors(a, b, c);
+    innerContext.setTime(context.getTime());
+    map<string, double> innerParameters = innerContext.getParameters();
+    for (auto& param : innerParameters)
+        innerContext.setParameter(param.first, context.getParameter(param.first));
+}
+
+void CommonCalcCustomCVForceKernel::copyParametersToContext(ContextImpl& context, const CustomCVForce& force) {
+    // Create custom functions for the tabulated functions.
+
+    map<string, CustomFunction*> functions;
+    for (int i = 0; i < (int) force.getNumTabulatedFunctions(); i++)
+        functions[force.getTabulatedFunctionName(i)] = createReferenceTabulatedFunction(force.getTabulatedFunction(i));
+
+    // Replace tabulated functions in the expressions.
+
+    replaceFunctionsInExpression(functions, energyExpression);
+    for (auto& expression : variableDerivExpressions)
+        replaceFunctionsInExpression(functions, expression);
+    for (auto& expression : paramDerivExpressions)
+        replaceFunctionsInExpression(functions, expression);
+
+    // Delete the custom functions.
+
+    for (auto& function : functions)
+        delete function.second;
+}
+
 void CommonIntegrateVerletStepKernel::initialize(const System& system, const VerletIntegrator& integrator) {
    cc.initializeContexts();
    cc.setAsCurrent();
@@ -7345,3 +7607,83 @@ void CommonApplyAndersenThermostatKernel::execute(ContextImpl& context) {
    kernel->setArg(6, cc.getIntegrationUtilities().prepareRandomNumbers(cc.getPaddedNumAtoms()));
    kernel->execute(cc.getNumAtoms());
 }
+
+void CommonApplyMonteCarloBarostatKernel::initialize(const System& system, const Force& thermostat) {
+    savedPositions.initialize(cc, cc.getPaddedNumAtoms(), cc.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4), "savedPositions");
+    savedLongForces.initialize<long long>(cc, cc.getPaddedNumAtoms()*3, "savedLongForces");
+    try {
+        cc.getFloatForceBuffer(); // This will throw an exception on the CUDA platform.
+        savedFloatForces.initialize(cc, cc.getPaddedNumAtoms(), cc.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4), "savedForces");
+    }
+    catch (...) {
+        // The CUDA platform doesn't have a floating point force buffer, so we don't need to copy it.
+    }
+    ComputeProgram program = cc.compileProgram(CommonKernelSources::monteCarloBarostat);
+    kernel = program->createKernel("scalePositions");
+}
+
+void CommonApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, double scaleX, double scaleY, double scaleZ) {
+    if (!hasInitializedKernels) {
+        hasInitializedKernels = true;
+
+        // Create the arrays with the molecule definitions.
+
+        vector<vector<int> > molecules = context.getMolecules();
+        numMolecules = molecules.size();
+        moleculeAtoms.initialize<int>(cc, cc.getNumAtoms(), "moleculeAtoms");
+        moleculeStartIndex.initialize<int>(cc, numMolecules+1, "moleculeStartIndex");
+        vector<int> atoms(moleculeAtoms.getSize());
+        vector<int> startIndex(moleculeStartIndex.getSize());
+        int index = 0;
+        for (int i = 0; i < numMolecules; i++) {
+            startIndex[i] = index;
+            for (int molecule : molecules[i])
+                atoms[index++] = molecule;
+        }
+        startIndex[numMolecules] = index;
+        moleculeAtoms.upload(atoms);
+        moleculeStartIndex.upload(startIndex);
+
+        // Initialize the kernel arguments.
+
+//KERNEL void scalePositions(float scaleX, float scaleY, float scaleZ, int numMolecules, real4 periodicBoxSize,
+//        real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, GLOBAL real4* RESTRICT posq,
+//        GLOBAL const int* RESTRICT moleculeAtoms, GLOBAL const int* RESTRICT moleculeStartIndex) {
+        
+        kernel->addArg();
+        kernel->addArg();
+        kernel->addArg();
+        kernel->addArg(numMolecules);
+        for (int i = 0; i < 5; i++)
+            kernel->addArg();
+        kernel->addArg(cc.getPosq());
+        kernel->addArg(moleculeAtoms);
+        kernel->addArg(moleculeStartIndex);
+    }
+    cc.getPosq().copyTo(savedPositions);
+    cc.getLongForceBuffer().copyTo(savedLongForces);
+    if (savedFloatForces.isInitialized())
+        cc.getFloatForceBuffer().copyTo(savedFloatForces);
+//    int bytesToCopy = cc.getPosq().getSize()*(cc.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
+//    cc.getQueue().enqueueCopyBuffer(cc.getPosq().getDeviceBuffer(), savedPositions.getDeviceBuffer(), 0, 0, bytesToCopy);
+//    cc.getQueue().enqueueCopyBuffer(cc.getForce().getDeviceBuffer(), savedForces.getDeviceBuffer(), 0, 0, bytesToCopy);
+    kernel->setArg(0, (float) scaleX);
+    kernel->setArg(1, (float) scaleY);
+    kernel->setArg(2, (float) scaleZ);
+    setPeriodicBoxArgs(cc, kernel, 4);
+    kernel->execute(cc.getNumAtoms());
+    for (auto& offset : cc.getPosCellOffsets())
+        offset = mm_int4(0, 0, 0, 0);
+    lastAtomOrder = cc.getAtomIndex();
+}
+
+void CommonApplyMonteCarloBarostatKernel::restoreCoordinates(ContextImpl& context) {
+    savedPositions.copyTo(cc.getPosq());
+    savedLongForces.copyTo(cc.getLongForceBuffer());
+    if (savedFloatForces.isInitialized())
+        savedFloatForces.copyTo(cc.getFloatForceBuffer());
+
+//    int bytesToCopy = cc.getPosq().getSize()*(cc.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
+//    cc.getQueue().enqueueCopyBuffer(savedPositions.getDeviceBuffer(), cc.getPosq().getDeviceBuffer(), 0, 0, bytesToCopy);
+//    cc.getQueue().enqueueCopyBuffer(savedForces.getDeviceBuffer(), cc.getForce().getDeviceBuffer(), 0, 0, bytesToCopy);
+}
--- a/platforms/cuda/src/kernels/constraints.cu
+++ b/platforms/cuda/src/kernels/constraints.cu
-extern "C" __global__ void applyPositionDeltas(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
-    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+KERNEL void applyPositionDeltas(int numAtoms, GLOBAL real4* RESTRICT posq, GLOBAL mixed4* RESTRICT posDelta
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL real4* RESTRICT posqCorrection
+#endif
+        ) {
+    for (unsigned int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
 #ifdef USE_MIXED_PRECISION
        real4 pos1 = posq[index];
        real4 pos2 = posqCorrection[index];

--- a/platforms/cuda/src/kernels/coulombLennardJones.cu
+++ b/platforms/cuda/src/kernels/coulombLennardJones.cu
--- a/platforms/cuda/src/kernels/customCVForce.cu
+++ b/platforms/cuda/src/kernels/customCVForce.cu
 /**
 * Copy the positions and velocities to the inner context.
 */
-extern "C" __global__ void copyState(real4* posq, real4* posqCorrection, mixed4* velm, int* __restrict__ atomOrder,
-        real4* innerPosq, real4* innerPosqCorrection, mixed4* innerVelm, int* __restrict__ innerInvAtomOrder,
-        int numAtoms) {
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x) {
+KERNEL void copyState(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT innerPosq,
+#ifdef USE_MIXED_PRECISION
+        GLOBAL real4* RESTRICT posqCorrection, GLOBAL real4* RESTRICT innerPosqCorrection,
+#endif
+        GLOBAL mixed4* RESTRICT velm, GLOBAL mixed4* RESTRICT innerVelm, GLOBAL int* RESTRICT atomOrder, GLOBAL int* RESTRICT innerInvAtomOrder, int numAtoms) {
+    for (int i = GLOBAL_ID; i < numAtoms; i += GLOBAL_SIZE) {
        int index = innerInvAtomOrder[atomOrder[i]];
        innerPosq[index] = posq[i];
        innerVelm[index] = velm[i];
@@ -17,9 +19,9 @@ extern "C" __global__ void copyState(real4* posq, real4* posqCorrection, mixed4*
 /**
 * Copy the forces back to the main context.
 */
-extern "C" __global__ void copyForces(long long* forces, int* __restrict__ invAtomOrder, long long* innerForces,
-        int* __restrict__ innerAtomOrder, int numAtoms, int paddedNumAtoms) {
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x) {
+KERNEL void copyForces(GLOBAL mm_long* RESTRICT forces, GLOBAL int* RESTRICT invAtomOrder, GLOBAL mm_long* RESTRICT innerForces,
+        GLOBAL int* RESTRICT innerAtomOrder, int numAtoms, int paddedNumAtoms) {
+    for (int i = GLOBAL_ID; i < numAtoms; i += GLOBAL_SIZE) {
        int index = invAtomOrder[innerAtomOrder[i]];
        forces[index] = innerForces[i];
        forces[index+paddedNumAtoms] = innerForces[i+paddedNumAtoms];
@@ -30,9 +32,9 @@ extern "C" __global__ void copyForces(long long* forces, int* __restrict__ invAt
 /**
 * Add all the forces from the CVs.
 */
-extern "C" __global__ void addForces(long long* forces, int bufferSize
+KERNEL void addForces(GLOBAL mm_long* RESTRICT forces, int bufferSize
    PARAMETER_ARGUMENTS) {
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < bufferSize; i += blockDim.x*gridDim.x) {
+    for (int i = GLOBAL_ID; i < bufferSize; i += GLOBAL_SIZE) {
        ADD_FORCES
    }
 }
--- a/platforms/cuda/src/kernels/ewald.cu
+++ b/platforms/cuda/src/kernels/ewald.cu
-__device__ real2 multofReal2(real2 a, real2 b) {
+DEVICE real2 multofReal2(real2 a, real2 b) {
    return make_real2(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
 }

@@ -6,17 +6,17 @@ __device__ real2 multofReal2(real2 a, real2 b) {
 * Precompute the cosine and sine sums which appear in each force term.
 */

-extern "C" __global__ void calculateEwaldCosSinSums(mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, real2* __restrict__ cosSinSum, real4 periodicBoxSize) {
+KERNEL void calculateEwaldCosSinSums(GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL real2* RESTRICT cosSinSum, real4 periodicBoxSize) {
    const unsigned int ksizex = 2*KMAX_X-1;
    const unsigned int ksizey = 2*KMAX_Y-1;
    const unsigned int ksizez = 2*KMAX_Z-1;
    const unsigned int totalK = ksizex*ksizey*ksizez;
    real3 reciprocalBoxSize = make_real3(2*M_PI/periodicBoxSize.x, 2*M_PI/periodicBoxSize.y, 2*M_PI/periodicBoxSize.z);
    real reciprocalCoefficient = ONE_4PI_EPS0*4*M_PI/(periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
-    unsigned int index = blockIdx.x*blockDim.x+threadIdx.x;
+    unsigned int index = GLOBAL_ID;
    mixed energy = 0;
    while (index < (KMAX_Y-1)*ksizez+KMAX_Z)
-        index += blockDim.x*gridDim.x;
+        index += GLOBAL_SIZE;
    while (index < totalK) {
        // Find the wave vector (kx, ky, kz) this index corresponds to.

@@ -49,9 +49,9 @@ extern "C" __global__ void calculateEwaldCosSinSums(mixed* __restrict__ energyBu
        real k2 = kx*kx + ky*ky + kz*kz;
        real ak = EXP(k2*EXP_COEFFICIENT) / k2;
        energy += reciprocalCoefficient*ak*(sum.x*sum.x + sum.y*sum.y);
-        index += blockDim.x*gridDim.x;
+        index += GLOBAL_SIZE;
    }
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+    energyBuffer[GLOBAL_ID] += energy;
 }

 /**
@@ -59,8 +59,8 @@ extern "C" __global__ void calculateEwaldCosSinSums(mixed* __restrict__ energyBu
 * previous routine.
 */

-extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__ forceBuffers, const real4* __restrict__ posq, const real2* __restrict__ cosSinSum, real4 periodicBoxSize) {
-    unsigned int atom = blockIdx.x*blockDim.x+threadIdx.x;
+KERNEL void calculateEwaldForces(GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL const real4* RESTRICT posq, GLOBAL const real2* RESTRICT cosSinSum, real4 periodicBoxSize) {
+    unsigned int atom = GLOBAL_ID;
    real3 reciprocalBoxSize = make_real3(2*M_PI/periodicBoxSize.x, 2*M_PI/periodicBoxSize.y, 2*M_PI/periodicBoxSize.z);
    real reciprocalCoefficient = ONE_4PI_EPS0*4*M_PI/(periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
    while (atom < NUM_ATOMS) {
@@ -102,9 +102,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__

        // Record the force on the atom.

-        atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
-        atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
-        atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
-        atom += blockDim.x*gridDim.x;
+        ATOMIC_ADD(&forceBuffers[atom], (mm_ulong) ((mm_long) (force.x*0x100000000)));
+        ATOMIC_ADD(&forceBuffers[atom+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
+        ATOMIC_ADD(&forceBuffers[atom+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
+        atom += GLOBAL_SIZE;
    }
 }
--- a/platforms/cuda/src/kernels/monteCarloBarostat.cu
+++ b/platforms/cuda/src/kernels/monteCarloBarostat.cu
 /**
- * Scale the particle positions with each axis independent
+ * Scale the particle positions with each axis independent.
 */

-extern "C" __global__ void scalePositions(float scaleX, float scaleY, float scaleZ, int numMolecules, real4 periodicBoxSize,
-        real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4* __restrict__ posq,
-        const int* __restrict__ moleculeAtoms, const int* __restrict__ moleculeStartIndex) {
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numMolecules; index += blockDim.x*gridDim.x) {
+KERNEL void scalePositions(float scaleX, float scaleY, float scaleZ, int numMolecules, real4 periodicBoxSize,
+        real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, GLOBAL real4* RESTRICT posq,
+        GLOBAL const int* RESTRICT moleculeAtoms, GLOBAL const int* RESTRICT moleculeStartIndex) {
+    for (int index = GLOBAL_ID; index < numMolecules; index += GLOBAL_SIZE) {
        int first = moleculeStartIndex[index];
        int last = moleculeStartIndex[index+1];
        int numAtoms = last-first;
@@ -19,7 +19,7 @@ extern "C" __global__ void scalePositions(float scaleX, float scaleY, float scal
            center.y += pos.y;
            center.z += pos.z;
        }
-        real invNumAtoms = RECIP(numAtoms);
+        real invNumAtoms = RECIP((real) numAtoms);
        center.x *= invNumAtoms;
        center.y *= invNumAtoms;
        center.z *= invNumAtoms;

--- a/platforms/cuda/src/kernels/nonbondedExceptions.cu
+++ b/platforms/cuda/src/kernels/nonbondedExceptions.cu
--- a/platforms/cuda/src/kernels/nonbondedParameters.cu
+++ b/platforms/cuda/src/kernels/nonbondedParameters.cu
 /**
 * Compute the nonbonded parameters for particles and exceptions.
 */
-extern "C" __global__ void computeParameters(mixed* __restrict__ energyBuffer, bool includeSelfEnergy, real* __restrict__ globalParams,
-        int numAtoms, const float4* __restrict__ baseParticleParams, real4* __restrict__ posq, real* __restrict__ charge,
-        float2* __restrict__ sigmaEpsilon, float4* __restrict__ particleParamOffsets, int* __restrict__ particleOffsetIndices
+KERNEL void computeParameters(GLOBAL mixed* RESTRICT energyBuffer, int includeSelfEnergy, GLOBAL real* RESTRICT globalParams,
+        int numAtoms, GLOBAL const float4* RESTRICT baseParticleParams, GLOBAL real4* RESTRICT posq, GLOBAL real* RESTRICT charge,
+        GLOBAL float2* RESTRICT sigmaEpsilon, GLOBAL float4* RESTRICT particleParamOffsets, GLOBAL int* RESTRICT particleOffsetIndices
 #ifdef HAS_EXCEPTIONS
-        , int numExceptions, const float4* __restrict__ baseExceptionParams, float4* __restrict__ exceptionParams,
-        float4* __restrict__ exceptionParamOffsets, int* __restrict__ exceptionOffsetIndices
+        , int numExceptions, GLOBAL const float4* RESTRICT baseExceptionParams, GLOBAL float4* RESTRICT exceptionParams,
+        GLOBAL float4* RESTRICT exceptionParamOffsets, GLOBAL int* RESTRICT exceptionOffsetIndices
 #endif
        ) {
    mixed energy = 0;

    // Compute particle parameters.
    
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x) {
+    for (int i = GLOBAL_ID; i < numAtoms; i += GLOBAL_SIZE) {
        float4 params = baseParticleParams[i];
 #ifdef HAS_OFFSETS
        int start = particleOffsetIndices[i], end = particleOffsetIndices[i+1];
@@ -45,7 +45,7 @@ extern "C" __global__ void computeParameters(mixed* __restrict__ energyBuffer, b
    // Compute exception parameters.
    
 #ifdef HAS_EXCEPTIONS
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numExceptions; i += blockDim.x*gridDim.x) {
+    for (int i = GLOBAL_ID; i < numExceptions; i += GLOBAL_SIZE) {
        float4 params = baseExceptionParams[i];
 #ifdef HAS_OFFSETS
        int start = exceptionOffsetIndices[i], end = exceptionOffsetIndices[i+1];
@@ -61,15 +61,15 @@ extern "C" __global__ void computeParameters(mixed* __restrict__ energyBuffer, b
    }
 #endif
    if (includeSelfEnergy)
-        energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+        energyBuffer[GLOBAL_ID] += energy;
 }

 /**
 * Compute parameters for subtracting the reciprocal part of excluded interactions.
 */
-extern "C" __global__ void computeExclusionParameters(real4* __restrict__ posq, real* __restrict__ charge, float2* __restrict__ sigmaEpsilon,
-        int numExclusions, const int2* __restrict__ exclusionAtoms, float4* __restrict__ exclusionParams) {
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numExclusions; i += blockDim.x*gridDim.x) {
+KERNEL void computeExclusionParameters(GLOBAL real4* RESTRICT posq, GLOBAL real* RESTRICT charge, GLOBAL float2* RESTRICT sigmaEpsilon,
+        int numExclusions, GLOBAL const int2* RESTRICT exclusionAtoms, GLOBAL float4* RESTRICT exclusionParams) {
+    for (int i = GLOBAL_ID; i < numExclusions; i += GLOBAL_SIZE) {
        int2 atoms = exclusionAtoms[i];
 #ifdef USE_POSQ_CHARGES
        real chargeProd = posq[atoms.x].w*posq[atoms.y].w;

--- a/platforms/cuda/src/kernels/pmeExclusions.cu
+++ b/platforms/cuda/src/kernels/pmeExclusions.cu
@@ -36,3 +36,4 @@ if (r > 0)
    delta *= tempForce*invR*invR;
 real3 force1 = -delta;
 real3 force2 = delta;
+
--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -196,6 +196,12 @@ public:
    CudaArray& getForce() {
        return force;
    }
+    /**
+     * The CUDA platform does not use floating point force buffers, so this throws an exception.
+     */
+    ArrayInterface& getFloatForceBuffer() {
+        throw OpenMMException("CUDA platform does not use floating point force buffers");
+    }
    /**
     * Get the array which contains a contribution to each force represented as 64 bit fixed point.
     * This is a synonym for getForce().  It exists to satisfy the ComputeContext interface.
@@ -205,7 +211,6 @@ public:
    }
    /**
     * All CUDA devices support 64 bit atomics, so this throws an exception.
-     * @return 
     */
    ArrayInterface& getForceBuffers() {
        throw OpenMMException("CUDA platform does not use floating point force buffers");

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -31,14 +31,10 @@
 #include "CudaArray.h"
 #include "CudaContext.h"
 #include "CudaFFT3D.h"
-#include "CudaParameterSet.h"
 #include "CudaSort.h"
 #include "openmm/kernels.h"
 #include "openmm/System.h"
-#include "openmm/internal/CompiledExpressionSet.h"
-#include "openmm/internal/CustomIntegratorUtilities.h"
-#include "lepton/CompiledExpression.h"
-#include "lepton/ExpressionProgram.h"
+#include "openmm/common/CommonKernels.h"
 #include <cufft.h>

 namespace OpenMM {
@@ -206,63 +202,6 @@ private:
    CudaContext& cu;
 };

-/**
- * This kernel modifies the positions of particles to enforce distance constraints.
- */
-class CudaApplyConstraintsKernel : public ApplyConstraintsKernel {
-public:
-    CudaApplyConstraintsKernel(std::string name, const Platform& platform, CudaContext& cu) : ApplyConstraintsKernel(name, platform),
-            cu(cu), hasInitializedKernel(false) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     */
-    void initialize(const System& system);
-    /**
-     * Update particle positions to enforce constraints.
-     *
-     * @param context    the context in which to execute this kernel
-     * @param tol        the distance tolerance within which constraints must be satisfied.
-     */
-    void apply(ContextImpl& context, double tol);
-    /**
-     * Update particle velocities to enforce constraints.
-     *
-     * @param context    the context in which to execute this kernel
-     * @param tol        the velocity tolerance within which constraints must be satisfied.
-     */
-    void applyToVelocities(ContextImpl& context, double tol);
-private:
-    CudaContext& cu;
-    bool hasInitializedKernel;
-    CUfunction applyDeltasKernel;
-};
-
-/**
- * This kernel recomputes the positions of virtual sites.
- */
-class CudaVirtualSitesKernel : public VirtualSitesKernel {
-public:
-    CudaVirtualSitesKernel(std::string name, const Platform& platform, CudaContext& cu) : VirtualSitesKernel(name, platform), cu(cu) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     */
-    void initialize(const System& system);
-    /**
-     * Compute the virtual site locations.
-     *
-     * @param context    the context in which to execute this kernel
-     */
-    void computePositions(ContextImpl& context);
-private:
-    CudaContext& cu;
-};
-
 /**
 * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
 */
@@ -399,103 +338,13 @@ private:
 /**
 * This kernel is invoked by CustomCVForce to calculate the forces acting on the system and the energy of the system.
 */
-class CudaCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
+class CudaCalcCustomCVForceKernel : public CommonCalcCustomCVForceKernel {
 public:
-    CudaCalcCustomCVForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcCustomCVForceKernel(name, platform),
-            cu(cu), hasInitializedListeners(false) {
+    CudaCalcCustomCVForceKernel(std::string name, const Platform& platform, ComputeContext& cc) : CommonCalcCustomCVForceKernel(name, platform, cc) {
    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomCVForce this kernel will be used for
-     * @param innerContext   the context created by the CustomCVForce for computing collective variables
-     */
-    void initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param innerContext   the context created by the CustomCVForce for computing collective variables
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy);
-    /**
-     * Copy state information to the inner context.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param innerContext   the context created by the CustomCVForce for computing collective variables
-     */
-    void copyState(ContextImpl& context, ContextImpl& innerContext);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomCVForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomCVForce& force);
-private:
-    class ForceInfo;
-    class ReorderListener;
-    CudaContext& cu;
-    bool hasInitializedListeners;
-    Lepton::ExpressionProgram energyExpression;
-    std::vector<std::string> variableNames, paramDerivNames, globalParameterNames;
-    std::vector<Lepton::ExpressionProgram> variableDerivExpressions;
-    std::vector<Lepton::ExpressionProgram> paramDerivExpressions;
-    std::vector<CudaArray> cvForces;
-    CudaArray invAtomOrder;
-    CudaArray innerInvAtomOrder;
-    CUfunction copyStateKernel, copyForcesKernel, addForcesKernel;
-};
-
-/**
- * This kernel is invoked by MonteCarloBarostat to adjust the periodic box volume
- */
-class CudaApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
-public:
-    CudaApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, CudaContext& cu) : ApplyMonteCarloBarostatKernel(name, platform), cu(cu),
-            hasInitializedKernels(false) {
+    ComputeContext& getInnerComputeContext(ContextImpl& innerContext) {
+        return *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param barostat   the MonteCarloBarostat this kernel will be used for
-     */
-    void initialize(const System& system, const Force& barostat);
-    /**
-     * Attempt a Monte Carlo step, scaling particle positions (or cluster centers) by a specified value.
-     * This version scales the x, y, and z positions independently.
-     * This is called BEFORE the periodic box size is modified.  It should begin by translating each particle
-     * or cluster into the first periodic box, so that coordinates will still be correct after the box size
-     * is changed.
-     *
-     * @param context    the context in which to execute this kernel
-     * @param scaleX     the scale factor by which to multiply particle x-coordinate
-     * @param scaleY     the scale factor by which to multiply particle y-coordinate
-     * @param scaleZ     the scale factor by which to multiply particle z-coordinate
-     */
-    void scaleCoordinates(ContextImpl& context, double scaleX, double scaleY, double scaleZ);
-    /**
-     * Reject the most recent Monte Carlo step, restoring the particle positions to where they were before
-     * scaleCoordinates() was last called.
-     *
-     * @param context    the context in which to execute this kernel
-     */
-    void restoreCoordinates(ContextImpl& context);
-private:
-    CudaContext& cu;
-    bool hasInitializedKernels;
-    int numMolecules;
-    CudaArray savedPositions;
-    CudaArray savedForces;
-    CudaArray moleculeAtoms;
-    CudaArray moleculeStartIndex;
-    CUfunction kernel;
-    std::vector<int> lastAtomOrder;
 };

 } // namespace OpenMM

--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -74,9 +74,9 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
    if (name == UpdateStateDataKernel::Name())
        return new CudaUpdateStateDataKernel(name, platform, cu);
    if (name == ApplyConstraintsKernel::Name())
-        return new CudaApplyConstraintsKernel(name, platform, cu);
+        return new CommonApplyConstraintsKernel(name, platform, cu);
    if (name == VirtualSitesKernel::Name())
-        return new CudaVirtualSitesKernel(name, platform, cu);
+        return new CommonVirtualSitesKernel(name, platform, cu);
    if (name == CalcHarmonicBondForceKernel::Name())
        return new CommonCalcHarmonicBondForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomBondForceKernel::Name())
@@ -136,7 +136,7 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
    if (name == IntegrateNoseHooverStepKernel::Name())
        return new CommonIntegrateNoseHooverStepKernel(name, platform, cu);
    if (name == ApplyMonteCarloBarostatKernel::Name())
-        return new CudaApplyMonteCarloBarostatKernel(name, platform, cu);
+        return new CommonApplyMonteCarloBarostatKernel(name, platform, cu);
    if (name == RemoveCMMotionKernel::Name())
        return new CommonRemoveCMMotionKernel(name, platform, cu);
    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -27,23 +27,14 @@
 #include "CudaKernels.h"
 #include "CudaForceInfo.h"
 #include "openmm/Context.h"
-#include "openmm/internal/AndersenThermostatImpl.h"
 #include "openmm/internal/ContextImpl.h"
-#include "openmm/internal/CustomCompoundBondForceImpl.h"
-#include "openmm/internal/CustomHbondForceImpl.h"
 #include "openmm/internal/NonbondedForceImpl.h"
-#include "openmm/internal/OSRngSeed.h"
+#include "CommonKernelSources.h"
 #include "CudaBondedUtilities.h"
 #include "CudaExpressionUtilities.h"
 #include "CudaIntegrationUtilities.h"
 #include "CudaNonbondedUtilities.h"
 #include "CudaKernelSources.h"
-#include "lepton/CustomFunction.h"
-#include "lepton/ExpressionTreeNode.h"
-#include "lepton/Operation.h"
-#include "lepton/Parser.h"
-#include "lepton/ParsedExpression.h"
-#include "ReferenceTabulatedFunction.h"
 #include "SimTKOpenMMRealType.h"
 #include "SimTKOpenMMUtilities.h"
 #include <algorithm>
@@ -54,7 +45,6 @@

 using namespace OpenMM;
 using namespace std;
-using namespace Lepton;

 #define CHECK_RESULT(result, prefix) \
    if (result != CUDA_SUCCESS) { \
@@ -63,40 +53,6 @@ using namespace Lepton;
        throw OpenMMException(m.str());\
    }

-static bool isZeroExpression(const Lepton::ParsedExpression& expression) {
-    const Lepton::Operation& op = expression.getRootNode().getOperation();
-    if (op.getId() != Lepton::Operation::CONSTANT)
-        return false;
-    return (dynamic_cast<const Lepton::Operation::Constant&>(op).getValue() == 0.0);
-}
-
-static bool usesVariable(const Lepton::ExpressionTreeNode& node, const string& variable) {
-    const Lepton::Operation& op = node.getOperation();
-    if (op.getId() == Lepton::Operation::VARIABLE && op.getName() == variable)
-        return true;
-    for (auto& child : node.getChildren())
-        if (usesVariable(child, variable))
-            return true;
-    return false;
-}
-
-static bool usesVariable(const Lepton::ParsedExpression& expression, const string& variable) {
-    return usesVariable(expression.getRootNode(), variable);
-}
-
-static pair<ExpressionTreeNode, string> makeVariable(const string& name, const string& value) {
-    return make_pair(ExpressionTreeNode(new Operation::Variable(name)), value);
-}
-
-static void replaceFunctionsInExpression(map<string, CustomFunction*>& functions, ExpressionProgram& expression) {
-    for (int i = 0; i < expression.getNumOperations(); i++) {
-        if (expression.getOperation(i).getId() == Operation::CUSTOM) {
-            const Operation::Custom& op = dynamic_cast<const Operation::Custom&>(expression.getOperation(i));
-            expression.setOperation(i, new Operation::Custom(op.getName(), functions[op.getName()]->clone(), op.getDerivOrder()));
-        }
-    }
-}
-
 void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {
 }

@@ -453,38 +409,6 @@ void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& st
        listener->execute();
 }

-void CudaApplyConstraintsKernel::initialize(const System& system) {
-}
-
-void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
-    cu.setAsCurrent();
-    if (!hasInitializedKernel) {
-        hasInitializedKernel = true;
-        map<string, string> defines;
-        CUmodule module = cu.createModule(CudaKernelSources::constraints, defines);
-        applyDeltasKernel = cu.getKernel(module, "applyPositionDeltas");
-    }
-    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
-    cu.clearBuffer(integration.getPosDelta());
-    integration.applyConstraints(tol);
-    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
-    int numAtoms = cu.getNumAtoms();
-    void* args[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &cu.getIntegrationUtilities().getPosDelta().getDevicePointer()};
-    cu.executeKernel(applyDeltasKernel, args, cu.getNumAtoms());
-    integration.computeVirtualSites();
-}
-
-void CudaApplyConstraintsKernel::applyToVelocities(ContextImpl& context, double tol) {
-    cu.getIntegrationUtilities().applyVelocityConstraints(tol);
-}
-
-void CudaVirtualSitesKernel::initialize(const System& system) {
-}
-
-void CudaVirtualSitesKernel::computePositions(ContextImpl& context) {
-    cu.getIntegrationUtilities().computeVirtualSites();
-}
-
 class CudaCalcNonbondedForceKernel::ForceInfo : public CudaForceInfo {
 public:
    ForceInfo(const NonbondedForce& force) : force(force) {
@@ -760,7 +684,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
            replacements["EXP_COEFFICIENT"] = cu.doubleToString(-1.0/(4.0*alpha*alpha));
            replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
            replacements["M_PI"] = cu.doubleToString(M_PI);
-            CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::ewald, replacements);
+            CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CommonKernelSources::ewald, replacements);
            ewaldSumsKernel = cu.getKernel(module, "calculateEwaldCosSinSums");
            ewaldForcesKernel = cu.getKernel(module, "calculateEwaldForces");
            int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
@@ -1055,13 +979,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
            replacements["USE_PERIODIC"] = force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0";
            if (doLJPME)
                replacements["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
-            cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::pmeExclusions, replacements), force.getForceGroup());
+            cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CommonKernelSources::pmeExclusions, replacements), force.getForceGroup());
        }
    }

    // Add the interaction to the default nonbonded kernel.

-    string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
+    string source = cu.replaceStrings(CommonKernelSources::coulombLennardJones, defines);
    charges.initialize(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "charges");
    baseParticleParams.initialize<float4>(cu, cu.getPaddedNumAtoms(), "baseParticleParams");
    baseParticleParams.upload(baseParticleParamVec);
@@ -1109,7 +1033,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        map<string, string> replacements;
        replacements["APPLY_PERIODIC"] = (usePeriodic && force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0");
        replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exceptionParams.getDevicePointer(), "float4");
-        cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
+        cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CommonKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
    }
    
    // Initialize parameter offsets.
@@ -1181,7 +1105,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    
    // Initialize the kernel for updating parameters.
    
-    CUmodule module = cu.createModule(CudaKernelSources::nonbondedParameters, paramsDefines);
+    CUmodule module = cu.createModule(CommonKernelSources::nonbondedParameters, paramsDefines);
    computeParamsKernel = cu.getKernel(module, "computeParameters");
    computeExclusionParamsKernel = cu.getKernel(module, "computeExclusionParameters");
    info = new ForceInfo(force);
@@ -1522,288 +1446,3 @@ void CudaCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, in
        nz = dispersionGridSizeZ;
    }
 }
-
-class CudaCalcCustomCVForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(ComputeForceInfo& force) : force(force) {
-    }
-    bool areParticlesIdentical(int particle1, int particle2) {
-        return force.areParticlesIdentical(particle1, particle2);
-    }
-    int getNumParticleGroups() {
-        return force.getNumParticleGroups();
-    }
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        force.getParticlesInGroup(index, particles);
-    }
-    bool areGroupsIdentical(int group1, int group2) {
-        return force.areGroupsIdentical(group1, group2);
-    }
-private:
-    ComputeForceInfo& force;
-};
-
-class CudaCalcCustomCVForceKernel::ReorderListener : public CudaContext::ReorderListener {
-public:
-    ReorderListener(CudaContext& cu, CudaArray& invAtomOrder) : cu(cu), invAtomOrder(invAtomOrder) {
-    }
-    void execute() {
-        vector<int> invOrder(cu.getPaddedNumAtoms());
-        const vector<int>& order = cu.getAtomIndex();
-        for (int i = 0; i < order.size(); i++)
-            invOrder[order[i]] = i;
-        invAtomOrder.upload(invOrder);
-    }
-private:
-    CudaContext& cu;
-    CudaArray& invAtomOrder;
-};
-
-void CudaCalcCustomCVForceKernel::initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext) {
-    int numCVs = force.getNumCollectiveVariables();
-    for (int i = 0; i < force.getNumGlobalParameters(); i++)
-        globalParameterNames.push_back(force.getGlobalParameterName(i));
-    for (int i = 0; i < numCVs; i++)
-        variableNames.push_back(force.getCollectiveVariableName(i));
-    for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
-        string name = force.getEnergyParameterDerivativeName(i);
-        paramDerivNames.push_back(name);
-        cu.addEnergyParameterDerivative(name);
-    }
-
-    // Create custom functions for the tabulated functions.
-
-    map<string, Lepton::CustomFunction*> functions;
-    for (int i = 0; i < (int) force.getNumTabulatedFunctions(); i++)
-        functions[force.getTabulatedFunctionName(i)] = createReferenceTabulatedFunction(force.getTabulatedFunction(i));
-
-    // Create the expressions.
-
-    Lepton::ParsedExpression energyExpr = Lepton::Parser::parse(force.getEnergyFunction(), functions);
-    energyExpression = energyExpr.createProgram();
-    variableDerivExpressions.clear();
-    for (auto& name : variableNames)
-        variableDerivExpressions.push_back(energyExpr.differentiate(name).optimize().createProgram());
-    paramDerivExpressions.clear();
-    for (auto& name : paramDerivNames)
-        paramDerivExpressions.push_back(energyExpr.differentiate(name).optimize().createProgram());
-
-    // Delete the custom functions.
-
-    for (auto& function : functions)
-        delete function.second;
-
-    // Copy parameter derivatives from the inner context.
-
-    CudaContext& cu2 = *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
-    for (auto& param : cu2.getEnergyParamDerivNames())
-        cu.addEnergyParameterDerivative(param);
-    
-    // Create arrays for storing information.
-    
-    cvForces.resize(numCVs);
-    for (int i = 0; i < numCVs; i++)
-        cvForces[i].initialize<long long>(cu, 3*cu.getPaddedNumAtoms(), "cvForce");
-    invAtomOrder.initialize<int>(cu, cu.getPaddedNumAtoms(), "invAtomOrder");
-    innerInvAtomOrder.initialize<int>(cu, cu.getPaddedNumAtoms(), "innerInvAtomOrder");
-    
-    // Create the kernels.
-    
-    stringstream args, add;
-    for (int i = 0; i < numCVs; i++) {
-        args << ", long long* __restrict__ force" << i << ", real dEdV" << i;
-        add << "forces[i] += (long long) (force" << i << "[i]*dEdV" << i << ");\n";
-    }
-    map<string, string> replacements;
-    replacements["PARAMETER_ARGUMENTS"] = args.str();
-    replacements["ADD_FORCES"] = add.str();
-    CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customCVForce, replacements));
-    copyStateKernel = cu.getKernel(module, "copyState");
-    copyForcesKernel = cu.getKernel(module, "copyForces");
-    addForcesKernel = cu.getKernel(module, "addForces");
-
-    // This context needs to respect all forces in the inner context when reordering atoms.
-
-    for (auto* info : cu2.getForceInfos())
-        cu.addForce(new ForceInfo(*info));
-}
-
-double CudaCalcCustomCVForceKernel::execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy) {
-    copyState(context, innerContext);
-    int numCVs = variableNames.size();
-    int numAtoms = cu.getNumAtoms();
-    int paddedNumAtoms = cu.getPaddedNumAtoms();
-    CudaContext& cu2 = *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
-    vector<double> cvValues;
-    vector<map<string, double> > cvDerivs(numCVs);
-    void* copyForcesArgs[] = {NULL, &invAtomOrder.getDevicePointer(), &cu2.getForce().getDevicePointer(), &cu2.getAtomIndexArray().getDevicePointer(), &numAtoms, &paddedNumAtoms};
-    for (int i = 0; i < numCVs; i++) {
-        cvValues.push_back(innerContext.calcForcesAndEnergy(true, true, 1<<i));
-        copyForcesArgs[0] = &cvForces[i].getDevicePointer();
-        cu.executeKernel(copyForcesKernel, copyForcesArgs, numAtoms);
-        innerContext.getEnergyParameterDerivatives(cvDerivs[i]);
-    }
-    
-    // Compute the energy and forces.
-    
-    map<string, double> variables;
-    for (auto& name : globalParameterNames)
-        variables[name] = context.getParameter(name);
-    for (int i = 0; i < numCVs; i++)
-        variables[variableNames[i]] = cvValues[i];
-    double energy = energyExpression.evaluate(variables);
-    int bufferSize = cu.getForce().getSize();
-    vector<void*> addForcesArgs;
-    addForcesArgs.push_back(&cu.getForce().getDevicePointer());
-    addForcesArgs.push_back(&bufferSize);
-    vector<double> dEdV(numCVs);
-    vector<float> dEdVFloat(numCVs);
-    for (int i = 0; i < numCVs; i++) {
-        dEdV[i] = variableDerivExpressions[i].evaluate(variables);
-        dEdVFloat[i] = (float) dEdV[i];
-        addForcesArgs.push_back(&cvForces[i].getDevicePointer());
-        if (cu.getUseDoublePrecision())
-            addForcesArgs.push_back(&dEdV[i]);
-        else
-            addForcesArgs.push_back(&dEdVFloat[i]);
-    }
-    cu.executeKernel(addForcesKernel, &addForcesArgs[0], numAtoms);
-    
-    // Compute the energy parameter derivatives.
-    
-    map<string, double>& energyParamDerivs = cu.getEnergyParamDerivWorkspace();
-    for (int i = 0; i < paramDerivExpressions.size(); i++)
-        energyParamDerivs[paramDerivNames[i]] += paramDerivExpressions[i].evaluate(variables);
-    for (int i = 0; i < numCVs; i++) {
-        double dEdV = variableDerivExpressions[i].evaluate(variables);
-        for (auto& deriv : cvDerivs[i])
-            energyParamDerivs[deriv.first] += dEdV*deriv.second;
-    }
-    return energy;
-}
-
-void CudaCalcCustomCVForceKernel::copyState(ContextImpl& context, ContextImpl& innerContext) {
-    int numAtoms = cu.getNumAtoms();
-    CudaContext& cu2 = *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
-    if (!hasInitializedListeners) {
-        hasInitializedListeners = true;
-        
-        // Initialize the listeners.
-        
-        ReorderListener* listener1 = new ReorderListener(cu, invAtomOrder);
-        ReorderListener* listener2 = new ReorderListener(cu2, innerInvAtomOrder);
-        cu.addReorderListener(listener1);
-        cu2.addReorderListener(listener2);
-        listener1->execute();
-        listener2->execute();
-    }
-    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
-    CUdeviceptr posCorrection2 = (cu2.getUseMixedPrecision() ? cu2.getPosqCorrection().getDevicePointer() : 0);
-    void* copyStateArgs[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(),
-        &cu2.getPosq().getDevicePointer(), &posCorrection2,& cu2.getVelm().getDevicePointer(), &innerInvAtomOrder.getDevicePointer(), &numAtoms};
-    cu.executeKernel(copyStateKernel, copyStateArgs, numAtoms);
-    Vec3 a, b, c;
-    context.getPeriodicBoxVectors(a, b, c);
-    innerContext.setPeriodicBoxVectors(a, b, c);
-    innerContext.setTime(context.getTime());
-    map<string, double> innerParameters = innerContext.getParameters();
-    for (auto& param : innerParameters)
-        innerContext.setParameter(param.first, context.getParameter(param.first));
-}
-
-void CudaCalcCustomCVForceKernel::copyParametersToContext(ContextImpl& context, const CustomCVForce& force) {
-    // Create custom functions for the tabulated functions.
-
-    map<string, CustomFunction*> functions;
-    for (int i = 0; i < (int) force.getNumTabulatedFunctions(); i++)
-        functions[force.getTabulatedFunctionName(i)] = createReferenceTabulatedFunction(force.getTabulatedFunction(i));
-
-    // Replace tabulated functions in the expressions.
-
-    replaceFunctionsInExpression(functions, energyExpression);
-    for (auto& expression : variableDerivExpressions)
-        replaceFunctionsInExpression(functions, expression);
-    for (auto& expression : paramDerivExpressions)
-        replaceFunctionsInExpression(functions, expression);
-
-    // Delete the custom functions.
-
-    for (auto& function : functions)
-        delete function.second;
-}
-
-void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const Force& thermostat) {
-    cu.setAsCurrent();
-    savedPositions.initialize(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4), "savedPositions");
-    savedForces.initialize<long long>(cu, cu.getPaddedNumAtoms()*3, "savedForces");
-    CUmodule module = cu.createModule(CudaKernelSources::monteCarloBarostat);
-    kernel = cu.getKernel(module, "scalePositions");
-}
-
-void CudaApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, double scaleX, double scaleY, double scaleZ) {
-    cu.setAsCurrent();
-    if (!hasInitializedKernels) {
-        hasInitializedKernels = true;
-
-        // Create the arrays with the molecule definitions.
-
-        vector<vector<int> > molecules = context.getMolecules();
-        numMolecules = molecules.size();
-        moleculeAtoms.initialize<int>(cu, cu.getNumAtoms(), "moleculeAtoms");
-        moleculeStartIndex.initialize<int>(cu, numMolecules+1, "moleculeStartIndex");
-        vector<int> atoms(moleculeAtoms.getSize());
-        vector<int> startIndex(moleculeStartIndex.getSize());
-        int index = 0;
-        for (int i = 0; i < numMolecules; i++) {
-            startIndex[i] = index;
-            for (int molecule : molecules[i])
-                atoms[index++] = molecule;
-        }
-        startIndex[numMolecules] = index;
-        moleculeAtoms.upload(atoms);
-        moleculeStartIndex.upload(startIndex);
-
-        // Initialize the kernel arguments.
-        
-    }
-    int bytesToCopy = cu.getPosq().getSize()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
-    CUresult result = cuMemcpyDtoD(savedPositions.getDevicePointer(), cu.getPosq().getDevicePointer(), bytesToCopy);
-    if (result != CUDA_SUCCESS) {
-        std::stringstream m;
-        m<<"Error saving positions for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
-        throw OpenMMException(m.str());
-    }
-    result = cuMemcpyDtoD(savedForces.getDevicePointer(), cu.getForce().getDevicePointer(), savedForces.getSize()*savedForces.getElementSize());
-    if (result != CUDA_SUCCESS) {
-        std::stringstream m;
-        m<<"Error saving forces for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
-        throw OpenMMException(m.str());
-    }
-    float scalefX = (float) scaleX;
-    float scalefY = (float) scaleY;
-    float scalefZ = (float) scaleZ;
-    void* args[] = {&scalefX, &scalefY, &scalefZ, &numMolecules, cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(),
-                    cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
-                    &cu.getPosq().getDevicePointer(), &moleculeAtoms.getDevicePointer(), &moleculeStartIndex.getDevicePointer()};
-    cu.executeKernel(kernel, args, cu.getNumAtoms());
-    for (auto& offset : cu.getPosCellOffsets())
-        offset = mm_int4(0, 0, 0, 0);
-    lastAtomOrder = cu.getAtomIndex();
-}
-
-void CudaApplyMonteCarloBarostatKernel::restoreCoordinates(ContextImpl& context) {
-    cu.setAsCurrent();
-    int bytesToCopy = cu.getPosq().getSize()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
-    CUresult result = cuMemcpyDtoD(cu.getPosq().getDevicePointer(), savedPositions.getDevicePointer(), bytesToCopy);
-    if (result != CUDA_SUCCESS) {
-        std::stringstream m;
-        m<<"Error restoring positions for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
-        throw OpenMMException(m.str());
-    }
-    result = cuMemcpyDtoD(cu.getForce().getDevicePointer(), savedForces.getDevicePointer(), savedForces.getSize()*savedForces.getElementSize());
-    if (result != CUDA_SUCCESS) {
-        std::stringstream m;
-        m<<"Error restoring forces for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
-        throw OpenMMException(m.str());
-    }
-}
--- a/platforms/cuda/src/kernels/angleForce.cu
+++ b/platforms/cuda/src/kernels/angleForce.cu
-real3 v0 = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
-real3 v1 = make_real3(pos2.x-pos3.x, pos2.y-pos3.y, pos2.z-pos3.z);
-#if APPLY_PERIODIC
-APPLY_PERIODIC_TO_DELTA(v0)
-APPLY_PERIODIC_TO_DELTA(v1)
-#endif
-real3 cp = cross(v0, v1);
-real rp = cp.x*cp.x + cp.y*cp.y + cp.z*cp.z;
-rp = max(SQRT(rp), (real) 1.0e-06f);
-real r21 = v0.x*v0.x + v0.y*v0.y + v0.z*v0.z;
-real r23 = v1.x*v1.x + v1.y*v1.y + v1.z*v1.z;
-real dot = v0.x*v1.x + v0.y*v1.y + v0.z*v1.z;
-real cosine = min(max(dot*RSQRT(r21*r23), (real) -1), (real) 1);
-real theta = ACOS(cosine);
-COMPUTE_FORCE
-real3 force1 = cross(v0, cp)*(dEdAngle/(r21*rp));
-real3 force3 = cross(cp, v1)*(dEdAngle/(r23*rp));
-real3 force2 = -force1-force3;
--- a/platforms/cuda/src/kernels/bondForce.cu
+++ b/platforms/cuda/src/kernels/bondForce.cu
-real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
-#if APPLY_PERIODIC
-APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-real r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
-COMPUTE_FORCE
-dEdR = (r > 0) ? (dEdR / r) : 0;
-delta *= dEdR;
-real3 force1 = delta;
-real3 force2 = -delta;
--- a/platforms/cuda/src/kernels/torsionForce.cu
+++ b/platforms/cuda/src/kernels/torsionForce.cu
-const real PI = (real) 3.14159265358979323846;
-real3 v0 = make_real3(pos1.x-pos2.x, pos1.y-pos2.y, pos1.z-pos2.z);
-real3 v1 = make_real3(pos3.x-pos2.x, pos3.y-pos2.y, pos3.z-pos2.z);
-real3 v2 = make_real3(pos3.x-pos4.x, pos3.y-pos4.y, pos3.z-pos4.z);
-#if APPLY_PERIODIC
-APPLY_PERIODIC_TO_DELTA(v0)
-APPLY_PERIODIC_TO_DELTA(v1)
-APPLY_PERIODIC_TO_DELTA(v2)
-#endif
-real3 cp0 = cross(v0, v1);
-real3 cp1 = cross(v1, v2);
-real cosangle = dot(normalize(cp0), normalize(cp1));
-real theta;
-if (cosangle > 0.99f || cosangle < -0.99f) {
-    // We're close to the singularity in acos(), so take the cross product and use asin() instead.
-
-    real3 cross_prod = cross(cp0, cp1);
-    real scale = dot(cp0, cp0)*dot(cp1, cp1);
-    theta = ASIN(SQRT(dot(cross_prod, cross_prod)/scale));
-    if (cosangle < 0)
-        theta = PI-theta;
-}
-else
-   theta = ACOS(cosangle);
-theta = (dot(v0, cp1) >= 0 ? theta : -theta);
-COMPUTE_FORCE
-real normCross1 = dot(cp0, cp0);
-real normSqrBC = dot(v1, v1);
-real normBC = SQRT(normSqrBC);
-real normCross2 = dot(cp1, cp1);
-real dp = RECIP(normSqrBC);
-real4 ff = make_real4((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2);
-real3 force1 = ff.x*cp0;
-real3 force4 = ff.w*cp1;
-real3 s = ff.y*force1 - ff.z*force4;
-real3 force2 = s-force1;
-real3 force3 = -s-force4;
--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -255,6 +255,13 @@ public:
    OpenCLArray& getForceBuffers() {
        return forceBuffers;
    }
+    /**
+     * Get the array which contains a contribution to each force represented as a real4.
+     * This is a synonym for getForce().  It exists to satisfy the ComputeContext interface.
+     */
+    ArrayInterface& getFloatForceBuffer() {
+        return force;
+    }
    /**
     * Get the array which contains a contribution to each force represented as 64 bit fixed point.
     */

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -31,14 +31,10 @@
 #include "OpenCLArray.h"
 #include "OpenCLContext.h"
 #include "OpenCLFFT3D.h"
-#include "OpenCLParameterSet.h"
 #include "OpenCLSort.h"
 #include "openmm/kernels.h"
-#include "openmm/internal/CompiledExpressionSet.h"
-#include "openmm/internal/CustomIntegratorUtilities.h"
-#include "lepton/CompiledExpression.h"
-#include "lepton/ExpressionProgram.h"
 #include "openmm/System.h"
+#include "openmm/common/CommonKernels.h"

 namespace OpenMM {

@@ -180,63 +176,6 @@ private:
    OpenCLContext& cl;
 };

-/**
- * This kernel modifies the positions of particles to enforce distance constraints.
- */
-class OpenCLApplyConstraintsKernel : public ApplyConstraintsKernel {
-public:
-    OpenCLApplyConstraintsKernel(std::string name, const Platform& platform, OpenCLContext& cl) : ApplyConstraintsKernel(name, platform),
-            cl(cl), hasInitializedKernel(false) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     */
-    void initialize(const System& system);
-    /**
-     * Update particle positions to enforce constraints.
-     *
-     * @param context    the context in which to execute this kernel
-     * @param tol        the distance tolerance within which constraints must be satisfied.
-     */
-    void apply(ContextImpl& context, double tol);
-    /**
-     * Update particle velocities to enforce constraints.
-     *
-     * @param context    the context in which to execute this kernel
-     * @param tol        the velocity tolerance within which constraints must be satisfied.
-     */
-    void applyToVelocities(ContextImpl& context, double tol);
-private:
-    OpenCLContext& cl;
-    bool hasInitializedKernel;
-    cl::Kernel applyDeltasKernel;
-};
-
-/**
- * This kernel recomputes the positions of virtual sites.
- */
-class OpenCLVirtualSitesKernel : public VirtualSitesKernel {
-public:
-    OpenCLVirtualSitesKernel(std::string name, const Platform& platform, OpenCLContext& cl) : VirtualSitesKernel(name, platform), cl(cl) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     */
-    void initialize(const System& system);
-    /**
-     * Compute the virtual site locations.
-     *
-     * @param context    the context in which to execute this kernel
-     */
-    void computePositions(ContextImpl& context);
-private:
-    OpenCLContext& cl;
-};
-
 /**
 * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
 */
@@ -376,103 +315,13 @@ private:
 /**
 * This kernel is invoked by CustomCVForce to calculate the forces acting on the system and the energy of the system.
 */
-class OpenCLCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
+class OpenCLCalcCustomCVForceKernel : public CommonCalcCustomCVForceKernel {
 public:
-    OpenCLCalcCustomCVForceKernel(std::string name, const Platform& platform, OpenCLContext& cl) : CalcCustomCVForceKernel(name, platform),
-            cl(cl), hasInitializedKernels(false) {
+    OpenCLCalcCustomCVForceKernel(std::string name, const Platform& platform, ComputeContext& cc) : CommonCalcCustomCVForceKernel(name, platform, cc) {
    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomCVForce this kernel will be used for
-     * @param innerContext   the context created by the CustomCVForce for computing collective variables
-     */
-    void initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param innerContext   the context created by the CustomCVForce for computing collective variables
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy);
-    /**
-     * Copy state information to the inner context.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param innerContext   the context created by the CustomCVForce for computing collective variables
-     */
-    void copyState(ContextImpl& context, ContextImpl& innerContext);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomCVForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomCVForce& force);
-private:
-    class ForceInfo;
-    class ReorderListener;
-    OpenCLContext& cl;
-    bool hasInitializedKernels;
-    Lepton::ExpressionProgram energyExpression;
-    std::vector<std::string> variableNames, paramDerivNames, globalParameterNames;
-    std::vector<Lepton::ExpressionProgram> variableDerivExpressions;
-    std::vector<Lepton::ExpressionProgram> paramDerivExpressions;
-    std::vector<OpenCLArray> cvForces;
-    OpenCLArray invAtomOrder;
-    OpenCLArray innerInvAtomOrder;
-    cl::Kernel copyStateKernel, copyForcesKernel, addForcesKernel;
-};
-
-/**
- * This kernel is invoked by MonteCarloBarostat to adjust the periodic box volume
- */
-class OpenCLApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
-public:
-    OpenCLApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, OpenCLContext& cl) : ApplyMonteCarloBarostatKernel(name, platform), cl(cl),
-            hasInitializedKernels(false) {
+    ComputeContext& getInnerComputeContext(ContextImpl& innerContext) {
+        return *reinterpret_cast<OpenCLPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param barostat   the MonteCarloBarostat this kernel will be used for
-     */
-    void initialize(const System& system, const Force& barostat);
-    /**
-     * Attempt a Monte Carlo step, scaling particle positions (or cluster centers) by a specified value.
-     * This version scales the x, y, and z positions independently.
-     * This is called BEFORE the periodic box size is modified.  It should begin by translating each particle
-     * or cluster into the first periodic box, so that coordinates will still be correct after the box size
-     * is changed.
-     *
-     * @param context    the context in which to execute this kernel
-     * @param scaleX     the scale factor by which to multiply particle x-coordinate
-     * @param scaleY     the scale factor by which to multiply particle y-coordinate
-     * @param scaleZ     the scale factor by which to multiply particle z-coordinate
-     */
-    void scaleCoordinates(ContextImpl& context, double scaleX, double scaleY, double scaleZ);
-    /**
-     * Reject the most recent Monte Carlo step, restoring the particle positions to where they were before
-     * scaleCoordinates() was last called.
-     *
-     * @param context    the context in which to execute this kernel
-     */
-    void restoreCoordinates(ContextImpl& context);
-private:
-    OpenCLContext& cl;
-    bool hasInitializedKernels;
-    int numMolecules;
-    OpenCLArray savedPositions;
-    OpenCLArray savedForces;
-    OpenCLArray moleculeAtoms;
-    OpenCLArray moleculeStartIndex;
-    cl::Kernel kernel;
-    std::vector<int> lastAtomOrder;
 };

 } // namespace OpenMM