Continuing to implement new CUDA platform: CustomNonbondedForce, CustomHbondForce, CustomIntegrator

bd22eada · Peter Eastman · 8eb6850d · bd22eada · bd22eada · bd22eada
Commit bd22eada authored Jun 20, 2012 by Peter Eastman
16 changed files
--- a/platforms/cuda2/src/CudaArray.cpp
+++ b/platforms/cuda2/src/CudaArray.cpp
@@ -53,7 +53,7 @@ CudaArray::~CudaArray() {
    }
 }

-void CudaArray::upload(void* data, bool blocking) {
+void CudaArray::upload(const void* data, bool blocking) {
    CUresult result;
    if (blocking)
        result = cuMemcpyHtoD(pointer, data, size*elementSize);

--- a/platforms/cuda2/src/CudaArray.h
+++ b/platforms/cuda2/src/CudaArray.h
@@ -94,7 +94,7 @@ public:
     * Copy the values in a vector to the device memory.
     */
    template <class T>
-    void upload(std::vector<T>& data) {
+    void upload(const std::vector<T>& data) {
        if (sizeof(T) != elementSize || data.size() != size)
            throw OpenMMException("Error uploading array "+name+": The specified vector does not match the size of the array");
        upload(&data[0], true);
@@ -117,7 +117,7 @@ public:
     * @param blocking if true, this call will block until the transfer is complete.  If false,
     *                 the source array  must be in page-locked memory.
     */
-    void upload(void* data, bool blocking = true);
+    void upload(const void* data, bool blocking = true);
    /**
     * Copy the values in the device memory to an array.
     * 

--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -945,6 +945,10 @@ void CudaContext::reorderAtoms(bool enforcePeriodic) {
        reorderListeners[i]->execute();
 }

+void CudaContext::addReorderListener(ReorderListener* listener) {
+    reorderListeners.push_back(listener);
+}
+
 struct CudaContext::WorkThread::ThreadData {
    ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting,  bool& finished,
            pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :

--- a/platforms/cuda2/src/CudaKernelFactory.cpp
+++ b/platforms/cuda2/src/CudaKernelFactory.cpp
@@ -94,16 +94,16 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcNonbondedForceKernel::Name())
        return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
-//    if (name == CalcCustomNonbondedForceKernel::Name())
-//        return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomNonbondedForceKernel::Name())
+        return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcGBSAOBCForceKernel::Name())
 //        return new CudaCalcGBSAOBCForceKernel(name, platform, cu);
 //    if (name == CalcCustomGBForceKernel::Name())
 //        return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomExternalForceKernel::Name())
        return new CudaCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
-//    if (name == CalcCustomHbondForceKernel::Name())
-//        return new CudaCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomHbondForceKernel::Name())
+        return new CudaCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomCompoundBondForceKernel::Name())
        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
    if (name == IntegrateVerletStepKernel::Name())
@@ -116,8 +116,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaIntegrateVariableVerletStepKernel(name, platform, cu);
    if (name == IntegrateVariableLangevinStepKernel::Name())
        return new CudaIntegrateVariableLangevinStepKernel(name, platform, cu);
-//    if (name == IntegrateCustomStepKernel::Name())
-//        return new CudaIntegrateCustomStepKernel(name, platform, cu);
+    if (name == IntegrateCustomStepKernel::Name())
+        return new CudaIntegrateCustomStepKernel(name, platform, cu);
    if (name == ApplyAndersenThermostatKernel::Name())
        return new CudaApplyAndersenThermostatKernel(name, platform, cu);
    if (name == ApplyMonteCarloBarostatKernel::Name())

--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
@@ -1669,277 +1669,277 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
    cu.invalidateMolecules();
 }

-//class CudaCustomNonbondedForceInfo : public CudaForceInfo {
+class CudaCustomNonbondedForceInfo : public CudaForceInfo {
+public:
+    CudaCustomNonbondedForceInfo(const CustomNonbondedForce& force) : force(force) {
+    }
+    bool areParticlesIdentical(int particle1, int particle2) {
+        vector<double> params1;
+        vector<double> params2;
+        force.getParticleParameters(particle1, params1);
+        force.getParticleParameters(particle2, params2);
+        for (int i = 0; i < (int) params1.size(); i++)
+            if (params1[i] != params2[i])
+                return false;
+        return true;
+    }
+    int getNumParticleGroups() {
+        return force.getNumExclusions();
+    }
+    void getParticlesInGroup(int index, vector<int>& particles) {
+        int particle1, particle2;
+        force.getExclusionParticles(index, particle1, particle2);
+        particles.resize(2);
+        particles[0] = particle1;
+        particles[1] = particle2;
+    }
+    bool areGroupsIdentical(int group1, int group2) {
+        return true;
+    }
+private:
+    const CustomNonbondedForce& force;
+};
+
+CudaCalcCustomNonbondedForceKernel::~CudaCalcCustomNonbondedForceKernel() {
+    cu.setAsCurrent();
+    if (params != NULL)
+        delete params;
+    if (globals != NULL)
+        delete globals;
+    if (tabulatedFunctionParams != NULL)
+        delete tabulatedFunctionParams;
+    for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
+        delete tabulatedFunctions[i];
+}
+
+void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
+    cu.setAsCurrent();
+    int forceIndex;
+    for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
+        ;
+    string prefix = "custom"+cu.intToString(forceIndex)+"_";
+
+    // Record parameters and exclusions.
+
+    int numParticles = force.getNumParticles();
+    params = new CudaParameterSet(cu, force.getNumPerParticleParameters(), numParticles, "customNonbondedParameters");
+    if (force.getNumGlobalParameters() > 0)
+        globals = CudaArray::create<float>(cu, force.getNumGlobalParameters(), "customNonbondedGlobals");
+    vector<vector<float> > paramVector(numParticles);
+    vector<vector<int> > exclusionList(numParticles);
+    for (int i = 0; i < numParticles; i++) {
+        vector<double> parameters;
+        force.getParticleParameters(i, parameters);
+        paramVector[i].resize(parameters.size());
+        for (int j = 0; j < (int) parameters.size(); j++)
+            paramVector[i][j] = (float) parameters[j];
+        exclusionList[i].push_back(i);
+    }
+    for (int i = 0; i < force.getNumExclusions(); i++) {
+        int particle1, particle2;
+        force.getExclusionParticles(i, particle1, particle2);
+        exclusionList[particle1].push_back(particle2);
+        exclusionList[particle2].push_back(particle1);
+    }
+    params->setParameterValues(paramVector);
+
+    // Record the tabulated functions.
+
+    CudaExpressionUtilities::FunctionPlaceholder fp;
+    map<string, Lepton::CustomFunction*> functions;
+    vector<pair<string, string> > functionDefinitions;
+    vector<float4> tabulatedFunctionParamsVec(force.getNumFunctions());
+    for (int i = 0; i < force.getNumFunctions(); i++) {
+        string name;
+        vector<double> values;
+        double min, max;
+        force.getFunctionParameters(i, name, values, min, max);
+        string arrayName = prefix+"table"+cu.intToString(i);
+        functionDefinitions.push_back(make_pair(name, arrayName));
+        functions[name] = &fp;
+        tabulatedFunctionParamsVec[i] = make_float4((float) min, (float) max, (float) ((values.size()-1)/(max-min)), (float) values.size()-2);
+        vector<float4> f = cu.getExpressionUtilities().computeFunctionCoefficients(values, min, max);
+        tabulatedFunctions.push_back(CudaArray::create<float4>(cu, values.size()-1, "TabulatedFunction"));
+        tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
+        cu.getNonbondedUtilities().addArgument(CudaNonbondedUtilities::ParameterInfo(arrayName, "float", 4, sizeof(float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDevicePointer()));
+    }
+    if (force.getNumFunctions() > 0) {
+        tabulatedFunctionParams = CudaArray::create<float4>(cu, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters");
+        tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
+        cu.getNonbondedUtilities().addArgument(CudaNonbondedUtilities::ParameterInfo(prefix+"functionParams", "float", 4, sizeof(float4), tabulatedFunctionParams->getDevicePointer()));
+    }
+
+    // Record information for the expressions.
+
+    globalParamNames.resize(force.getNumGlobalParameters());
+    globalParamValues.resize(force.getNumGlobalParameters());
+    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
+        globalParamNames[i] = force.getGlobalParameterName(i);
+        globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
+    }
+    if (globals != NULL)
+        globals->upload(globalParamValues);
+    bool useCutoff = (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff);
+    bool usePeriodic = (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff && force.getNonbondedMethod() != CustomNonbondedForce::CutoffNonPeriodic);
+    Lepton::ParsedExpression energyExpression = Lepton::Parser::parse(force.getEnergyFunction(), functions).optimize();
+    Lepton::ParsedExpression forceExpression = energyExpression.differentiate("r").optimize();
+    map<string, Lepton::ParsedExpression> forceExpressions;
+    forceExpressions["tempEnergy += "] = energyExpression;
+    forceExpressions["tempForce -= "] = forceExpression;
+
+    // Create the kernels.
+
+    vector<pair<ExpressionTreeNode, string> > variables;
+    ExpressionTreeNode rnode(new Operation::Variable("r"));
+    variables.push_back(make_pair(rnode, "r"));
+    variables.push_back(make_pair(ExpressionTreeNode(new Operation::Square(), rnode), "r2"));
+    variables.push_back(make_pair(ExpressionTreeNode(new Operation::Reciprocal(), rnode), "invR"));
+    for (int i = 0; i < force.getNumPerParticleParameters(); i++) {
+        const string& name = force.getPerParticleParameterName(i);
+        variables.push_back(makeVariable(name+"1", prefix+"params"+params->getParameterSuffix(i, "1")));
+        variables.push_back(makeVariable(name+"2", prefix+"params"+params->getParameterSuffix(i, "2")));
+    }
+    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
+        const string& name = force.getGlobalParameterName(i);
+        string value = "globals["+cu.intToString(i)+"]";
+        variables.push_back(makeVariable(name, prefix+value));
+    }
+    stringstream compute;
+    compute << cu.getExpressionUtilities().createExpressions(forceExpressions, variables, functionDefinitions, prefix+"temp", prefix+"functionParams");
+    map<string, string> replacements;
+    replacements["COMPUTE_FORCE"] = compute.str();
+    string source = cu.replaceStrings(CudaKernelSources::customNonbonded, replacements);
+    cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
+    for (int i = 0; i < (int) params->getBuffers().size(); i++) {
+        CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+        cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"params"+cu.intToString(i+1), buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
+    }
+    if (globals != NULL) {
+        globals->upload(globalParamValues);
+        cu.getNonbondedUtilities().addArgument(CudaNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(float), globals->getDevicePointer()));
+    }
+    cu.addForce(new CudaCustomNonbondedForceInfo(force));
+}
+
+double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    if (globals != NULL) {
+        bool changed = false;
+        for (int i = 0; i < (int) globalParamNames.size(); i++) {
+            float value = (float) context.getParameter(globalParamNames[i]);
+            if (value != globalParamValues[i])
+                changed = true;
+            globalParamValues[i] = value;
+        }
+        if (changed)
+            globals->upload(globalParamValues);
+    }
+    return 0.0;
+}
+
+void CudaCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force) {
+    cu.setAsCurrent();
+    int numParticles = force.getNumParticles();
+    if (numParticles != cu.getNumAtoms())
+        throw OpenMMException("updateParametersInContext: The number of particles has changed");
+    
+    // Record the per-particle parameters.
+    
+    vector<vector<float> > paramVector(numParticles);
+    vector<double> parameters;
+    for (int i = 0; i < numParticles; i++) {
+        force.getParticleParameters(i, parameters);
+        paramVector[i].resize(parameters.size());
+        for (int j = 0; j < (int) parameters.size(); j++)
+            paramVector[i][j] = (float) parameters[j];
+    }
+    params->setParameterValues(paramVector);
+    
+    // Mark that the current reordering may be invalid.
+    
+    cu.invalidateMolecules();
+}
+
+//class CudaGBSAOBCForceInfo : public CudaForceInfo {
 //public:
-//    CudaCustomNonbondedForceInfo(int requiredBuffers, const CustomNonbondedForce& force) : CudaForceInfo(requiredBuffers), force(force) {
+//    CudaGBSAOBCForceInfo(int requiredBuffers, const GBSAOBCForce& force) : CudaForceInfo(requiredBuffers), force(force) {
 //    }
 //    bool areParticlesIdentical(int particle1, int particle2) {
-//        vector<double> params1;
-//        vector<double> params2;
-//        force.getParticleParameters(particle1, params1);
-//        force.getParticleParameters(particle2, params2);
-//        for (int i = 0; i < (int) params1.size(); i++)
-//            if (params1[i] != params2[i])
-//                return false;
-//        return true;
-//    }
-//    int getNumParticleGroups() {
-//        return force.getNumExclusions();
-//    }
-//    void getParticlesInGroup(int index, vector<int>& particles) {
-//        int particle1, particle2;
-//        force.getExclusionParticles(index, particle1, particle2);
-//        particles.resize(2);
-//        particles[0] = particle1;
-//        particles[1] = particle2;
-//    }
-//    bool areGroupsIdentical(int group1, int group2) {
-//        return true;
+//        double charge1, charge2, radius1, radius2, scale1, scale2;
+//        force.getParticleParameters(particle1, charge1, radius1, scale1);
+//        force.getParticleParameters(particle2, charge2, radius2, scale2);
+//        return (charge1 == charge2 && radius1 == radius2 && scale1 == scale2);
 //    }
 //private:
-//    const CustomNonbondedForce& force;
+//    const GBSAOBCForce& force;
 //};
 //
-//CudaCalcCustomNonbondedForceKernel::~CudaCalcCustomNonbondedForceKernel() {
+//CudaCalcGBSAOBCForceKernel::~CudaCalcGBSAOBCForceKernel() {
 //    cu.setAsCurrent();
 //    if (params != NULL)
 //        delete params;
-//    if (globals != NULL)
-//        delete globals;
-//    if (tabulatedFunctionParams != NULL)
-//        delete tabulatedFunctionParams;
-//    for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
-//        delete tabulatedFunctions[i];
+//    if (bornSum != NULL)
+//        delete bornSum;
+//    if (longBornSum != NULL)
+//        delete longBornSum;
+//    if (bornRadii != NULL)
+//        delete bornRadii;
+//    if (bornForce != NULL)
+//        delete bornForce;
+//    if (longBornForce != NULL)
+//        delete longBornForce;
+//    if (obcChain != NULL)
+//        delete obcChain;
 //}
 //
-//void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
+//void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
 //    cu.setAsCurrent();
-//    int forceIndex;
-//    for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
-//        ;
-//    string prefix = "custom"+cu.intToString(forceIndex)+"_";
-//
-//    // Record parameters and exclusions.
-//
+//    if (cu.getPlatformData().contexts.size() > 1)
+//        throw OpenMMException("GBSAOBCForce does not support using multiple CUDA devices");
+//    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
+//    params = new CudaArray<mm_float2>(cu, cu.getPaddedNumAtoms(), "gbsaObcParams");
+//    bornRadii = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms(), "bornRadii");
+//    obcChain = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms(), "obcChain");
+//    if (cu.getSupports64BitGlobalAtomics()) {
+//        longBornSum = new CudaArray<cl_long>(cu, cu.getPaddedNumAtoms(), "longBornSum");
+//        longBornForce = new CudaArray<cl_long>(cu, cu.getPaddedNumAtoms(), "longBornForce");
+//        bornForce = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms(), "bornForce");
+//        cu.addAutoclearBuffer(longBornSum->getDevicePointer(), 2*longBornSum->getSize());
+//        cu.addAutoclearBuffer(longBornForce->getDevicePointer(), 2*longBornForce->getSize());
+//    }
+//    else {
+//        bornSum = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornSum");
+//        bornForce = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornForce");
+//        cu.addAutoclearBuffer(bornSum->getDevicePointer(), bornSum->getSize());
+//        cu.addAutoclearBuffer(bornForce->getDevicePointer(), bornForce->getSize());
+//    }
+//    CudaArray<mm_float4>& posq = cu.getPosq();
 //    int numParticles = force.getNumParticles();
-//    params = new CudaParameterSet(cu, force.getNumPerParticleParameters(), numParticles, "customNonbondedParameters");
-//    if (force.getNumGlobalParameters() > 0)
-//        globals = new CudaArray<cl_float>(cu, force.getNumGlobalParameters(), "customNonbondedGlobals", false, CL_MEM_READ_ONLY);
-//    vector<vector<cl_float> > paramVector(numParticles);
-//    vector<vector<int> > exclusionList(numParticles);
+//    vector<mm_float2> paramsVector(numParticles);
+//    const double dielectricOffset = 0.009;
 //    for (int i = 0; i < numParticles; i++) {
-//        vector<double> parameters;
-//        force.getParticleParameters(i, parameters);
-//        paramVector[i].resize(parameters.size());
-//        for (int j = 0; j < (int) parameters.size(); j++)
-//            paramVector[i][j] = (cl_float) parameters[j];
-//        exclusionList[i].push_back(i);
-//    }
-//    for (int i = 0; i < force.getNumExclusions(); i++) {
-//        int particle1, particle2;
-//        force.getExclusionParticles(i, particle1, particle2);
-//        exclusionList[particle1].push_back(particle2);
-//        exclusionList[particle2].push_back(particle1);
+//        double charge, radius, scalingFactor;
+//        force.getParticleParameters(i, charge, radius, scalingFactor);
+//        radius -= dielectricOffset;
+//        paramsVector[i] = mm_float2((float) radius, (float) (scalingFactor*radius));
+//        posq[i].w = (float) charge;
 //    }
-//    params->setParameterValues(paramVector);
+//    posq.upload();
+//    params->upload(paramsVector);
+//    prefactor = -ONE_4PI_EPS0*((1.0/force.getSoluteDielectric())-(1.0/force.getSolventDielectric()));
+//    bool useCutoff = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff);
+//    bool usePeriodic = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff && force.getNonbondedMethod() != GBSAOBCForce::CutoffNonPeriodic);
+//    string source = CudaKernelSources::gbsaObc2;
+//    nb.addInteraction(useCutoff, usePeriodic, false, force.getCutoffDistance(), vector<vector<int> >(), source, force.getForceGroup());
+//    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(cl_float2), params->getDevicePointer()));;
+//    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("bornForce", "float", 1, sizeof(cl_float), bornForce->getDevicePointer()));;
+//    cu.addForce(new CudaGBSAOBCForceInfo(nb.getNumForceBuffers(), force));
+//}
 //
-//    // Record the tabulated functions.
-//
-//    CudaExpressionUtilities::FunctionPlaceholder fp;
-//    map<string, Lepton::CustomFunction*> functions;
-//    vector<pair<string, string> > functionDefinitions;
-//    vector<mm_float4> tabulatedFunctionParamsVec(force.getNumFunctions());
-//    for (int i = 0; i < force.getNumFunctions(); i++) {
-//        string name;
-//        vector<double> values;
-//        double min, max;
-//        force.getFunctionParameters(i, name, values, min, max);
-//        string arrayName = prefix+"table"+cu.intToString(i);
-//        functionDefinitions.push_back(make_pair(name, arrayName));
-//        functions[name] = &fp;
-//        tabulatedFunctionParamsVec[i] = mm_float4((float) min, (float) max, (float) ((values.size()-1)/(max-min)), (float) values.size()-2);
-//        vector<mm_float4> f = cu.getExpressionUtilities().computeFunctionCoefficients(values, min, max);
-//        tabulatedFunctions.push_back(new CudaArray<mm_float4>(cu, values.size()-1, "TabulatedFunction"));
-//        tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
-//        cu.getNonbondedUtilities().addArgument(CudaNonbondedUtilities::ParameterInfo(arrayName, "float", 4, sizeof(cl_float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDevicePointer()));
-//    }
-//    if (force.getNumFunctions() > 0) {
-//        tabulatedFunctionParams = new CudaArray<mm_float4>(cu, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY);
-//        tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
-//        cu.getNonbondedUtilities().addArgument(CudaNonbondedUtilities::ParameterInfo(prefix+"functionParams", "float", 4, sizeof(cl_float4), tabulatedFunctionParams->getDevicePointer()));
-//    }
-//
-//    // Record information for the expressions.
-//
-//    globalParamNames.resize(force.getNumGlobalParameters());
-//    globalParamValues.resize(force.getNumGlobalParameters());
-//    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
-//        globalParamNames[i] = force.getGlobalParameterName(i);
-//        globalParamValues[i] = (cl_float) force.getGlobalParameterDefaultValue(i);
-//    }
-//    if (globals != NULL)
-//        globals->upload(globalParamValues);
-//    bool useCutoff = (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff);
-//    bool usePeriodic = (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff && force.getNonbondedMethod() != CustomNonbondedForce::CutoffNonPeriodic);
-//    Lepton::ParsedExpression energyExpression = Lepton::Parser::parse(force.getEnergyFunction(), functions).optimize();
-//    Lepton::ParsedExpression forceExpression = energyExpression.differentiate("r").optimize();
-//    map<string, Lepton::ParsedExpression> forceExpressions;
-//    forceExpressions["tempEnergy += "] = energyExpression;
-//    forceExpressions["tempForce -= "] = forceExpression;
-//
-//    // Create the kernels.
-//
-//    vector<pair<ExpressionTreeNode, string> > variables;
-//    ExpressionTreeNode rnode(new Operation::Variable("r"));
-//    variables.push_back(make_pair(rnode, "r"));
-//    variables.push_back(make_pair(ExpressionTreeNode(new Operation::Square(), rnode), "r2"));
-//    variables.push_back(make_pair(ExpressionTreeNode(new Operation::Reciprocal(), rnode), "invR"));
-//    for (int i = 0; i < force.getNumPerParticleParameters(); i++) {
-//        const string& name = force.getPerParticleParameterName(i);
-//        variables.push_back(makeVariable(name+"1", prefix+"params"+params->getParameterSuffix(i, "1")));
-//        variables.push_back(makeVariable(name+"2", prefix+"params"+params->getParameterSuffix(i, "2")));
-//    }
-//    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
-//        const string& name = force.getGlobalParameterName(i);
-//        string value = "globals["+cu.intToString(i)+"]";
-//        variables.push_back(makeVariable(name, prefix+value));
-//    }
-//    stringstream compute;
-//    compute << cu.getExpressionUtilities().createExpressions(forceExpressions, variables, functionDefinitions, prefix+"temp", prefix+"functionParams");
-//    map<string, string> replacements;
-//    replacements["COMPUTE_FORCE"] = compute.str();
-//    string source = cu.replaceStrings(CudaKernelSources::customNonbonded, replacements);
-//    cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
-//    for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-//        const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
-//        cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"params"+cu.intToString(i+1), buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
-//    }
-//    if (globals != NULL) {
-//        globals->upload(globalParamValues);
-//        cu.getNonbondedUtilities().addArgument(CudaNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(cl_float), globals->getDevicePointer()));
-//    }
-//    cu.addForce(new CudaCustomNonbondedForceInfo(cu.getNonbondedUtilities().getNumForceBuffers(), force));
-//}
-//
-//double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-//    if (globals != NULL) {
-//        bool changed = false;
-//        for (int i = 0; i < (int) globalParamNames.size(); i++) {
-//            cl_float value = (cl_float) context.getParameter(globalParamNames[i]);
-//            if (value != globalParamValues[i])
-//                changed = true;
-//            globalParamValues[i] = value;
-//        }
-//        if (changed)
-//            globals->upload(globalParamValues);
-//    }
-//    return 0.0;
-//}
-//
-//void CudaCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force) {
-//    cu.setAsCurrent();
-//    int numParticles = force.getNumParticles();
-//    if (numParticles != cu.getNumAtoms())
-//        throw OpenMMException("updateParametersInContext: The number of particles has changed");
-//    
-//    // Record the per-particle parameters.
-//    
-//    vector<vector<cl_float> > paramVector(numParticles);
-//    vector<double> parameters;
-//    for (int i = 0; i < numParticles; i++) {
-//        force.getParticleParameters(i, parameters);
-//        paramVector[i].resize(parameters.size());
-//        for (int j = 0; j < (int) parameters.size(); j++)
-//            paramVector[i][j] = (cl_float) parameters[j];
-//    }
-//    params->setParameterValues(paramVector);
-//    
-//    // Mark that the current reordering may be invalid.
-//    
-//    cu.invalidateMolecules();
-//}
-//
-//class CudaGBSAOBCForceInfo : public CudaForceInfo {
-//public:
-//    CudaGBSAOBCForceInfo(int requiredBuffers, const GBSAOBCForce& force) : CudaForceInfo(requiredBuffers), force(force) {
-//    }
-//    bool areParticlesIdentical(int particle1, int particle2) {
-//        double charge1, charge2, radius1, radius2, scale1, scale2;
-//        force.getParticleParameters(particle1, charge1, radius1, scale1);
-//        force.getParticleParameters(particle2, charge2, radius2, scale2);
-//        return (charge1 == charge2 && radius1 == radius2 && scale1 == scale2);
-//    }
-//private:
-//    const GBSAOBCForce& force;
-//};
-//
-//CudaCalcGBSAOBCForceKernel::~CudaCalcGBSAOBCForceKernel() {
-//    cu.setAsCurrent();
-//    if (params != NULL)
-//        delete params;
-//    if (bornSum != NULL)
-//        delete bornSum;
-//    if (longBornSum != NULL)
-//        delete longBornSum;
-//    if (bornRadii != NULL)
-//        delete bornRadii;
-//    if (bornForce != NULL)
-//        delete bornForce;
-//    if (longBornForce != NULL)
-//        delete longBornForce;
-//    if (obcChain != NULL)
-//        delete obcChain;
-//}
-//
-//void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
-//    cu.setAsCurrent();
-//    if (cu.getPlatformData().contexts.size() > 1)
-//        throw OpenMMException("GBSAOBCForce does not support using multiple CUDA devices");
-//    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
-//    params = new CudaArray<mm_float2>(cu, cu.getPaddedNumAtoms(), "gbsaObcParams");
-//    bornRadii = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms(), "bornRadii");
-//    obcChain = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms(), "obcChain");
-//    if (cu.getSupports64BitGlobalAtomics()) {
-//        longBornSum = new CudaArray<cl_long>(cu, cu.getPaddedNumAtoms(), "longBornSum");
-//        longBornForce = new CudaArray<cl_long>(cu, cu.getPaddedNumAtoms(), "longBornForce");
-//        bornForce = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms(), "bornForce");
-//        cu.addAutoclearBuffer(longBornSum->getDevicePointer(), 2*longBornSum->getSize());
-//        cu.addAutoclearBuffer(longBornForce->getDevicePointer(), 2*longBornForce->getSize());
-//    }
-//    else {
-//        bornSum = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornSum");
-//        bornForce = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornForce");
-//        cu.addAutoclearBuffer(bornSum->getDevicePointer(), bornSum->getSize());
-//        cu.addAutoclearBuffer(bornForce->getDevicePointer(), bornForce->getSize());
-//    }
-//    CudaArray<mm_float4>& posq = cu.getPosq();
-//    int numParticles = force.getNumParticles();
-//    vector<mm_float2> paramsVector(numParticles);
-//    const double dielectricOffset = 0.009;
-//    for (int i = 0; i < numParticles; i++) {
-//        double charge, radius, scalingFactor;
-//        force.getParticleParameters(i, charge, radius, scalingFactor);
-//        radius -= dielectricOffset;
-//        paramsVector[i] = mm_float2((float) radius, (float) (scalingFactor*radius));
-//        posq[i].w = (float) charge;
-//    }
-//    posq.upload();
-//    params->upload(paramsVector);
-//    prefactor = -ONE_4PI_EPS0*((1.0/force.getSoluteDielectric())-(1.0/force.getSolventDielectric()));
-//    bool useCutoff = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff);
-//    bool usePeriodic = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff && force.getNonbondedMethod() != GBSAOBCForce::CutoffNonPeriodic);
-//    string source = CudaKernelSources::gbsaObc2;
-//    nb.addInteraction(useCutoff, usePeriodic, false, force.getCutoffDistance(), vector<vector<int> >(), source, force.getForceGroup());
-//    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(cl_float2), params->getDevicePointer()));;
-//    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("bornForce", "float", 1, sizeof(cl_float), bornForce->getDevicePointer()));;
-//    cu.addForce(new CudaGBSAOBCForceInfo(nb.getNumForceBuffers(), force));
-//}
-//
-//double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-//    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
-//    bool deviceIsCpu = (cu.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
-//    if (!hasCreatedKernels) {
-//        // These Kernels cannot be created in initialize(), because the CudaNonbondedUtilities has not been initialized yet then.
+//double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+//    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
+//    bool deviceIsCpu = (cu.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
+//    if (!hasCreatedKernels) {
+//        // These Kernels cannot be created in initialize(), because the CudaNonbondedUtilities has not been initialized yet then.
 //
 //        hasCreatedKernels = true;
 //        maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0);
@@ -3172,557 +3172,524 @@ void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& con
    cu.invalidateMolecules();
 }

-//class CudaCustomHbondForceInfo : public CudaForceInfo {
-//public:
-//    CudaCustomHbondForceInfo(int requiredBuffers, const CustomHbondForce& force) : CudaForceInfo(requiredBuffers), force(force) {
-//    }
-//    bool areParticlesIdentical(int particle1, int particle2) {
-//        return true;
-//    }
-//    int getNumParticleGroups() {
-//        return force.getNumDonors()+force.getNumAcceptors()+force.getNumExclusions();
-//    }
-//    void getParticlesInGroup(int index, vector<int>& particles) {
-//        int p1, p2, p3;
-//        vector<double> parameters;
-//        if (index < force.getNumDonors()) {
-//            force.getDonorParameters(index, p1, p2, p3, parameters);
-//            particles.clear();
-//            particles.push_back(p1);
-//            if (p2 > -1)
-//                particles.push_back(p2);
-//            if (p3 > -1)
-//                particles.push_back(p3);
-//            return;
-//        }
-//        index -= force.getNumDonors();
-//        if (index < force.getNumAcceptors()) {
-//            force.getAcceptorParameters(index, p1, p2, p3, parameters);
-//            particles.clear();
-//            particles.push_back(p1);
-//            if (p2 > -1)
-//                particles.push_back(p2);
-//            if (p3 > -1)
-//                particles.push_back(p3);
-//            return;
-//        }
-//        index -= force.getNumAcceptors();
-//        int donor, acceptor;
-//        force.getExclusionParticles(index, donor, acceptor);
-//        particles.clear();
-//        force.getDonorParameters(donor, p1, p2, p3, parameters);
-//        particles.push_back(p1);
-//        if (p2 > -1)
-//            particles.push_back(p2);
-//        if (p3 > -1)
-//            particles.push_back(p3);
-//        force.getAcceptorParameters(acceptor, p1, p2, p3, parameters);
-//        particles.push_back(p1);
-//        if (p2 > -1)
-//            particles.push_back(p2);
-//        if (p3 > -1)
-//            particles.push_back(p3);
-//    }
-//    bool areGroupsIdentical(int group1, int group2) {
-//        int p1, p2, p3;
-//        vector<double> params1, params2;
-//        if (group1 < force.getNumDonors() && group2 < force.getNumDonors()) {
-//            force.getDonorParameters(group1, p1, p2, p3, params1);
-//            force.getDonorParameters(group2, p1, p2, p3, params2);
-//            return (params1 == params2 && params1 == params2);
-//        }
-//        if (group1 < force.getNumDonors() || group2 < force.getNumDonors())
-//            return false;
-//        group1 -= force.getNumDonors();
-//        group2 -= force.getNumDonors();
-//        if (group1 < force.getNumAcceptors() && group2 < force.getNumAcceptors()) {
-//            force.getAcceptorParameters(group1, p1, p2, p3, params1);
-//            force.getAcceptorParameters(group2, p1, p2, p3, params2);
-//            return (params1 == params2 && params1 == params2);
-//        }
-//        if (group1 < force.getNumAcceptors() || group2 < force.getNumAcceptors())
-//            return false;
-//        return true;
-//    }
-//private:
-//    const CustomHbondForce& force;
-//};
-//
-//CudaCalcCustomHbondForceKernel::~CudaCalcCustomHbondForceKernel() {
-//    cu.setAsCurrent();
-//    if (donorParams != NULL)
-//        delete donorParams;
-//    if (acceptorParams != NULL)
-//        delete acceptorParams;
-//    if (donors != NULL)
-//        delete donors;
-//    if (acceptors != NULL)
-//        delete acceptors;
-//    if (donorBufferIndices != NULL)
-//        delete donorBufferIndices;
-//    if (acceptorBufferIndices != NULL)
-//        delete acceptorBufferIndices;
-//    if (globals != NULL)
-//        delete globals;
-//    if (donorExclusions != NULL)
-//        delete donorExclusions;
-//    if (acceptorExclusions != NULL)
-//        delete acceptorExclusions;
-//    if (tabulatedFunctionParams != NULL)
-//        delete tabulatedFunctionParams;
-//    for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
-//        delete tabulatedFunctions[i];
-//}
-//
-//static void addDonorAndAcceptorCode(stringstream& computeDonor, stringstream& computeAcceptor, const string& value) {
-//    computeDonor << value;
-//    computeAcceptor << value;
-//}
-//
-//static void applyDonorAndAcceptorForces(stringstream& applyToDonor, stringstream& applyToAcceptor, int atom, const string& value) {
-//    string forceNames[] = {"f1", "f2", "f3"};
-//    if (atom < 3)
-//        applyToAcceptor << forceNames[atom]<<".xyz += "<<value<<";\n";
-//    else
-//        applyToDonor << forceNames[atom-3]<<".xyz += "<<value<<";\n";
-//}
-//
-//void CudaCalcCustomHbondForceKernel::initialize(const System& system, const CustomHbondForce& force) {
-//    // Record the lists of donors and acceptors, and the parameters for each one.
-//
-//    cu.setAsCurrent();
-//    int numContexts = cu.getPlatformData().contexts.size();
-//    int startIndex = cu.getContextIndex()*force.getNumDonors()/numContexts;
-//    int endIndex = (cu.getContextIndex()+1)*force.getNumDonors()/numContexts;
-//    numDonors = endIndex-startIndex;
-//    numAcceptors = force.getNumAcceptors();
-//    if (numDonors == 0 || numAcceptors == 0)
-//        return;
-//    int numParticles = system.getNumParticles();
-//    donors = new CudaArray<mm_int4>(cu, numDonors, "customHbondDonors");
-//    acceptors = new CudaArray<mm_int4>(cu, numAcceptors, "customHbondAcceptors");
-//    donorParams = new CudaParameterSet(cu, force.getNumPerDonorParameters(), numDonors, "customHbondDonorParameters");
-//    acceptorParams = new CudaParameterSet(cu, force.getNumPerAcceptorParameters(), numAcceptors, "customHbondAcceptorParameters");
-//    if (force.getNumGlobalParameters() > 0)
-//        globals = new CudaArray<cl_float>(cu, force.getNumGlobalParameters(), "customHbondGlobals", false, CL_MEM_READ_ONLY);
-//    vector<vector<cl_float> > donorParamVector(numDonors);
-//    vector<mm_int4> donorVector(numDonors);
-//    for (int i = 0; i < numDonors; i++) {
-//        vector<double> parameters;
-//        force.getDonorParameters(startIndex+i, donorVector[i].x, donorVector[i].y, donorVector[i].z, parameters);
-//        donorParamVector[i].resize(parameters.size());
-//        for (int j = 0; j < (int) parameters.size(); j++)
-//            donorParamVector[i][j] = (cl_float) parameters[j];
-//    }
-//    donors->upload(donorVector);
-//    donorParams->setParameterValues(donorParamVector);
-//    vector<vector<cl_float> > acceptorParamVector(numAcceptors);
-//    vector<mm_int4> acceptorVector(numAcceptors);
-//    for (int i = 0; i < numAcceptors; i++) {
-//        vector<double> parameters;
-//        force.getAcceptorParameters(i, acceptorVector[i].x, acceptorVector[i].y, acceptorVector[i].z, parameters);
-//        acceptorParamVector[i].resize(parameters.size());
-//        for (int j = 0; j < (int) parameters.size(); j++)
-//            acceptorParamVector[i][j] = (cl_float) parameters[j];
-//    }
-//    acceptors->upload(acceptorVector);
-//    acceptorParams->setParameterValues(acceptorParamVector);
-//
-//    // Select an output buffer index for each donor and acceptor.
-//
-//    donorBufferIndices = new CudaArray<mm_int4>(cu, numDonors, "customHbondDonorBuffers");
-//    acceptorBufferIndices = new CudaArray<mm_int4>(cu, numAcceptors, "customHbondAcceptorBuffers");
-//    vector<mm_int4> donorBufferVector(numDonors);
-//    vector<mm_int4> acceptorBufferVector(numAcceptors);
-//    vector<int> donorBufferCounter(numParticles, 0);
-//    for (int i = 0; i < numDonors; i++)
-//        donorBufferVector[i] = mm_int4(donorVector[i].x > -1 ? donorBufferCounter[donorVector[i].x]++ : 0,
-//                                       donorVector[i].y > -1 ? donorBufferCounter[donorVector[i].y]++ : 0,
-//                                       donorVector[i].z > -1 ? donorBufferCounter[donorVector[i].z]++ : 0, 0);
-//    vector<int> acceptorBufferCounter(numParticles, 0);
-//    for (int i = 0; i < numAcceptors; i++)
-//        acceptorBufferVector[i] = mm_int4(acceptorVector[i].x > -1 ? acceptorBufferCounter[acceptorVector[i].x]++ : 0,
-//                                       acceptorVector[i].y > -1 ? acceptorBufferCounter[acceptorVector[i].y]++ : 0,
-//                                       acceptorVector[i].z > -1 ? acceptorBufferCounter[acceptorVector[i].z]++ : 0, 0);
-//    donorBufferIndices->upload(donorBufferVector);
-//    acceptorBufferIndices->upload(acceptorBufferVector);
-//    int maxBuffers = 1;
-//    for (int i = 0; i < (int) donorBufferCounter.size(); i++)
-//        maxBuffers = max(maxBuffers, donorBufferCounter[i]);
-//    for (int i = 0; i < (int) acceptorBufferCounter.size(); i++)
-//        maxBuffers = max(maxBuffers, acceptorBufferCounter[i]);
-//    cu.addForce(new CudaCustomHbondForceInfo(maxBuffers, force));
-//
-//    // Record exclusions.
-//
-//    vector<mm_int4> donorExclusionVector(numDonors, mm_int4(-1, -1, -1, -1));
-//    vector<mm_int4> acceptorExclusionVector(numAcceptors, mm_int4(-1, -1, -1, -1));
-//    for (int i = 0; i < force.getNumExclusions(); i++) {
-//        int donor, acceptor;
-//        force.getExclusionParticles(i, donor, acceptor);
-//        if (donor < startIndex || donor >= endIndex)
-//            continue;
-//        donor -= startIndex;
-//        if (donorExclusionVector[donor].x == -1)
-//            donorExclusionVector[donor].x = acceptor;
-//        else if (donorExclusionVector[donor].y == -1)
-//            donorExclusionVector[donor].y = acceptor;
-//        else if (donorExclusionVector[donor].z == -1)
-//            donorExclusionVector[donor].z = acceptor;
-//        else if (donorExclusionVector[donor].w == -1)
-//            donorExclusionVector[donor].w = acceptor;
-//        else
-//            throw OpenMMException("CustomHbondForce: CudaPlatform does not support more than four exclusions per donor");
-//        if (acceptorExclusionVector[acceptor].x == -1)
-//            acceptorExclusionVector[acceptor].x = donor;
-//        else if (acceptorExclusionVector[acceptor].y == -1)
-//            acceptorExclusionVector[acceptor].y = donor;
-//        else if (acceptorExclusionVector[acceptor].z == -1)
-//            acceptorExclusionVector[acceptor].z = donor;
-//        else if (acceptorExclusionVector[acceptor].w == -1)
-//            acceptorExclusionVector[acceptor].w = donor;
-//        else
-//            throw OpenMMException("CustomHbondForce: CudaPlatform does not support more than four exclusions per acceptor");
-//    }
-//    donorExclusions = new CudaArray<mm_int4>(cu, numDonors, "customHbondDonorExclusions");
-//    acceptorExclusions = new CudaArray<mm_int4>(cu, numDonors, "customHbondAcceptorExclusions");
-//    donorExclusions->upload(donorExclusionVector);
-//    acceptorExclusions->upload(acceptorExclusionVector);
-//
-//    // Record the tabulated functions.
-//
-//    CudaExpressionUtilities::FunctionPlaceholder fp;
-//    map<string, Lepton::CustomFunction*> functions;
-//    vector<pair<string, string> > functionDefinitions;
-//    vector<mm_float4> tabulatedFunctionParamsVec(force.getNumFunctions());
-//    stringstream tableArgs;
-//    for (int i = 0; i < force.getNumFunctions(); i++) {
-//        string name;
-//        vector<double> values;
-//        double min, max;
-//        force.getFunctionParameters(i, name, values, min, max);
-//        string arrayName = "table"+cu.intToString(i);
-//        functionDefinitions.push_back(make_pair(name, arrayName));
-//        functions[name] = &fp;
-//        tabulatedFunctionParamsVec[i] = mm_float4((float) min, (float) max, (float) ((values.size()-1)/(max-min)), (float) values.size()-2);
-//        vector<mm_float4> f = cu.getExpressionUtilities().computeFunctionCoefficients(values, min, max);
-//        tabulatedFunctions.push_back(new CudaArray<mm_float4>(cu, values.size()-1, "TabulatedFunction"));
-//        tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
-//        tableArgs << ", __global const float4* restrict " << arrayName;
-//    }
-//    if (force.getNumFunctions() > 0) {
-//        tabulatedFunctionParams = new CudaArray<mm_float4>(cu, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY);
-//        tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
-//        tableArgs << ", __global const float4* restrict functionParams";
-//    }
-//
-//    // Record information about parameters.
-//
-//    globalParamNames.resize(force.getNumGlobalParameters());
-//    globalParamValues.resize(force.getNumGlobalParameters());
-//    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
-//        globalParamNames[i] = force.getGlobalParameterName(i);
-//        globalParamValues[i] = (cl_float) force.getGlobalParameterDefaultValue(i);
-//    }
-//    if (globals != NULL)
-//        globals->upload(globalParamValues);
-//    map<string, string> variables;
-//    for (int i = 0; i < force.getNumPerDonorParameters(); i++) {
-//        const string& name = force.getPerDonorParameterName(i);
-//        variables[name] = "donorParams"+donorParams->getParameterSuffix(i);
-//    }
-//    for (int i = 0; i < force.getNumPerAcceptorParameters(); i++) {
-//        const string& name = force.getPerAcceptorParameterName(i);
-//        variables[name] = "acceptorParams"+acceptorParams->getParameterSuffix(i);
-//    }
-//    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
-//        const string& name = force.getGlobalParameterName(i);
-//        variables[name] = "globals["+cu.intToString(i)+"]";
-//    }
-//
-//    // Now to generate the kernel.  First, it needs to calculate all distances, angles,
-//    // and dihedrals the expression depends on.
-//
-//    map<string, vector<int> > distances;
-//    map<string, vector<int> > angles;
-//    map<string, vector<int> > dihedrals;
-//    Lepton::ParsedExpression energyExpression = CustomHbondForceImpl::prepareExpression(force, functions, distances, angles, dihedrals);
-//    map<string, Lepton::ParsedExpression> forceExpressions;
-//    set<string> computedDeltas;
-//    computedDeltas.insert("D1A1");
-//    string atomNames[] = {"A1", "A2", "A3", "D1", "D2", "D3"};
-//    string atomNamesLower[] = {"a1", "a2", "a3", "d1", "d2", "d3"};
-//    stringstream computeDonor, computeAcceptor, extraArgs;
-//    int index = 0;
-//    for (map<string, vector<int> >::const_iterator iter = distances.begin(); iter != distances.end(); ++iter, ++index) {
-//        const vector<int>& atoms = iter->second;
-//        string deltaName = atomNames[atoms[0]]+atomNames[atoms[1]];
-//        if (computedDeltas.count(deltaName) == 0) {
-//            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 delta"+deltaName+" = delta("+atomNamesLower[atoms[0]]+", "+atomNamesLower[atoms[1]]+");\n");
-//            computedDeltas.insert(deltaName);
-//        }
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float r_"+deltaName+" = sqrt(delta"+deltaName+".w);\n");
-//        variables[iter->first] = "r_"+deltaName;
-//        forceExpressions["float dEdDistance"+cu.intToString(index)+" = "] = energyExpression.differentiate(iter->first).optimize();
-//    }
-//    index = 0;
-//    for (map<string, vector<int> >::const_iterator iter = angles.begin(); iter != angles.end(); ++iter, ++index) {
-//        const vector<int>& atoms = iter->second;
-//        string deltaName1 = atomNames[atoms[1]]+atomNames[atoms[0]];
-//        string deltaName2 = atomNames[atoms[1]]+atomNames[atoms[2]];
-//        string angleName = "angle_"+atomNames[atoms[0]]+atomNames[atoms[1]]+atomNames[atoms[2]];
-//        if (computedDeltas.count(deltaName1) == 0) {
-//            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 delta"+deltaName1+" = delta("+atomNamesLower[atoms[1]]+", "+atomNamesLower[atoms[0]]+");\n");
-//            computedDeltas.insert(deltaName1);
-//        }
-//        if (computedDeltas.count(deltaName2) == 0) {
-//            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 delta"+deltaName2+" = delta("+atomNamesLower[atoms[1]]+", "+atomNamesLower[atoms[2]]+");\n");
-//            computedDeltas.insert(deltaName2);
-//        }
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float "+angleName+" = computeAngle(delta"+deltaName1+", delta"+deltaName2+");\n");
-//        variables[iter->first] = angleName;
-//        forceExpressions["float dEdAngle"+cu.intToString(index)+" = "] = energyExpression.differentiate(iter->first).optimize();
-//    }
-//    index = 0;
-//    for (map<string, vector<int> >::const_iterator iter = dihedrals.begin(); iter != dihedrals.end(); ++iter, ++index) {
-//        const vector<int>& atoms = iter->second;
-//        string deltaName1 = atomNames[atoms[0]]+atomNames[atoms[1]];
-//        string deltaName2 = atomNames[atoms[2]]+atomNames[atoms[1]];
-//        string deltaName3 = atomNames[atoms[2]]+atomNames[atoms[3]];
-//        string crossName1 = "cross_"+deltaName1+"_"+deltaName2;
-//        string crossName2 = "cross_"+deltaName2+"_"+deltaName3;
-//        string dihedralName = "dihedral_"+atomNames[atoms[0]]+atomNames[atoms[1]]+atomNames[atoms[2]]+atomNames[atoms[3]];
-//        if (computedDeltas.count(deltaName1) == 0) {
-//            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 delta"+deltaName1+" = delta("+atomNamesLower[atoms[0]]+", "+atomNamesLower[atoms[1]]+");\n");
-//            computedDeltas.insert(deltaName1);
-//        }
-//        if (computedDeltas.count(deltaName2) == 0) {
-//            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 delta"+deltaName2+" = delta("+atomNamesLower[atoms[2]]+", "+atomNamesLower[atoms[1]]+");\n");
-//            computedDeltas.insert(deltaName2);
-//        }
-//        if (computedDeltas.count(deltaName3) == 0) {
-//            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 delta"+deltaName3+" = delta("+atomNamesLower[atoms[2]]+", "+atomNamesLower[atoms[3]]+");\n");
-//            computedDeltas.insert(deltaName3);
-//        }
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 "+crossName1+" = computeCross(delta"+deltaName1+", delta"+deltaName2+");\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 "+crossName2+" = computeCross(delta"+deltaName2+", delta"+deltaName3+");\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float "+dihedralName+" = computeAngle("+crossName1+", "+crossName2+");\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, dihedralName+" *= (delta"+deltaName1+".x*"+crossName2+".x + delta"+deltaName1+".y*"+crossName2+".y + delta"+deltaName1+".z*"+crossName2+".z < 0 ? -1 : 1);\n");
-//        variables[iter->first] = dihedralName;
-//        forceExpressions["float dEdDihedral"+cu.intToString(index)+" = "] = energyExpression.differentiate(iter->first).optimize();
-//    }
-//
-//    // Next it needs to load parameters from global memory.
-//
-//    if (force.getNumGlobalParameters() > 0)
-//        extraArgs << ", __global const float* restrict globals";
-//    for (int i = 0; i < (int) donorParams->getBuffers().size(); i++) {
-//        const CudaNonbondedUtilities::ParameterInfo& buffer = donorParams->getBuffers()[i];
-//        extraArgs << ", __global const "+buffer.getType()+"* restrict donor"+buffer.getName();
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" donorParams"+cu.intToString(i+1)+" = donor"+buffer.getName()+"[index];\n");
-//    }
-//    for (int i = 0; i < (int) acceptorParams->getBuffers().size(); i++) {
-//        const CudaNonbondedUtilities::ParameterInfo& buffer = acceptorParams->getBuffers()[i];
-//        extraArgs << ", __global const "+buffer.getType()+"* restrict acceptor"+buffer.getName();
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" acceptorParams"+cu.intToString(i+1)+" = acceptor"+buffer.getName()+"[index];\n");
-//    }
-//
-//    // Now evaluate the expressions.
-//
-//    computeAcceptor << cu.getExpressionUtilities().createExpressions(forceExpressions, variables, functionDefinitions, "temp", "functionParams");
-//    forceExpressions["energy += "] = energyExpression;
-//    computeDonor << cu.getExpressionUtilities().createExpressions(forceExpressions, variables, functionDefinitions, "temp", "functionParams");
-//
-//    // Finally, apply forces to atoms.
-//
-//    index = 0;
-//    for (map<string, vector<int> >::const_iterator iter = distances.begin(); iter != distances.end(); ++iter, ++index) {
-//        const vector<int>& atoms = iter->second;
-//        string deltaName = atomNames[atoms[0]]+atomNames[atoms[1]];
-//        string value = "(dEdDistance"+cu.intToString(index)+"/r_"+deltaName+")*delta"+deltaName+".xyz";
-//        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[0], "-"+value);
-//        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[1], value);
-//    }
-//    index = 0;
-//    for (map<string, vector<int> >::const_iterator iter = angles.begin(); iter != angles.end(); ++iter, ++index) {
-//        const vector<int>& atoms = iter->second;
-//        string deltaName1 = atomNames[atoms[1]]+atomNames[atoms[0]];
-//        string deltaName2 = atomNames[atoms[1]]+atomNames[atoms[2]];
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "{\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 crossProd = cross(delta"+deltaName2+", delta"+deltaName1+");\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float lengthCross = max(length(crossProd), 1e-6f);\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 deltaCross0 = -cross(delta"+deltaName1+", crossProd)*dEdAngle"+cu.intToString(index)+"/(delta"+deltaName1+".w*lengthCross);\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 deltaCross2 = cross(delta"+deltaName2+", crossProd)*dEdAngle"+cu.intToString(index)+"/(delta"+deltaName2+".w*lengthCross);\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 deltaCross1 = -(deltaCross0+deltaCross2);\n");
-//        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[0], "deltaCross0.xyz");
-//        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[1], "deltaCross1.xyz");
-//        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[2], "deltaCross2.xyz");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "}\n");
-//    }
-//    index = 0;
-//    for (map<string, vector<int> >::const_iterator iter = dihedrals.begin(); iter != dihedrals.end(); ++iter, ++index) {
-//        const vector<int>& atoms = iter->second;
-//        string deltaName1 = atomNames[atoms[0]]+atomNames[atoms[1]];
-//        string deltaName2 = atomNames[atoms[2]]+atomNames[atoms[1]];
-//        string deltaName3 = atomNames[atoms[2]]+atomNames[atoms[3]];
-//        string crossName1 = "cross_"+deltaName1+"_"+deltaName2;
-//        string crossName2 = "cross_"+deltaName2+"_"+deltaName3;
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "{\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float r = sqrt(delta"+deltaName2+".w);\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 ff;\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "ff.x = (-dEdDihedral"+cu.intToString(index)+"*r)/"+crossName1+".w;\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "ff.y = (delta"+deltaName1+".x*delta"+deltaName2+".x + delta"+deltaName1+".y*delta"+deltaName2+".y + delta"+deltaName1+".z*delta"+deltaName2+".z)/delta"+deltaName2+".w;\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "ff.z = (delta"+deltaName3+".x*delta"+deltaName2+".x + delta"+deltaName3+".y*delta"+deltaName2+".y + delta"+deltaName3+".z*delta"+deltaName2+".z)/delta"+deltaName2+".w;\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "ff.w = (dEdDihedral"+cu.intToString(index)+"*r)/"+crossName2+".w;\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 internalF0 = ff.x*"+crossName1+";\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 internalF3 = ff.w*"+crossName2+";\n");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "float4 s = ff.y*internalF0 - ff.z*internalF3;\n");
-//        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[0], "internalF0.xyz");
-//        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[1], "s.xyz-internalF0.xyz");
-//        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[2], "-s.xyz-internalF3.xyz");
-//        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[3], "internalF3.xyz");
-//        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "}\n");
-//    }
-//
-//    // Generate the kernels.
-//
-//    map<string, string> replacements;
-//    replacements["COMPUTE_DONOR_FORCE"] = computeDonor.str();
-//    replacements["COMPUTE_ACCEPTOR_FORCE"] = computeAcceptor.str();
-//    replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
-//    map<string, string> defines;
-//    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//    defines["NUM_DONORS"] = cu.intToString(numDonors);
-//    defines["NUM_ACCEPTORS"] = cu.intToString(numAcceptors);
-//    defines["M_PI"] = cu.doubleToString(M_PI);
-//    if (force.getNonbondedMethod() != CustomHbondForce::NoCutoff) {
-//        defines["USE_CUTOFF"] = "1";
-//        defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
-//    }
-//    if (force.getNonbondedMethod() != CustomHbondForce::NoCutoff && force.getNonbondedMethod() != CustomHbondForce::CutoffNonPeriodic)
-//        defines["USE_PERIODIC"] = "1";
-//    if (force.getNumExclusions() > 0)
-//        defines["USE_EXCLUSIONS"] = "1";
-//    CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customHbondForce, replacements), defines);
-//    donorKernel = cu.getKernel(module, "computeDonorForces");
-//    acceptorKernel = cu.getKernel(module, "computeAcceptorForces");
-//}
-//
-//double CudaCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-//    if (numDonors == 0 || numAcceptors == 0)
-//        return 0.0;
-//    if (globals != NULL) {
-//        bool changed = false;
-//        for (int i = 0; i < (int) globalParamNames.size(); i++) {
-//            cl_float value = (cl_float) context.getParameter(globalParamNames[i]);
-//            if (value != globalParamValues[i])
-//                changed = true;
-//            globalParamValues[i] = value;
-//        }
-//        if (changed)
-//            globals->upload(globalParamValues);
-//    }
-//    if (!hasInitializedKernel) {
-//        hasInitializedKernel = true;
-//        int index = 0;
-//        donorKernel.setArg<cu::Buffer>(index++, cu.getForceBuffers().getDevicePointer());
-//        donorKernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
-//        donorKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
-//        donorKernel.setArg<cu::Buffer>(index++, donorExclusions->getDevicePointer());
-//        donorKernel.setArg<cu::Buffer>(index++, donors->getDevicePointer());
-//        donorKernel.setArg<cu::Buffer>(index++, acceptors->getDevicePointer());
-//        donorKernel.setArg<cu::Buffer>(index++, donorBufferIndices->getDevicePointer());
-//        donorKernel.setArg(index++, 3*CudaContext::ThreadBlockSize*sizeof(mm_float4), NULL);
-//        index += 2; // Periodic box size arguments are set when the kernel is executed.
-//        if (globals != NULL)
-//            donorKernel.setArg<cu::Buffer>(index++, globals->getDevicePointer());
-//        for (int i = 0; i < (int) donorParams->getBuffers().size(); i++) {
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = donorParams->getBuffers()[i];
-//            donorKernel.setArg<cu::Memory>(index++, buffer.getMemory());
-//        }
-//        for (int i = 0; i < (int) acceptorParams->getBuffers().size(); i++) {
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = acceptorParams->getBuffers()[i];
-//            donorKernel.setArg<cu::Memory>(index++, buffer.getMemory());
-//        }
-//        if (tabulatedFunctionParams != NULL) {
-//            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
-//                donorKernel.setArg<cu::Buffer>(index++, tabulatedFunctions[i]->getDevicePointer());
-//            donorKernel.setArg<cu::Buffer>(index++, tabulatedFunctionParams->getDevicePointer());
-//        }
-//        index = 0;
-//        acceptorKernel.setArg<cu::Buffer>(index++, cu.getForceBuffers().getDevicePointer());
-//        acceptorKernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
-//        acceptorKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
-//        acceptorKernel.setArg<cu::Buffer>(index++, acceptorExclusions->getDevicePointer());
-//        acceptorKernel.setArg<cu::Buffer>(index++, donors->getDevicePointer());
-//        acceptorKernel.setArg<cu::Buffer>(index++, acceptors->getDevicePointer());
-//        acceptorKernel.setArg<cu::Buffer>(index++, acceptorBufferIndices->getDevicePointer());
-//        acceptorKernel.setArg(index++, 3*CudaContext::ThreadBlockSize*sizeof(mm_float4), NULL);
-//        index += 2; // Periodic box size arguments are set when the kernel is executed.
-//        if (globals != NULL)
-//            acceptorKernel.setArg<cu::Buffer>(index++, globals->getDevicePointer());
-//        for (int i = 0; i < (int) donorParams->getBuffers().size(); i++) {
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = donorParams->getBuffers()[i];
-//            acceptorKernel.setArg<cu::Memory>(index++, buffer.getMemory());
-//        }
-//        for (int i = 0; i < (int) acceptorParams->getBuffers().size(); i++) {
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = acceptorParams->getBuffers()[i];
-//            acceptorKernel.setArg<cu::Memory>(index++, buffer.getMemory());
-//        }
-//        if (tabulatedFunctionParams != NULL) {
-//            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
-//                acceptorKernel.setArg<cu::Buffer>(index++, tabulatedFunctions[i]->getDevicePointer());
-//            acceptorKernel.setArg<cu::Buffer>(index++, tabulatedFunctionParams->getDevicePointer());
-//        }
-//    }
-//    donorKernel.setArg<mm_float4>(8, cu.getPeriodicBoxSize());
-//    donorKernel.setArg<mm_float4>(9, cu.getInvPeriodicBoxSize());
-//    cu.executeKernel(donorKernel, max(numDonors, numAcceptors));
-//    acceptorKernel.setArg<mm_float4>(8, cu.getPeriodicBoxSize());
-//    acceptorKernel.setArg<mm_float4>(9, cu.getInvPeriodicBoxSize());
-//    cu.executeKernel(acceptorKernel, max(numDonors, numAcceptors));
-//    return 0.0;
-//}
-//
-//void CudaCalcCustomHbondForceKernel::copyParametersToContext(ContextImpl& context, const CustomHbondForce& force) {
-//    cu.setAsCurrent();
-//    int numContexts = cu.getPlatformData().contexts.size();
-//    int startIndex = cu.getContextIndex()*force.getNumDonors()/numContexts;
-//    int endIndex = (cu.getContextIndex()+1)*force.getNumDonors()/numContexts;
-//    if (numDonors != endIndex-startIndex)
-//        throw OpenMMException("updateParametersInContext: The number of donors has changed");
-//    if (numAcceptors != force.getNumAcceptors())
-//        throw OpenMMException("updateParametersInContext: The number of acceptors has changed");
-//    
-//    // Record the per-donor parameters.
-//    
-//    vector<vector<cl_float> > donorParamVector(numDonors);
-//    vector<double> parameters;
-//    for (int i = 0; i < numDonors; i++) {
-//        int d1, d2, d3;
-//        force.getDonorParameters(startIndex+i, d1, d2, d3, parameters);
-//        donorParamVector[i].resize(parameters.size());
-//        for (int j = 0; j < (int) parameters.size(); j++)
-//            donorParamVector[i][j] = (cl_float) parameters[j];
-//    }
-//    donorParams->setParameterValues(donorParamVector);
-//    
-//    // Record the per-acceptor parameters.
-//    
-//    vector<vector<cl_float> > acceptorParamVector(numAcceptors);
-//    for (int i = 0; i < numAcceptors; i++) {
-//        int a1, a2, a3;
-//        force.getAcceptorParameters(i, a1, a2, a3, parameters);
-//        acceptorParamVector[i].resize(parameters.size());
-//        for (int j = 0; j < (int) parameters.size(); j++)
-//            acceptorParamVector[i][j] = (cl_float) parameters[j];
-//    }
-//    acceptorParams->setParameterValues(acceptorParamVector);
-//    
-//    // Mark that the current reordering may be invalid.
-//    
-//    cu.invalidateMolecules();
-//}
+class CudaCustomHbondForceInfo : public CudaForceInfo {
+public:
+    CudaCustomHbondForceInfo(const CustomHbondForce& force) : force(force) {
+    }
+    bool areParticlesIdentical(int particle1, int particle2) {
+        return true;
+    }
+    int getNumParticleGroups() {
+        return force.getNumDonors()+force.getNumAcceptors()+force.getNumExclusions();
+    }
+    void getParticlesInGroup(int index, vector<int>& particles) {
+        int p1, p2, p3;
+        vector<double> parameters;
+        if (index < force.getNumDonors()) {
+            force.getDonorParameters(index, p1, p2, p3, parameters);
+            particles.clear();
+            particles.push_back(p1);
+            if (p2 > -1)
+                particles.push_back(p2);
+            if (p3 > -1)
+                particles.push_back(p3);
+            return;
+        }
+        index -= force.getNumDonors();
+        if (index < force.getNumAcceptors()) {
+            force.getAcceptorParameters(index, p1, p2, p3, parameters);
+            particles.clear();
+            particles.push_back(p1);
+            if (p2 > -1)
+                particles.push_back(p2);
+            if (p3 > -1)
+                particles.push_back(p3);
+            return;
+        }
+        index -= force.getNumAcceptors();
+        int donor, acceptor;
+        force.getExclusionParticles(index, donor, acceptor);
+        particles.clear();
+        force.getDonorParameters(donor, p1, p2, p3, parameters);
+        particles.push_back(p1);
+        if (p2 > -1)
+            particles.push_back(p2);
+        if (p3 > -1)
+            particles.push_back(p3);
+        force.getAcceptorParameters(acceptor, p1, p2, p3, parameters);
+        particles.push_back(p1);
+        if (p2 > -1)
+            particles.push_back(p2);
+        if (p3 > -1)
+            particles.push_back(p3);
+    }
+    bool areGroupsIdentical(int group1, int group2) {
+        int p1, p2, p3;
+        vector<double> params1, params2;
+        if (group1 < force.getNumDonors() && group2 < force.getNumDonors()) {
+            force.getDonorParameters(group1, p1, p2, p3, params1);
+            force.getDonorParameters(group2, p1, p2, p3, params2);
+            return (params1 == params2 && params1 == params2);
+        }
+        if (group1 < force.getNumDonors() || group2 < force.getNumDonors())
+            return false;
+        group1 -= force.getNumDonors();
+        group2 -= force.getNumDonors();
+        if (group1 < force.getNumAcceptors() && group2 < force.getNumAcceptors()) {
+            force.getAcceptorParameters(group1, p1, p2, p3, params1);
+            force.getAcceptorParameters(group2, p1, p2, p3, params2);
+            return (params1 == params2 && params1 == params2);
+        }
+        if (group1 < force.getNumAcceptors() || group2 < force.getNumAcceptors())
+            return false;
+        return true;
+    }
+private:
+    const CustomHbondForce& force;
+};
+
+CudaCalcCustomHbondForceKernel::~CudaCalcCustomHbondForceKernel() {
+    cu.setAsCurrent();
+    if (donorParams != NULL)
+        delete donorParams;
+    if (acceptorParams != NULL)
+        delete acceptorParams;
+    if (donors != NULL)
+        delete donors;
+    if (acceptors != NULL)
+        delete acceptors;
+    if (globals != NULL)
+        delete globals;
+    if (donorExclusions != NULL)
+        delete donorExclusions;
+    if (acceptorExclusions != NULL)
+        delete acceptorExclusions;
+    if (tabulatedFunctionParams != NULL)
+        delete tabulatedFunctionParams;
+    for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
+        delete tabulatedFunctions[i];
+}
+
+static void addDonorAndAcceptorCode(stringstream& computeDonor, stringstream& computeAcceptor, const string& value) {
+    computeDonor << value;
+    computeAcceptor << value;
+}
+
+static void applyDonorAndAcceptorForces(stringstream& applyToDonor, stringstream& applyToAcceptor, int atom, const string& value) {
+    string forceNames[] = {"f1", "f2", "f3"};
+    if (atom < 3)
+        applyToAcceptor << forceNames[atom]<<" += trim("<<value<<");\n";
+    else
+        applyToDonor << forceNames[atom-3]<<" += trim("<<value<<");\n";
+}
+
+void CudaCalcCustomHbondForceKernel::initialize(const System& system, const CustomHbondForce& force) {
+    // Record the lists of donors and acceptors, and the parameters for each one.
+
+    cu.setAsCurrent();
+    int numContexts = cu.getPlatformData().contexts.size();
+    int startIndex = cu.getContextIndex()*force.getNumDonors()/numContexts;
+    int endIndex = (cu.getContextIndex()+1)*force.getNumDonors()/numContexts;
+    numDonors = endIndex-startIndex;
+    numAcceptors = force.getNumAcceptors();
+    if (numDonors == 0 || numAcceptors == 0)
+        return;
+    int numParticles = system.getNumParticles();
+    donors = CudaArray::create<int4>(cu, numDonors, "customHbondDonors");
+    acceptors = CudaArray::create<int4>(cu, numAcceptors, "customHbondAcceptors");
+    donorParams = new CudaParameterSet(cu, force.getNumPerDonorParameters(), numDonors, "customHbondDonorParameters");
+    acceptorParams = new CudaParameterSet(cu, force.getNumPerAcceptorParameters(), numAcceptors, "customHbondAcceptorParameters");
+    if (force.getNumGlobalParameters() > 0)
+        globals = CudaArray::create<float>(cu, force.getNumGlobalParameters(), "customHbondGlobals");
+    vector<vector<float> > donorParamVector(numDonors);
+    vector<int4> donorVector(numDonors);
+    for (int i = 0; i < numDonors; i++) {
+        vector<double> parameters;
+        force.getDonorParameters(startIndex+i, donorVector[i].x, donorVector[i].y, donorVector[i].z, parameters);
+        donorParamVector[i].resize(parameters.size());
+        for (int j = 0; j < (int) parameters.size(); j++)
+            donorParamVector[i][j] = (float) parameters[j];
+    }
+    donors->upload(donorVector);
+    donorParams->setParameterValues(donorParamVector);
+    vector<vector<float> > acceptorParamVector(numAcceptors);
+    vector<int4> acceptorVector(numAcceptors);
+    for (int i = 0; i < numAcceptors; i++) {
+        vector<double> parameters;
+        force.getAcceptorParameters(i, acceptorVector[i].x, acceptorVector[i].y, acceptorVector[i].z, parameters);
+        acceptorParamVector[i].resize(parameters.size());
+        for (int j = 0; j < (int) parameters.size(); j++)
+            acceptorParamVector[i][j] = (float) parameters[j];
+    }
+    acceptors->upload(acceptorVector);
+    acceptorParams->setParameterValues(acceptorParamVector);
+    cu.addForce(new CudaCustomHbondForceInfo(force));
+
+    // Record exclusions.
+
+    vector<int4> donorExclusionVector(numDonors, make_int4(-1, -1, -1, -1));
+    vector<int4> acceptorExclusionVector(numAcceptors, make_int4(-1, -1, -1, -1));
+    for (int i = 0; i < force.getNumExclusions(); i++) {
+        int donor, acceptor;
+        force.getExclusionParticles(i, donor, acceptor);
+        if (donor < startIndex || donor >= endIndex)
+            continue;
+        donor -= startIndex;
+        if (donorExclusionVector[donor].x == -1)
+            donorExclusionVector[donor].x = acceptor;
+        else if (donorExclusionVector[donor].y == -1)
+            donorExclusionVector[donor].y = acceptor;
+        else if (donorExclusionVector[donor].z == -1)
+            donorExclusionVector[donor].z = acceptor;
+        else if (donorExclusionVector[donor].w == -1)
+            donorExclusionVector[donor].w = acceptor;
+        else
+            throw OpenMMException("CustomHbondForce: CudaPlatform does not support more than four exclusions per donor");
+        if (acceptorExclusionVector[acceptor].x == -1)
+            acceptorExclusionVector[acceptor].x = donor;
+        else if (acceptorExclusionVector[acceptor].y == -1)
+            acceptorExclusionVector[acceptor].y = donor;
+        else if (acceptorExclusionVector[acceptor].z == -1)
+            acceptorExclusionVector[acceptor].z = donor;
+        else if (acceptorExclusionVector[acceptor].w == -1)
+            acceptorExclusionVector[acceptor].w = donor;
+        else
+            throw OpenMMException("CustomHbondForce: CudaPlatform does not support more than four exclusions per acceptor");
+    }
+    donorExclusions = CudaArray::create<int4>(cu, numDonors, "customHbondDonorExclusions");
+    acceptorExclusions = CudaArray::create<int4>(cu, numAcceptors, "customHbondAcceptorExclusions");
+    donorExclusions->upload(donorExclusionVector);
+    acceptorExclusions->upload(acceptorExclusionVector);
+
+    // Record the tabulated functions.
+
+    CudaExpressionUtilities::FunctionPlaceholder fp;
+    map<string, Lepton::CustomFunction*> functions;
+    vector<pair<string, string> > functionDefinitions;
+    vector<float4> tabulatedFunctionParamsVec(force.getNumFunctions());
+    stringstream tableArgs;
+    for (int i = 0; i < force.getNumFunctions(); i++) {
+        string name;
+        vector<double> values;
+        double min, max;
+        force.getFunctionParameters(i, name, values, min, max);
+        string arrayName = "table"+cu.intToString(i);
+        functionDefinitions.push_back(make_pair(name, arrayName));
+        functions[name] = &fp;
+        tabulatedFunctionParamsVec[i] = make_float4((float) min, (float) max, (float) ((values.size()-1)/(max-min)), (float) values.size()-2);
+        vector<float4> f = cu.getExpressionUtilities().computeFunctionCoefficients(values, min, max);
+        tabulatedFunctions.push_back(CudaArray::create<float4>(cu, values.size()-1, "TabulatedFunction"));
+        tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
+        tableArgs << ", const float4* __restrict__ " << arrayName;
+    }
+    if (force.getNumFunctions() > 0) {
+        tabulatedFunctionParams = CudaArray::create<float4>(cu, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters");
+        tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
+        tableArgs << ", const float4* __restrict__ functionParams";
+    }
+
+    // Record information about parameters.
+
+    globalParamNames.resize(force.getNumGlobalParameters());
+    globalParamValues.resize(force.getNumGlobalParameters());
+    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
+        globalParamNames[i] = force.getGlobalParameterName(i);
+        globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
+    }
+    if (globals != NULL)
+        globals->upload(globalParamValues);
+    map<string, string> variables;
+    for (int i = 0; i < force.getNumPerDonorParameters(); i++) {
+        const string& name = force.getPerDonorParameterName(i);
+        variables[name] = "donorParams"+donorParams->getParameterSuffix(i);
+    }
+    for (int i = 0; i < force.getNumPerAcceptorParameters(); i++) {
+        const string& name = force.getPerAcceptorParameterName(i);
+        variables[name] = "acceptorParams"+acceptorParams->getParameterSuffix(i);
+    }
+    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
+        const string& name = force.getGlobalParameterName(i);
+        variables[name] = "globals["+cu.intToString(i)+"]";
+    }
+
+    // Now to generate the kernel.  First, it needs to calculate all distances, angles,
+    // and dihedrals the expression depends on.
+
+    map<string, vector<int> > distances;
+    map<string, vector<int> > angles;
+    map<string, vector<int> > dihedrals;
+    Lepton::ParsedExpression energyExpression = CustomHbondForceImpl::prepareExpression(force, functions, distances, angles, dihedrals);
+    map<string, Lepton::ParsedExpression> forceExpressions;
+    set<string> computedDeltas;
+    computedDeltas.insert("D1A1");
+    string atomNames[] = {"A1", "A2", "A3", "D1", "D2", "D3"};
+    string atomNamesLower[] = {"a1", "a2", "a3", "d1", "d2", "d3"};
+    stringstream computeDonor, computeAcceptor, extraArgs;
+    int index = 0;
+    for (map<string, vector<int> >::const_iterator iter = distances.begin(); iter != distances.end(); ++iter, ++index) {
+        const vector<int>& atoms = iter->second;
+        string deltaName = atomNames[atoms[0]]+atomNames[atoms[1]];
+        if (computedDeltas.count(deltaName) == 0) {
+            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName+" = delta("+atomNamesLower[atoms[0]]+", "+atomNamesLower[atoms[1]]+");\n");
+            computedDeltas.insert(deltaName);
+        }
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real r_"+deltaName+" = SQRT(delta"+deltaName+".w);\n");
+        variables[iter->first] = "r_"+deltaName;
+        forceExpressions["real dEdDistance"+cu.intToString(index)+" = "] = energyExpression.differentiate(iter->first).optimize();
+    }
+    index = 0;
+    for (map<string, vector<int> >::const_iterator iter = angles.begin(); iter != angles.end(); ++iter, ++index) {
+        const vector<int>& atoms = iter->second;
+        string deltaName1 = atomNames[atoms[1]]+atomNames[atoms[0]];
+        string deltaName2 = atomNames[atoms[1]]+atomNames[atoms[2]];
+        string angleName = "angle_"+atomNames[atoms[0]]+atomNames[atoms[1]]+atomNames[atoms[2]];
+        if (computedDeltas.count(deltaName1) == 0) {
+            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName1+" = delta("+atomNamesLower[atoms[1]]+", "+atomNamesLower[atoms[0]]+");\n");
+            computedDeltas.insert(deltaName1);
+        }
+        if (computedDeltas.count(deltaName2) == 0) {
+            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName2+" = delta("+atomNamesLower[atoms[1]]+", "+atomNamesLower[atoms[2]]+");\n");
+            computedDeltas.insert(deltaName2);
+        }
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real "+angleName+" = computeAngle(delta"+deltaName1+", delta"+deltaName2+");\n");
+        variables[iter->first] = angleName;
+        forceExpressions["real dEdAngle"+cu.intToString(index)+" = "] = energyExpression.differentiate(iter->first).optimize();
+    }
+    index = 0;
+    for (map<string, vector<int> >::const_iterator iter = dihedrals.begin(); iter != dihedrals.end(); ++iter, ++index) {
+        const vector<int>& atoms = iter->second;
+        string deltaName1 = atomNames[atoms[0]]+atomNames[atoms[1]];
+        string deltaName2 = atomNames[atoms[2]]+atomNames[atoms[1]];
+        string deltaName3 = atomNames[atoms[2]]+atomNames[atoms[3]];
+        string crossName1 = "cross_"+deltaName1+"_"+deltaName2;
+        string crossName2 = "cross_"+deltaName2+"_"+deltaName3;
+        string dihedralName = "dihedral_"+atomNames[atoms[0]]+atomNames[atoms[1]]+atomNames[atoms[2]]+atomNames[atoms[3]];
+        if (computedDeltas.count(deltaName1) == 0) {
+            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName1+" = delta("+atomNamesLower[atoms[0]]+", "+atomNamesLower[atoms[1]]+");\n");
+            computedDeltas.insert(deltaName1);
+        }
+        if (computedDeltas.count(deltaName2) == 0) {
+            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName2+" = delta("+atomNamesLower[atoms[2]]+", "+atomNamesLower[atoms[1]]+");\n");
+            computedDeltas.insert(deltaName2);
+        }
+        if (computedDeltas.count(deltaName3) == 0) {
+            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName3+" = delta("+atomNamesLower[atoms[2]]+", "+atomNamesLower[atoms[3]]+");\n");
+            computedDeltas.insert(deltaName3);
+        }
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 "+crossName1+" = computeCross(delta"+deltaName1+", delta"+deltaName2+");\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 "+crossName2+" = computeCross(delta"+deltaName2+", delta"+deltaName3+");\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real "+dihedralName+" = computeAngle("+crossName1+", "+crossName2+");\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, dihedralName+" *= (delta"+deltaName1+".x*"+crossName2+".x + delta"+deltaName1+".y*"+crossName2+".y + delta"+deltaName1+".z*"+crossName2+".z < 0 ? -1 : 1);\n");
+        variables[iter->first] = dihedralName;
+        forceExpressions["real dEdDihedral"+cu.intToString(index)+" = "] = energyExpression.differentiate(iter->first).optimize();
+    }
+
+    // Next it needs to load parameters from global memory.
+
+    if (force.getNumGlobalParameters() > 0)
+        extraArgs << ", const float* __restrict__ globals";
+    for (int i = 0; i < (int) donorParams->getBuffers().size(); i++) {
+        const CudaNonbondedUtilities::ParameterInfo& buffer = donorParams->getBuffers()[i];
+        extraArgs << ", const "+buffer.getType()+"* __restrict__ donor"+buffer.getName();
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" donorParams"+cu.intToString(i+1)+" = donor"+buffer.getName()+"[index];\n");
+    }
+    for (int i = 0; i < (int) acceptorParams->getBuffers().size(); i++) {
+        const CudaNonbondedUtilities::ParameterInfo& buffer = acceptorParams->getBuffers()[i];
+        extraArgs << ", const "+buffer.getType()+"* __restrict__ acceptor"+buffer.getName();
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" acceptorParams"+cu.intToString(i+1)+" = acceptor"+buffer.getName()+"[index];\n");
+    }
+
+    // Now evaluate the expressions.
+
+    computeAcceptor << cu.getExpressionUtilities().createExpressions(forceExpressions, variables, functionDefinitions, "temp", "functionParams");
+    forceExpressions["energy += "] = energyExpression;
+    computeDonor << cu.getExpressionUtilities().createExpressions(forceExpressions, variables, functionDefinitions, "temp", "functionParams");
+
+    // Finally, apply forces to atoms.
+
+    index = 0;
+    for (map<string, vector<int> >::const_iterator iter = distances.begin(); iter != distances.end(); ++iter, ++index) {
+        const vector<int>& atoms = iter->second;
+        string deltaName = atomNames[atoms[0]]+atomNames[atoms[1]];
+        string value = "(dEdDistance"+cu.intToString(index)+"/r_"+deltaName+")*delta"+deltaName;
+        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[0], "-"+value);
+        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[1], value);
+    }
+    index = 0;
+    for (map<string, vector<int> >::const_iterator iter = angles.begin(); iter != angles.end(); ++iter, ++index) {
+        const vector<int>& atoms = iter->second;
+        string deltaName1 = atomNames[atoms[1]]+atomNames[atoms[0]];
+        string deltaName2 = atomNames[atoms[1]]+atomNames[atoms[2]];
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "{\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real3 crossProd = cross(delta"+deltaName2+", delta"+deltaName1+");\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real lengthCross = max(SQRT(dot(crossProd,crossProd)), 1e-6f);\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real3 deltaCross0 = -cross(trim(delta"+deltaName1+"), crossProd)*dEdAngle"+cu.intToString(index)+"/(delta"+deltaName1+".w*lengthCross);\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real3 deltaCross2 = cross(trim(delta"+deltaName2+"), crossProd)*dEdAngle"+cu.intToString(index)+"/(delta"+deltaName2+".w*lengthCross);\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real3 deltaCross1 = -(deltaCross0+deltaCross2);\n");
+        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[0], "deltaCross0");
+        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[1], "deltaCross1");
+        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[2], "deltaCross2");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "}\n");
+    }
+    index = 0;
+    for (map<string, vector<int> >::const_iterator iter = dihedrals.begin(); iter != dihedrals.end(); ++iter, ++index) {
+        const vector<int>& atoms = iter->second;
+        string deltaName1 = atomNames[atoms[0]]+atomNames[atoms[1]];
+        string deltaName2 = atomNames[atoms[2]]+atomNames[atoms[1]];
+        string deltaName3 = atomNames[atoms[2]]+atomNames[atoms[3]];
+        string crossName1 = "cross_"+deltaName1+"_"+deltaName2;
+        string crossName2 = "cross_"+deltaName2+"_"+deltaName3;
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "{\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real r = SQRT(delta"+deltaName2+".w);\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 ff;\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "ff.x = (-dEdDihedral"+cu.intToString(index)+"*r)/"+crossName1+".w;\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "ff.y = (delta"+deltaName1+".x*delta"+deltaName2+".x + delta"+deltaName1+".y*delta"+deltaName2+".y + delta"+deltaName1+".z*delta"+deltaName2+".z)/delta"+deltaName2+".w;\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "ff.z = (delta"+deltaName3+".x*delta"+deltaName2+".x + delta"+deltaName3+".y*delta"+deltaName2+".y + delta"+deltaName3+".z*delta"+deltaName2+".z)/delta"+deltaName2+".w;\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "ff.w = (dEdDihedral"+cu.intToString(index)+"*r)/"+crossName2+".w;\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 internalF0 = ff.x*"+crossName1+";\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 internalF3 = ff.w*"+crossName2+";\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 s = ff.y*internalF0 - ff.z*internalF3;\n");
+        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[0], "internalF0");
+        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[1], "s-internalF0");
+        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[2], "-s-internalF3");
+        applyDonorAndAcceptorForces(computeDonor, computeAcceptor, atoms[3], "internalF3");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "}\n");
+    }
+
+    // Generate the kernels.
+
+    map<string, string> replacements;
+    replacements["COMPUTE_DONOR_FORCE"] = computeDonor.str();
+    replacements["COMPUTE_ACCEPTOR_FORCE"] = computeAcceptor.str();
+    replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
+    map<string, string> defines;
+    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+    defines["NUM_DONORS"] = cu.intToString(numDonors);
+    defines["NUM_ACCEPTORS"] = cu.intToString(numAcceptors);
+    defines["M_PI"] = cu.doubleToString(M_PI);
+    if (force.getNonbondedMethod() != CustomHbondForce::NoCutoff) {
+        defines["USE_CUTOFF"] = "1";
+        defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
+    }
+    if (force.getNonbondedMethod() != CustomHbondForce::NoCutoff && force.getNonbondedMethod() != CustomHbondForce::CutoffNonPeriodic)
+        defines["USE_PERIODIC"] = "1";
+    if (force.getNumExclusions() > 0)
+        defines["USE_EXCLUSIONS"] = "1";
+    CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customHbondForce, replacements), defines);
+    donorKernel = cu.getKernel(module, "computeDonorForces");
+    acceptorKernel = cu.getKernel(module, "computeAcceptorForces");
+}
+
+double CudaCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    if (numDonors == 0 || numAcceptors == 0)
+        return 0.0;
+    if (globals != NULL) {
+        bool changed = false;
+        for (int i = 0; i < (int) globalParamNames.size(); i++) {
+            float value = (float) context.getParameter(globalParamNames[i]);
+            if (value != globalParamValues[i])
+                changed = true;
+            globalParamValues[i] = value;
+        }
+        if (changed)
+            globals->upload(globalParamValues);
+    }
+    if (!hasInitializedKernel) {
+        hasInitializedKernel = true;
+        int index = 0;
+        donorArgs.push_back(&cu.getForce().getDevicePointer());
+        donorArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
+        donorArgs.push_back(&cu.getPosq().getDevicePointer());
+        donorArgs.push_back(&donorExclusions->getDevicePointer());
+        donorArgs.push_back(&donors->getDevicePointer());
+        donorArgs.push_back(&acceptors->getDevicePointer());
+        donorArgs.push_back(cu.getPeriodicBoxSizePointer());
+        donorArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+        if (globals != NULL)
+            donorArgs.push_back(&globals->getDevicePointer());
+        for (int i = 0; i < (int) donorParams->getBuffers().size(); i++) {
+            CudaNonbondedUtilities::ParameterInfo& buffer = donorParams->getBuffers()[i];
+            donorArgs.push_back(&buffer.getMemory());
+        }
+        for (int i = 0; i < (int) acceptorParams->getBuffers().size(); i++) {
+            CudaNonbondedUtilities::ParameterInfo& buffer = acceptorParams->getBuffers()[i];
+            donorArgs.push_back(&buffer.getMemory());
+        }
+        if (tabulatedFunctionParams != NULL) {
+            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
+                donorArgs.push_back(&tabulatedFunctions[i]->getDevicePointer());
+            donorArgs.push_back(&tabulatedFunctionParams->getDevicePointer());
+        }
+        index = 0;
+        acceptorArgs.push_back(&cu.getForce().getDevicePointer());
+        acceptorArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
+        acceptorArgs.push_back(&cu.getPosq().getDevicePointer());
+        acceptorArgs.push_back(&acceptorExclusions->getDevicePointer());
+        acceptorArgs.push_back(&donors->getDevicePointer());
+        acceptorArgs.push_back(&acceptors->getDevicePointer());
+        acceptorArgs.push_back(cu.getPeriodicBoxSizePointer());
+        acceptorArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+        if (globals != NULL)
+            acceptorArgs.push_back(&globals->getDevicePointer());
+        for (int i = 0; i < (int) donorParams->getBuffers().size(); i++) {
+            CudaNonbondedUtilities::ParameterInfo& buffer = donorParams->getBuffers()[i];
+            acceptorArgs.push_back(&buffer.getMemory());
+        }
+        for (int i = 0; i < (int) acceptorParams->getBuffers().size(); i++) {
+            CudaNonbondedUtilities::ParameterInfo& buffer = acceptorParams->getBuffers()[i];
+            acceptorArgs.push_back(&buffer.getMemory());
+        }
+        if (tabulatedFunctionParams != NULL) {
+            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
+                acceptorArgs.push_back(&tabulatedFunctions[i]->getDevicePointer());
+            acceptorArgs.push_back(&tabulatedFunctionParams->getDevicePointer());
+        }
+    }
+    int sharedMemorySize = 3*CudaContext::ThreadBlockSize*sizeof(float4);
+    cu.executeKernel(donorKernel, &donorArgs[0], max(numDonors, numAcceptors), CudaContext::ThreadBlockSize, sharedMemorySize);
+    cu.executeKernel(acceptorKernel, &acceptorArgs[0], max(numDonors, numAcceptors), CudaContext::ThreadBlockSize, sharedMemorySize);
+    return 0.0;
+}
+
+void CudaCalcCustomHbondForceKernel::copyParametersToContext(ContextImpl& context, const CustomHbondForce& force) {
+    cu.setAsCurrent();
+    int numContexts = cu.getPlatformData().contexts.size();
+    int startIndex = cu.getContextIndex()*force.getNumDonors()/numContexts;
+    int endIndex = (cu.getContextIndex()+1)*force.getNumDonors()/numContexts;
+    if (numDonors != endIndex-startIndex)
+        throw OpenMMException("updateParametersInContext: The number of donors has changed");
+    if (numAcceptors != force.getNumAcceptors())
+        throw OpenMMException("updateParametersInContext: The number of acceptors has changed");
+    
+    // Record the per-donor parameters.
+    
+    vector<vector<float> > donorParamVector(numDonors);
+    vector<double> parameters;
+    for (int i = 0; i < numDonors; i++) {
+        int d1, d2, d3;
+        force.getDonorParameters(startIndex+i, d1, d2, d3, parameters);
+        donorParamVector[i].resize(parameters.size());
+        for (int j = 0; j < (int) parameters.size(); j++)
+            donorParamVector[i][j] = (float) parameters[j];
+    }
+    donorParams->setParameterValues(donorParamVector);
+    
+    // Record the per-acceptor parameters.
+    
+    vector<vector<float> > acceptorParamVector(numAcceptors);
+    for (int i = 0; i < numAcceptors; i++) {
+        int a1, a2, a3;
+        force.getAcceptorParameters(i, a1, a2, a3, parameters);
+        acceptorParamVector[i].resize(parameters.size());
+        for (int j = 0; j < (int) parameters.size(); j++)
+            acceptorParamVector[i][j] = (float) parameters[j];
+    }
+    acceptorParams->setParameterValues(acceptorParamVector);
+    
+    // Mark that the current reordering may be invalid.
+    
+    cu.invalidateMolecules();
+}

 class CudaCustomCompoundBondForceInfo : public CudaForceInfo {
 public:
@@ -4288,679 +4255,784 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons

    // Update the time and step count.

-    double dt, time;
+    double dt, time;
+    if (cu.getUseDoublePrecision()) {
+        double2 stepSize;
+        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
+        dt = stepSize.y;
+        time = cu.getTime()+dt;
+        if (dt == maxStepSize)
+            time = maxTime; // Avoid round-off error
+    }
+    else {
+        float2 stepSize;
+        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
+        dt = stepSize.y;
+        time = cu.getTime()+dt;
+        if (dt == maxStepSizeFloat)
+            time = maxTime; // Avoid round-off error
+    }
+    cu.setTime(time);
+    cu.setStepCount(cu.getStepCount()+1);
+    return dt;
+}
+
+CudaIntegrateVariableLangevinStepKernel::~CudaIntegrateVariableLangevinStepKernel() {
+    cu.setAsCurrent();
+    if (params != NULL)
+        delete params;
+}
+
+void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, const VariableLangevinIntegrator& integrator) {
+    cu.setAsCurrent();
+    cu.getPlatformData().initializeContexts(system);
+    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
+    map<string, string> defines;
+    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+    CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
+    kernel1 = cu.getKernel(module, "integrateLangevinPart1");
+    kernel2 = cu.getKernel(module, "integrateLangevinPart2");
+    selectSizeKernel = cu.getKernel(module, "selectLangevinStepSize");
+    params = CudaArray::create<float>(cu, 3, "langevinParams");
+    blockSize = min(256, system.getNumParticles());
+    blockSize = max(blockSize, params->getSize());
+}
+
+double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {
+    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
+    int numAtoms = cu.getNumAtoms();
+
+    // Select the step size to use.
+
+    double maxStepSize = maxTime-cu.getTime();
+    float maxStepSizeFloat = (float) maxStepSize;
+    double tol = integrator.getErrorTolerance();
+    float tolFloat = (float) tol;
+    double tau = integrator.getFriction() == 0.0 ? 0.0 : 1.0/integrator.getFriction();
+    float tauFloat = (float) tau;
+    double kT = BOLTZ*integrator.getTemperature();
+    float kTFloat = (float) kT;
+    void* argsSelect[] = {cu.getUseDoublePrecision() ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
+            cu.getUseDoublePrecision() ? (void*) &tol : (void*) &tolFloat,
+            cu.getUseDoublePrecision() ? (void*) &tau : (void*) &tauFloat,
+            cu.getUseDoublePrecision() ? (void*) &kT : (void*) &kTFloat,
+            &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
+            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &params->getDevicePointer()};
+    int sharedSize = blockSize*(cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+    cu.executeKernel(selectSizeKernel, argsSelect, blockSize, blockSize, sharedSize);
+
+    // Call the first integration kernel.
+
+    int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
+    void* args1[] = {&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
+            &params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
+    cu.executeKernel(kernel1, args1, numAtoms);
+
+    // Apply constraints.
+
+    integration.applyConstraints(integrator.getConstraintTolerance());
+
+    // Call the second integration kernel.
+
+    void* args2[] = {&cu.getPosq().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
+            &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
+    cu.executeKernel(kernel2, args2, numAtoms);
+    integration.computeVirtualSites();
+
+    // Update the time and step count.
+
+    double dt, time;
+    if (cu.getUseDoublePrecision()) {
+        double2 stepSize;
+        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
+        dt = stepSize.y;
+        time = cu.getTime()+dt;
+        if (dt == maxStepSize)
+            time = maxTime; // Avoid round-off error
+    }
+    else {
+        float2 stepSize;
+        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
+        dt = stepSize.y;
+        time = cu.getTime()+dt;
+        if (dt == maxStepSizeFloat)
+            time = maxTime; // Avoid round-off error
+    }
+    cu.setTime(time);
+    cu.setStepCount(cu.getStepCount()+1);
+    return dt;
+}
+
+class CudaIntegrateCustomStepKernel::ReorderListener : public CudaContext::ReorderListener {
+public:
+    ReorderListener(CudaContext& cu, CudaParameterSet& perDofValues, vector<vector<float> >& localPerDofValuesFloat, vector<vector<double> >& localPerDofValuesDouble, bool& deviceValuesAreCurrent) :
+            cu(cu), perDofValues(perDofValues), localPerDofValuesFloat(localPerDofValuesFloat), localPerDofValuesDouble(localPerDofValuesDouble), deviceValuesAreCurrent(deviceValuesAreCurrent) {
+        int numAtoms = cu.getNumAtoms();
+        lastAtomOrder.resize(numAtoms);
+        for (int i = 0; i < numAtoms; i++)
+            lastAtomOrder[i] = cu.getAtomIndex()[i];
+    }
+    void execute() {
+        // Reorder the per-DOF variables to reflect the new atom order.
+
+        if (perDofValues.getNumParameters() == 0)
+            return;
+        int numAtoms = cu.getNumAtoms();
+        const vector<int>& order = cu.getAtomIndex();
+        if (cu.getUseDoublePrecision()) {
+            if (deviceValuesAreCurrent)
+                perDofValues.getParameterValues(localPerDofValuesDouble);
+            vector<vector<double> > swap(3*numAtoms);
+            for (int i = 0; i < numAtoms; i++) {
+                swap[3*lastAtomOrder[i]] = localPerDofValuesDouble[3*i];
+                swap[3*lastAtomOrder[i]+1] = localPerDofValuesDouble[3*i+1];
+                swap[3*lastAtomOrder[i]+2] = localPerDofValuesDouble[3*i+2];
+            }
+            for (int i = 0; i < numAtoms; i++) {
+                localPerDofValuesDouble[3*i] = swap[3*order[i]];
+                localPerDofValuesDouble[3*i+1] = swap[3*order[i]+1];
+                localPerDofValuesDouble[3*i+2] = swap[3*order[i]+2];
+            }
+            perDofValues.setParameterValues(localPerDofValuesDouble);
+        }
+        else {
+            if (deviceValuesAreCurrent)
+                perDofValues.getParameterValues(localPerDofValuesFloat);
+            vector<vector<float> > swap(3*numAtoms);
+            for (int i = 0; i < numAtoms; i++) {
+                swap[3*lastAtomOrder[i]] = localPerDofValuesFloat[3*i];
+                swap[3*lastAtomOrder[i]+1] = localPerDofValuesFloat[3*i+1];
+                swap[3*lastAtomOrder[i]+2] = localPerDofValuesFloat[3*i+2];
+            }
+            for (int i = 0; i < numAtoms; i++) {
+                localPerDofValuesFloat[3*i] = swap[3*order[i]];
+                localPerDofValuesFloat[3*i+1] = swap[3*order[i]+1];
+                localPerDofValuesFloat[3*i+2] = swap[3*order[i]+2];
+            }
+            perDofValues.setParameterValues(localPerDofValuesFloat);
+        }
+        for (int i = 0; i < numAtoms; i++)
+            lastAtomOrder[i] = order[i];
+        deviceValuesAreCurrent = true;
+    }
+private:
+    CudaContext& cu;
+    CudaParameterSet& perDofValues;
+    vector<vector<float> >& localPerDofValuesFloat;
+    vector<vector<double> >& localPerDofValuesDouble;
+    bool& deviceValuesAreCurrent;
+    vector<int> lastAtomOrder;
+};
+
+CudaIntegrateCustomStepKernel::~CudaIntegrateCustomStepKernel() {
+    cu.setAsCurrent();
+    if (globalValues != NULL)
+        delete globalValues;
+    if (contextParameterValues != NULL)
+        delete contextParameterValues;
+    if (sumBuffer != NULL)
+        delete sumBuffer;
+    if (energy != NULL)
+        delete energy;
+    if (uniformRandoms != NULL)
+        delete uniformRandoms;
+    if (randomSeed != NULL)
+        delete randomSeed;
+    if (perDofValues != NULL)
+        delete perDofValues;
+}
+
+void CudaIntegrateCustomStepKernel::initialize(const System& system, const CustomIntegrator& integrator) {
+    cu.setAsCurrent();
+    cu.getPlatformData().initializeContexts(system);
+    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
+    numGlobalVariables = integrator.getNumGlobalVariables();
+    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+    globalValues = new CudaArray(cu, max(1, numGlobalVariables), elementSize, "globalVariables");
+    sumBuffer = new CudaArray(cu, 3*system.getNumParticles(), elementSize, "sumBuffer");
+    energy = new CudaArray(cu, 1, elementSize, "energy");
+    perDofValues = new CudaParameterSet(cu, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cu.getUseDoublePrecision());
+    cu.addReorderListener(new ReorderListener(cu, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));
+    prevStepSize = -1.0;
+    SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
+}
+
+string CudaIntegrateCustomStepKernel::createGlobalComputation(const string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator, const string& energyName) {
+    map<string, Lepton::ParsedExpression> expressions;
+    if (variable == "dt")
+        expressions["dt[0].y = "] = expr;
+    else {
+        for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
+            if (variable == integrator.getGlobalVariableName(i))
+                expressions["globals["+cu.intToString(i)+"] = "] = expr;
+        for (int i = 0; i < (int) parameterNames.size(); i++)
+            if (variable == parameterNames[i]) {
+                expressions["params["+cu.intToString(i)+"] = "] = expr;
+                modifiesParameters = true;
+            }
+    }
+    if (expressions.size() == 0)
+        throw OpenMMException("Unknown global variable: "+variable);
+    map<string, string> variables;
+    variables["dt"] = "dt[0].y";
+    variables["uniform"] = "uniform";
+    variables["gaussian"] = "gaussian";
+    variables[energyName] = "energy[0]";
+    for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
+        variables[integrator.getGlobalVariableName(i)] = "globals["+cu.intToString(i)+"]";
+    for (int i = 0; i < (int) parameterNames.size(); i++)
+        variables[parameterNames[i]] = "params["+cu.intToString(i)+"]";
+    vector<pair<string, string> > functions;
+    return cu.getExpressionUtilities().createExpressions(expressions, variables, functions, "temp", "");
+}
+
+string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const string& forceName, const string& energyName) {
+    const string suffixes[] = {".x", ".y", ".z"};
+    string suffix = suffixes[component];
+    map<string, Lepton::ParsedExpression> expressions;
+    if (variable == "x")
+        expressions["position"+suffix+" = "] = expr;
+    else if (variable == "v")
+        expressions["velocity"+suffix+" = "] = expr;
+    else if (variable == "")
+        expressions["sum[3*index+"+cu.intToString(component)+"] = "] = expr;
+    else {
+        for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
+            if (variable == integrator.getPerDofVariableName(i))
+                expressions["perDof"+suffix.substr(1)+perDofValues->getParameterSuffix(i)+" = "] = expr;
+    }
+    if (expressions.size() == 0)
+        throw OpenMMException("Unknown per-DOF variable: "+variable);
+    map<string, string> variables;
+    variables["x"] = "position"+suffix;
+    variables["v"] = "velocity"+suffix;
+    variables[forceName] = "f"+suffix;
+    variables["gaussian"] = "gaussian"+suffix;
+    variables["uniform"] = "uniform"+suffix;
+    variables["m"] = "mass";
+    variables["dt"] = "stepSize";
+    variables[energyName] = "energy[0]";
+    for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
+        variables[integrator.getGlobalVariableName(i)] = "globals["+cu.intToString(i)+"]";
+    for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
+        variables[integrator.getPerDofVariableName(i)] = "perDof"+suffix.substr(1)+perDofValues->getParameterSuffix(i);
+    for (int i = 0; i < (int) parameterNames.size(); i++)
+        variables[parameterNames[i]] = "params["+cu.intToString(i)+"]";
+    vector<pair<string, string> > functions;
+    return cu.getExpressionUtilities().createExpressions(expressions, variables, functions, "temp"+cu.intToString(component)+"_", "", "double");
+}
+
+void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid) {
+    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
+    int numAtoms = cu.getNumAtoms();
+    int numSteps = integrator.getNumComputations();
+    if (!hasInitializedKernels) {
+        hasInitializedKernels = true;
+        
+        // Initialize various data structures.
+        
+        const map<string, double>& params = context.getParameters();
+        if (cu.getUseDoublePrecision()) {
+            contextParameterValues = CudaArray::create<double>(cu, max(1, (int) params.size()), "contextParameters");
+            contextValuesDouble.resize(contextParameterValues->getSize());
+            for (map<string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter) {
+                contextValuesDouble[parameterNames.size()] = iter->second;
+                parameterNames.push_back(iter->first);
+            }
+            contextParameterValues->upload(contextValuesDouble);
+        }
+        else {
+            contextParameterValues = CudaArray::create<float>(cu, max(1, (int) params.size()), "contextParameters");
+            contextValuesFloat.resize(contextParameterValues->getSize());
+            for (map<string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter) {
+                contextValuesFloat[parameterNames.size()] = (float) iter->second;
+                parameterNames.push_back(iter->first);
+            }
+            contextParameterValues->upload(contextValuesFloat);
+        }
+        kernels.resize(integrator.getNumComputations());
+        kernelArgs.resize(integrator.getNumComputations());
+        requiredGaussian.resize(integrator.getNumComputations(), 0);
+        requiredUniform.resize(integrator.getNumComputations(), 0);
+        needsForces.resize(numSteps, false);
+        needsEnergy.resize(numSteps, false);
+        forceGroup.resize(numSteps, -2);
+        invalidatesForces.resize(numSteps, false);
+        merged.resize(numSteps, false);
+        modifiesParameters = false;
+        map<string, string> defines;
+        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+        defines["WORK_GROUP_SIZE"] = cu.intToString(CudaContext::ThreadBlockSize);
+        defines["SUM_BUFFER_SIZE"] = "0";
+        defines["SUM_OUTPUT_INDEX"] = "0";
+        
+        // Initialize the random number generator.
+        
+        uniformRandoms = CudaArray::create<float4>(cu, cu.getNumAtoms(), "uniformRandoms");
+        randomSeed = CudaArray::create<int4>(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed");
+        vector<int4> seed(randomSeed->getSize());
+        unsigned int r = integrator.getRandomNumberSeed()+1;
+        for (int i = 0; i < randomSeed->getSize(); i++) {
+            seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
+            seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
+            seed[i].z = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
+            seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
+        }
+        randomSeed->upload(seed);
+        CUmodule randomProgram = cu.createModule(CudaKernelSources::customIntegrator, defines);
+        randomKernel = cu.getKernel(randomProgram, "generateRandomNumbers");
+        
+        // Build a list of all variables that affect the forces, so we can tell which
+        // steps invalidate them.
+        
+        set<string> affectsForce;
+        affectsForce.insert("x");
+        for (vector<ForceImpl*>::const_iterator iter = context.getForceImpls().begin(); iter != context.getForceImpls().end(); ++iter) {
+            const map<string, double> params = (*iter)->getDefaultParameters();
+            for (map<string, double>::const_iterator param = params.begin(); param != params.end(); ++param)
+                affectsForce.insert(param->first);
+        }
+        
+        // Record information about all the computation steps.
+        
+        stepType.resize(numSteps);
+        vector<string> variable(numSteps);
+        vector<Lepton::ParsedExpression> expression(numSteps);
+        vector<string> forceGroupName;
+        vector<string> energyGroupName;
+        for (int i = 0; i < 32; i++) {
+            stringstream fname;
+            fname << "f" << i;
+            forceGroupName.push_back(fname.str());
+            stringstream ename;
+            ename << "energy" << i;
+            energyGroupName.push_back(ename.str());
+        }
+        vector<string> forceName(numSteps, "f");
+        vector<string> energyName(numSteps, "energy");
+        for (int step = 0; step < numSteps; step++) {
+            string expr;
+            integrator.getComputationStep(step, stepType[step], variable[step], expr);
+            if (expr.size() > 0) {
+                expression[step] = Lepton::Parser::parse(expr).optimize();
+                if (usesVariable(expression[step], "f")) {
+                    needsForces[step] = true;
+                    forceGroup[step] = -1;
+                }
+                if (usesVariable(expression[step], "energy")) {
+                    needsEnergy[step] = true;
+                    forceGroup[step] = -1;
+                }
+                for (int i = 0; i < 32; i++) {
+                    if (usesVariable(expression[step], forceGroupName[i])) {
+                        if (forceGroup[step] != -2)
+                            throw OpenMMException("A single computation step cannot depend on multiple force groups");
+                        needsForces[step] = true;
+                        forceGroup[step] = 1<<i;
+                        forceName[step] = forceGroupName[i];
+                    }
+                    if (usesVariable(expression[step], energyGroupName[i])) {
+                        if (forceGroup[step] != -2)
+                            throw OpenMMException("A single computation step cannot depend on multiple force groups");
+                        needsEnergy[step] = true;
+                        forceGroup[step] = 1<<i;
+                        energyName[step] = energyGroupName[i];
+                    }
+                }
+            }
+            invalidatesForces[step] = (stepType[step] == CustomIntegrator::ConstrainPositions || affectsForce.find(variable[step]) != affectsForce.end());
+            if (forceGroup[step] == -2 && step > 0)
+                forceGroup[step] = forceGroup[step-1];
+        }
+        
+        // Determine how each step will represent the position (as just a value, or a value plus a delta).
+        
+        vector<bool> storePosAsDelta(numSteps, false);
+        vector<bool> loadPosAsDelta(numSteps, false);
+        bool beforeConstrain = false;
+        for (int step = numSteps-1; step >= 0; step--) {
+            if (stepType[step] == CustomIntegrator::ConstrainPositions)
+                beforeConstrain = true;
+            else if (stepType[step] == CustomIntegrator::ComputePerDof && variable[step] == "x" && beforeConstrain)
+                storePosAsDelta[step] = true;
+        }
+        bool storedAsDelta = false;
+        for (int step = 0; step < numSteps; step++) {
+            loadPosAsDelta[step] = storedAsDelta;
+            if (storePosAsDelta[step] == true)
+                storedAsDelta = true;
+            if (stepType[step] == CustomIntegrator::ConstrainPositions)
+                storedAsDelta = false;
+        }
+        
+        // Identify steps that can be merged into a single kernel.
+        
+        for (int step = 1; step < numSteps; step++) {
+            if (needsForces[step] || needsEnergy[step])
+                continue;
+            if (stepType[step-1] == CustomIntegrator::ComputeGlobal && stepType[step] == CustomIntegrator::ComputeGlobal)
+                merged[step] = true;
+            if (stepType[step-1] == CustomIntegrator::ComputePerDof && stepType[step] == CustomIntegrator::ComputePerDof &&
+                    !usesVariable(expression[step], "uniform"))
+                merged[step] = true;
+        }
+        
+        // Loop over all steps and create the kernels for them.
+        
+        for (int step = 0; step < numSteps; step++) {
+            if ((stepType[step] == CustomIntegrator::ComputePerDof || stepType[step] == CustomIntegrator::ComputeSum) && !merged[step]) {
+                // Compute a per-DOF value.
+                
+                stringstream compute;
+                for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) {
+                    CudaNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i];
+                    compute << buffer.getType()<<" perDofx"<<cu.intToString(i+1)<<" = perDofValues"<<cu.intToString(i+1)<<"[3*index];\n";
+                    compute << buffer.getType()<<" perDofy"<<cu.intToString(i+1)<<" = perDofValues"<<cu.intToString(i+1)<<"[3*index+1];\n";
+                    compute << buffer.getType()<<" perDofz"<<cu.intToString(i+1)<<" = perDofValues"<<cu.intToString(i+1)<<"[3*index+2];\n";
+                }
+                int numGaussian = 0, numUniform = 0;
+                for (int j = step; j < numSteps && (j == step || merged[j]); j++) {
+                    compute << "{\n";
+                    for (int i = 0; i < 3; i++)
+                        compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j], i, integrator, forceName[j], energyName[j]);
+                    if (variable[j] == "x") {
+                        if (storePosAsDelta[j])
+                            compute << "posDelta[index] = convertFromDouble4(position-convertToDouble4(posq[index]));\n";
+                        else
+                            compute << "posq[index] = convertFromDouble4(position);\n";
+                    }
+                    else if (variable[j] == "v")
+                        compute << "velm[index] = convertFromDouble4(velocity);\n";
+                    else {
+                        for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) {
+                            CudaNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i];
+                            compute << "perDofValues"<<cu.intToString(i+1)<<"[3*index] = perDofx"<<cu.intToString(i+1)<<";\n";
+                            compute << "perDofValues"<<cu.intToString(i+1)<<"[3*index+1] = perDofy"<<cu.intToString(i+1)<<";\n";
+                            compute << "perDofValues"<<cu.intToString(i+1)<<"[3*index+2] = perDofz"<<cu.intToString(i+1)<<";\n";
+                        }
+                    }
+                    compute << "}\n";
+                    numGaussian += numAtoms*usesVariable(expression[j], "gaussian");
+                    numUniform += numAtoms*usesVariable(expression[j], "uniform");
+                }
+                map<string, string> replacements;
+                replacements["COMPUTE_STEP"] = compute.str();
+                stringstream args;
+                for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) {
+                    CudaNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i];
+                    string valueName = "perDofValues"+cu.intToString(i+1);
+                    args << ", " << buffer.getType() << "* __restrict__ " << valueName;
+                }
+                replacements["PARAMETER_ARGUMENTS"] = args.str();
+                if (loadPosAsDelta[step])
+                    defines["LOAD_POS_AS_DELTA"] = "1";
+                else if (defines.find("LOAD_POS_AS_DELTA") != defines.end())
+                    defines.erase("LOAD_POS_AS_DELTA");
+                CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customIntegratorPerDof, replacements), defines);
+                CUfunction kernel = cu.getKernel(module, "computePerDof");
+                kernels[step].push_back(kernel);
+                requiredGaussian[step] = numGaussian;
+                requiredUniform[step] = numUniform;
+                vector<void*> args1;
+                args1.push_back(&cu.getPosq().getDevicePointer());
+                args1.push_back(&integration.getPosDelta().getDevicePointer());
+                args1.push_back(&cu.getVelm().getDevicePointer());
+                args1.push_back(&cu.getForce().getDevicePointer());
+                args1.push_back(&integration.getStepSize().getDevicePointer());
+                args1.push_back(&globalValues->getDevicePointer());
+                args1.push_back(&contextParameterValues->getDevicePointer());
+                args1.push_back(&sumBuffer->getDevicePointer());
+                args1.push_back(&integration.getRandom().getDevicePointer());
+                args1.push_back(NULL);
+                args1.push_back(&uniformRandoms->getDevicePointer());
+                args1.push_back(&energy->getDevicePointer());
+                for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
+                    args1.push_back(&perDofValues->getBuffers()[i].getMemory());
+                kernelArgs[step].push_back(args1);
+                if (stepType[step] == CustomIntegrator::ComputeSum) {
+                    // Create a second kernel for this step that sums the values.
+
+                    vector<void*> args2;
+                    args2.push_back(&sumBuffer->getDevicePointer());
+                    bool found = false;
+                    for (int j = 0; j < integrator.getNumGlobalVariables() && !found; j++)
+                        if (variable[step] == integrator.getGlobalVariableName(j)) {
+                            args2.push_back(&globalValues->getDevicePointer());
+                            defines["SUM_OUTPUT_INDEX"] = cu.intToString(j);
+                            found = true;
+                        }
+                    for (int j = 0; j < (int) parameterNames.size() && !found; j++)
+                        if (variable[step] == parameterNames[j]) {
+                            args2.push_back(&contextParameterValues->getDevicePointer());
+                            defines["SUM_OUTPUT_INDEX"] = cu.intToString(j);
+                            found = true;
+                            modifiesParameters = true;
+                        }
+                    if (!found)
+                        throw OpenMMException("Unknown global variable: "+variable[step]);
+                    defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms);
+                    module = cu.createModule(CudaKernelSources::customIntegrator, defines);
+                    kernel = cu.getKernel(module, "computeSum");
+                    kernels[step].push_back(kernel);
+                    kernelArgs[step].push_back(args2);
+                }
+            }
+            else if (stepType[step] == CustomIntegrator::ComputeGlobal && !merged[step]) {
+                // Compute a global value.
+
+                stringstream compute;
+                for (int i = step; i < numSteps && (i == step || merged[i]); i++)
+                    compute << "{\n" << createGlobalComputation(variable[i], expression[i], integrator, energyName[i]) << "}\n";
+                map<string, string> replacements;
+                replacements["COMPUTE_STEP"] = compute.str();
+                CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customIntegratorGlobal, replacements), defines);
+                CUfunction kernel = cu.getKernel(module, "computeGlobal");
+                kernels[step].push_back(kernel);
+                vector<void*> args;
+                args.push_back(&integration.getStepSize().getDevicePointer());
+                args.push_back(&globalValues->getDevicePointer());
+                args.push_back(&contextParameterValues->getDevicePointer());
+                args.push_back(NULL);
+                args.push_back(NULL);
+                args.push_back(&energy->getDevicePointer());
+                kernelArgs[step].push_back(args);
+            }
+            else if (stepType[step] == CustomIntegrator::ConstrainPositions) {
+                // Apply position constraints.
+
+                CUmodule module = cu.createModule(CudaKernelSources::customIntegrator, defines);
+                CUfunction kernel = cu.getKernel(module, "applyPositionDeltas");
+                kernels[step].push_back(kernel);
+                vector<void*> args;
+                args.push_back(&cu.getPosq().getDevicePointer());
+                args.push_back(&integration.getPosDelta().getDevicePointer());
+                kernelArgs[step].push_back(args);
+            }
+        }
+        
+        // Create the kernel for summing energy.
+
+        defines["SUM_OUTPUT_INDEX"] = "0";
+        defines["SUM_BUFFER_SIZE"] = cu.intToString(cu.getEnergyBuffer().getSize());
+        CUmodule module = cu.createModule(CudaKernelSources::customIntegrator, defines);
+        sumEnergyKernel = cu.getKernel(module, "computeSum");
+    }
+    
+    // Make sure all values (variables, parameters, etc.) stored on the device are up to date.
+    
+    if (!deviceValuesAreCurrent) {
+        if (cu.getUseDoublePrecision())
+            perDofValues->setParameterValues(localPerDofValuesDouble);
+        else
+            perDofValues->setParameterValues(localPerDofValuesFloat);
+        deviceValuesAreCurrent = true;
+    }
+    localValuesAreCurrent = false;
+    double stepSize = integrator.getStepSize();
+    if (stepSize != prevStepSize) {
+        if (cu.getUseDoublePrecision()) {
+            double size[] = {0, stepSize};
+            integration.getStepSize().upload(size);
+        }
+        else {
+            float size[] = {0, (float) stepSize};
+            integration.getStepSize().upload(size);
+        }
+        prevStepSize = stepSize;
+    }
+    bool paramsChanged = false;
    if (cu.getUseDoublePrecision()) {
-        double2 stepSize;
-        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-        dt = stepSize.y;
-        time = cu.getTime()+dt;
-        if (dt == maxStepSize)
-            time = maxTime; // Avoid round-off error
+        for (int i = 0; i < (int) parameterNames.size(); i++) {
+            double value = context.getParameter(parameterNames[i]);
+            if (value != contextValuesDouble[i]) {
+                contextValuesDouble[i] = value;
+                paramsChanged = true;
+            }
+        }
+        if (paramsChanged)
+            contextParameterValues->upload(contextValuesDouble);
    }
    else {
-        float2 stepSize;
-        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-        dt = stepSize.y;
-        time = cu.getTime()+dt;
-        if (dt == maxStepSizeFloat)
-            time = maxTime; // Avoid round-off error
+        for (int i = 0; i < (int) parameterNames.size(); i++) {
+            float value = (float) context.getParameter(parameterNames[i]);
+            if (value != contextValuesFloat[i]) {
+                contextValuesFloat[i] = value;
+                paramsChanged = true;
+            }
+        }
+        if (paramsChanged)
+            contextParameterValues->upload(contextValuesFloat);
+    }
+
+    // Loop over computation steps in the integrator and execute them.
+
+    void* randomArgs[] = {&uniformRandoms->getDevicePointer(), &randomSeed->getDevicePointer()};
+    for (int i = 0; i < numSteps; i++) {
+        if ((needsForces[i] || needsEnergy[i]) && (!forcesAreValid || context.getLastForceGroups() != forceGroup[i])) {
+            // Recompute forces and/or energy.  Figure out what is actually needed
+            // between now and the next time they get invalidated again.
+            
+            bool computeForce = false, computeEnergy = false;
+            for (int j = i; ; j++) {
+                if (needsForces[j])
+                    computeForce = true;
+                if (needsEnergy[j])
+                    computeEnergy = true;
+                if (invalidatesForces[j])
+                    break;
+                if (j == numSteps-1)
+                    j = -1;
+                if (j == i-1)
+                    break;
+            }
+            recordChangedParameters(context);
+            context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroup[i]);
+            if (computeEnergy) {
+                void* args[] = {&cu.getEnergyBuffer().getDevicePointer(), &energy->getDevicePointer()};
+                cu.executeKernel(sumEnergyKernel, &args[0], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
+            }
+            forcesAreValid = true;
+        }
+        if (stepType[i] == CustomIntegrator::ComputePerDof && !merged[i]) {
+            int randomIndex = integration.prepareRandomNumbers(requiredGaussian[i]);
+            kernelArgs[i][0][9] = &randomIndex;
+            if (requiredUniform[i] > 0)
+                cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
+            cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms);
+        }
+        else if (stepType[i] == CustomIntegrator::ComputeGlobal && !merged[i]) {
+            float uniform = SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber();
+            float gauss = SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
+            kernelArgs[i][0][3] = &uniform;
+            kernelArgs[i][0][4] = &gauss;
+            cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], 1, 1);
+        }
+        else if (stepType[i] == CustomIntegrator::ComputeSum) {
+            int randomIndex = integration.prepareRandomNumbers(requiredGaussian[i]);
+            kernelArgs[i][0][9] = &randomIndex;
+            if (requiredUniform[i] > 0)
+                cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
+            cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms);
+            cu.executeKernel(kernels[i][1], &kernelArgs[i][1][0], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
+        }
+        else if (stepType[i] == CustomIntegrator::UpdateContextState) {
+            recordChangedParameters(context);
+            context.updateContextState();
+        }
+        else if (stepType[i] == CustomIntegrator::ConstrainPositions) {
+            cu.getIntegrationUtilities().applyConstraints(integrator.getConstraintTolerance());
+            cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms);
+            cu.getIntegrationUtilities().computeVirtualSites();
+        }
+        else if (stepType[i] == CustomIntegrator::ConstrainVelocities) {
+            cu.getIntegrationUtilities().applyVelocityConstraints(integrator.getConstraintTolerance());
+        }
+        if (invalidatesForces[i])
+            forcesAreValid = false;
    }
-    cu.setTime(time);
+    recordChangedParameters(context);
+
+    // Update the time and step count.
+
+    cu.setTime(cu.getTime()+stepSize);
    cu.setStepCount(cu.getStepCount()+1);
-    return dt;
 }

-CudaIntegrateVariableLangevinStepKernel::~CudaIntegrateVariableLangevinStepKernel() {
-    cu.setAsCurrent();
-    if (params != NULL)
-        delete params;
+void CudaIntegrateCustomStepKernel::recordChangedParameters(ContextImpl& context) {
+    if (!modifiesParameters)
+        return;
+    if (cu.getUseDoublePrecision()) {
+        contextParameterValues->download(contextValuesDouble);
+        for (int i = 0; i < (int) parameterNames.size(); i++) {
+            double value = context.getParameter(parameterNames[i]);
+            if (value != contextValuesDouble[i])
+                context.setParameter(parameterNames[i], contextValuesDouble[i]);
+        }
+    }
+    else {
+        contextParameterValues->download(contextValuesFloat);
+        for (int i = 0; i < (int) parameterNames.size(); i++) {
+            float value = (float) context.getParameter(parameterNames[i]);
+            if (value != contextValuesFloat[i])
+                context.setParameter(parameterNames[i], contextValuesFloat[i]);
+        }
+    }
 }

-void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, const VariableLangevinIntegrator& integrator) {
-    cu.setAsCurrent();
-    cu.getPlatformData().initializeContexts(system);
-    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
-    map<string, string> defines;
-    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-    CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
-    kernel1 = cu.getKernel(module, "integrateLangevinPart1");
-    kernel2 = cu.getKernel(module, "integrateLangevinPart2");
-    selectSizeKernel = cu.getKernel(module, "selectLangevinStepSize");
-    params = CudaArray::create<float>(cu, 3, "langevinParams");
-    blockSize = min(256, system.getNumParticles());
-    blockSize = max(blockSize, params->getSize());
+void CudaIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, vector<double>& values) const {
+    values.resize(numGlobalVariables);
+    if (numGlobalVariables == 0)
+        return;
+    if (cu.getUseDoublePrecision())
+        globalValues->download(values);
+    else {
+        vector<float> buffer;
+        globalValues->download(buffer);
+        for (int i = 0; i < numGlobalVariables; i++)
+            values[i] = buffer[i];
+    }
 }

-double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {
-    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
-    int numAtoms = cu.getNumAtoms();
-
-    // Select the step size to use.
-
-    double maxStepSize = maxTime-cu.getTime();
-    float maxStepSizeFloat = (float) maxStepSize;
-    double tol = integrator.getErrorTolerance();
-    float tolFloat = (float) tol;
-    double tau = integrator.getFriction() == 0.0 ? 0.0 : 1.0/integrator.getFriction();
-    float tauFloat = (float) tau;
-    double kT = BOLTZ*integrator.getTemperature();
-    float kTFloat = (float) kT;
-    void* argsSelect[] = {cu.getUseDoublePrecision() ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
-            cu.getUseDoublePrecision() ? (void*) &tol : (void*) &tolFloat,
-            cu.getUseDoublePrecision() ? (void*) &tau : (void*) &tauFloat,
-            cu.getUseDoublePrecision() ? (void*) &kT : (void*) &kTFloat,
-            &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
-            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &params->getDevicePointer()};
-    int sharedSize = blockSize*(cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-    cu.executeKernel(selectSizeKernel, argsSelect, blockSize, blockSize, sharedSize);
-
-    // Call the first integration kernel.
-
-    int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
-    void* args1[] = {&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
-            &params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
-    cu.executeKernel(kernel1, args1, numAtoms);
-
-    // Apply constraints.
-
-    integration.applyConstraints(integrator.getConstraintTolerance());
-
-    // Call the second integration kernel.
-
-    void* args2[] = {&cu.getPosq().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
-            &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
-    cu.executeKernel(kernel2, args2, numAtoms);
-    integration.computeVirtualSites();
-
-    // Update the time and step count.
+void CudaIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, const vector<double>& values) {
+    if (numGlobalVariables == 0)
+        return;
+    if (cu.getUseDoublePrecision())
+        globalValues->upload(values);
+    else {
+        vector<float> buffer(numGlobalVariables);
+        for (int i = 0; i < numGlobalVariables; i++)
+            buffer[i] = (float) values[i];
+        globalValues->upload(buffer);
+    }
+}

-    double dt, time;
+void CudaIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int variable, vector<Vec3>& values) const {
+    values.resize(perDofValues->getNumObjects()/3);
+    const vector<int>& order = cu.getAtomIndex();
    if (cu.getUseDoublePrecision()) {
-        double2 stepSize;
-        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-        dt = stepSize.y;
-        time = cu.getTime()+dt;
-        if (dt == maxStepSize)
-            time = maxTime; // Avoid round-off error
+        if (!localValuesAreCurrent) {
+            perDofValues->getParameterValues(localPerDofValuesDouble);
+            localValuesAreCurrent = true;
+        }
+        for (int i = 0; i < (int) values.size(); i++)
+            for (int j = 0; j < 3; j++)
+                values[order[i]][j] = localPerDofValuesDouble[3*i+j][variable];
    }
    else {
-        float2 stepSize;
-        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-        dt = stepSize.y;
-        time = cu.getTime()+dt;
-        if (dt == maxStepSizeFloat)
-            time = maxTime; // Avoid round-off error
+        if (!localValuesAreCurrent) {
+            perDofValues->getParameterValues(localPerDofValuesFloat);
+            localValuesAreCurrent = true;
+        }
+        for (int i = 0; i < (int) values.size(); i++)
+            for (int j = 0; j < 3; j++)
+                values[order[i]][j] = localPerDofValuesFloat[3*i+j][variable];
    }
-    cu.setTime(time);
-    cu.setStepCount(cu.getStepCount()+1);
-    return dt;
 }

-//class CudaIntegrateCustomStepKernel::ReorderListener : public CudaContext::ReorderListener {
-//public:
-//    ReorderListener(CudaContext& cu, CudaParameterSet& perDofValues, vector<vector<cl_float> >& localPerDofValues, bool& deviceValuesAreCurrent) :
-//            cu(cu), perDofValues(perDofValues), localPerDofValues(localPerDofValues), deviceValuesAreCurrent(deviceValuesAreCurrent) {
-//        int numAtoms = cu.getNumAtoms();
-//        lastAtomOrder.resize(numAtoms);
-//        for (int i = 0; i < numAtoms; i++)
-//            lastAtomOrder[i] = cu.getAtomIndex()[i];
-//    }
-//    void execute() {
-//        // Reorder the per-DOF variables to reflect the new atom order.
-//
-//        if (perDofValues.getNumParameters() == 0)
-//            return;
-//        int numAtoms = cu.getNumAtoms();
-//        if (deviceValuesAreCurrent)
-//            perDofValues.getParameterValues(localPerDofValues);
-//        vector<vector<cl_float> > swap(3*numAtoms);
-//        for (int i = 0; i < numAtoms; i++) {
-//            swap[3*lastAtomOrder[i]] = localPerDofValues[3*i];
-//            swap[3*lastAtomOrder[i]+1] = localPerDofValues[3*i+1];
-//            swap[3*lastAtomOrder[i]+2] = localPerDofValues[3*i+2];
-//        }
-//        CudaArray<cl_int>& order = cu.getAtomIndex();
-//        for (int i = 0; i < numAtoms; i++) {
-//            localPerDofValues[3*i] = swap[3*order[i]];
-//            localPerDofValues[3*i+1] = swap[3*order[i]+1];
-//            localPerDofValues[3*i+2] = swap[3*order[i]+2];
-//        }
-//        perDofValues.setParameterValues(localPerDofValues);
-//        for (int i = 0; i < numAtoms; i++)
-//            lastAtomOrder[i] = order[i];
-//        deviceValuesAreCurrent = true;
-//    }
-//private:
-//    CudaContext& cu;
-//    CudaParameterSet& perDofValues;
-//    vector<vector<cl_float> >& localPerDofValues;
-//    bool& deviceValuesAreCurrent;
-//    vector<int> lastAtomOrder;
-//};
-//
-//CudaIntegrateCustomStepKernel::~CudaIntegrateCustomStepKernel() {
-//    cu.setAsCurrent();
-//    if (globalValues != NULL)
-//        delete globalValues;
-//    if (contextParameterValues != NULL)
-//        delete contextParameterValues;
-//    if (sumBuffer != NULL)
-//        delete sumBuffer;
-//    if (energy != NULL)
-//        delete energy;
-//    if (uniformRandoms != NULL)
-//        delete uniformRandoms;
-//    if (randomSeed != NULL)
-//        delete randomSeed;
-//    if (perDofValues != NULL)
-//        delete perDofValues;
-//}
-//
-//void CudaIntegrateCustomStepKernel::initialize(const System& system, const CustomIntegrator& integrator) {
-//    cu.setAsCurrent();
-//    cu.getPlatformData().initializeContexts(system);
-//    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
-//    numGlobalVariables = integrator.getNumGlobalVariables();
-//    globalValues = new CudaArray<cl_float>(cu, max(1, numGlobalVariables), "globalVariables", true);
-//    sumBuffer = new CudaArray<cl_float>(cu, 3*system.getNumParticles(), "sumBuffer");
-//    energy = new CudaArray<cl_float>(cu, 1, "energy");
-//    perDofValues = new CudaParameterSet(cu, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables");
-//    cu.addReorderListener(new ReorderListener(cu, *perDofValues, localPerDofValues, deviceValuesAreCurrent));
-//    prevStepSize = -1.0;
-//    SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
-//}
-//
-//string CudaIntegrateCustomStepKernel::createGlobalComputation(const string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator, const string& energyName) {
-//    map<string, Lepton::ParsedExpression> expressions;
-//    if (variable == "dt")
-//        expressions["dt[0].y = "] = expr;
-//    else {
-//        for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
-//            if (variable == integrator.getGlobalVariableName(i))
-//                expressions["globals["+cu.intToString(i)+"] = "] = expr;
-//        for (int i = 0; i < (int) parameterNames.size(); i++)
-//            if (variable == parameterNames[i]) {
-//                expressions["params["+cu.intToString(i)+"] = "] = expr;
-//                modifiesParameters = true;
-//            }
-//    }
-//    if (expressions.size() == 0)
-//        throw OpenMMException("Unknown global variable: "+variable);
-//    map<string, string> variables;
-//    variables["dt"] = "dt[0].y";
-//    variables["uniform"] = "uniform";
-//    variables["gaussian"] = "gaussian";
-//    variables[energyName] = "energy[0]";
-//    for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
-//        variables[integrator.getGlobalVariableName(i)] = "globals["+cu.intToString(i)+"]";
-//    for (int i = 0; i < (int) parameterNames.size(); i++)
-//        variables[parameterNames[i]] = "params["+cu.intToString(i)+"]";
-//    vector<pair<string, string> > functions;
-//    return cu.getExpressionUtilities().createExpressions(expressions, variables, functions, "temp", "");
-//}
-//
-//string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const string& forceName, const string& energyName) {
-//    const string suffixes[] = {".x", ".y", ".z"};
-//    string suffix = suffixes[component];
-//    map<string, Lepton::ParsedExpression> expressions;
-//    if (variable == "x")
-//        expressions["position"+suffix+" = "] = expr;
-//    else if (variable == "v")
-//        expressions["velocity"+suffix+" = "] = expr;
-//    else if (variable == "")
-//        expressions["sum[3*index+"+cu.intToString(component)+"] = "] = expr;
-//    else {
-//        for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
-//            if (variable == integrator.getPerDofVariableName(i))
-//                expressions["perDof"+suffix.substr(1)+perDofValues->getParameterSuffix(i)+" = "] = expr;
-//    }
-//    if (expressions.size() == 0)
-//        throw OpenMMException("Unknown per-DOF variable: "+variable);
-//    map<string, string> variables;
-//    variables["x"] = "position"+suffix;
-//    variables["v"] = "velocity"+suffix;
-//    variables[forceName] = "f"+suffix;
-//    variables["gaussian"] = "gaussian"+suffix;
-//    variables["uniform"] = "uniform"+suffix;
-//    variables["m"] = "mass";
-//    variables["dt"] = "stepSize";
-//    variables[energyName] = "energy[0]";
-//    for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
-//        variables[integrator.getGlobalVariableName(i)] = "globals["+cu.intToString(i)+"]";
-//    for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
-//        variables[integrator.getPerDofVariableName(i)] = "perDof"+suffix.substr(1)+perDofValues->getParameterSuffix(i);
-//    for (int i = 0; i < (int) parameterNames.size(); i++)
-//        variables[parameterNames[i]] = "params["+cu.intToString(i)+"]";
-//    vector<pair<string, string> > functions;
-//    string tempType = (cu.getSupportsDoublePrecision() ? "double" : "float");
-//    return cu.getExpressionUtilities().createExpressions(expressions, variables, functions, "temp"+cu.intToString(component)+"_", "", tempType);
-//}
-//
-//void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid) {
-//    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
-//    int numAtoms = cu.getNumAtoms();
-//    int numSteps = integrator.getNumComputations();
-//    if (!hasInitializedKernels) {
-//        hasInitializedKernels = true;
-//        
-//        // Initialize various data structures.
-//        
-//        const map<string, double>& params = context.getParameters();
-//        contextParameterValues = new CudaArray<cl_float>(cu, max(1, (int) params.size()), "contextParameters", true);
-//        for (map<string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter) {
-//            contextParameterValues->set(parameterNames.size(), (float) iter->second);
-//            parameterNames.push_back(iter->first);
-//        }
-//        contextParameterValues->upload();
-//        kernels.resize(integrator.getNumComputations());
-//        requiredGaussian.resize(integrator.getNumComputations(), 0);
-//        requiredUniform.resize(integrator.getNumComputations(), 0);
-//        needsForces.resize(numSteps, false);
-//        needsEnergy.resize(numSteps, false);
-//        forceGroup.resize(numSteps, -2);
-//        invalidatesForces.resize(numSteps, false);
-//        merged.resize(numSteps, false);
-//        modifiesParameters = false;
-//        map<string, string> defines;
-//        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//        defines["WORK_GROUP_SIZE"] = cu.intToString(CudaContext::ThreadBlockSize);
-//        
-//        // Initialize the random number generator.
-//        
-//        uniformRandoms = new CudaArray<mm_float4>(cu, cu.getNumAtoms(), "uniformRandoms");
-//        randomSeed = new CudaArray<mm_int4>(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed");
-//        vector<mm_int4> seed(randomSeed->getSize());
-//        unsigned int r = integrator.getRandomNumberSeed()+1;
-//        for (int i = 0; i < randomSeed->getSize(); i++) {
-//            seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
-//            seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
-//            seed[i].z = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
-//            seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
-//        }
-//        randomSeed->upload(seed);
-//        CUmodule randomProgram = cu.createModule(CudaKernelSources::customIntegrator, defines);
-//        randomKernel = cu.getKernel(randomProgram, "generateRandomNumbers");
-//        randomKernel.setArg<cu::Buffer>(0, uniformRandoms->getDevicePointer());
-//        randomKernel.setArg<cu::Buffer>(1, randomSeed->getDevicePointer());
-//        
-//        // Build a list of all variables that affect the forces, so we can tell which
-//        // steps invalidate them.
-//        
-//        set<string> affectsForce;
-//        affectsForce.insert("x");
-//        for (vector<ForceImpl*>::const_iterator iter = context.getForceImpls().begin(); iter != context.getForceImpls().end(); ++iter) {
-//            const map<string, double> params = (*iter)->getDefaultParameters();
-//            for (map<string, double>::const_iterator param = params.begin(); param != params.end(); ++param)
-//                affectsForce.insert(param->first);
-//        }
-//        
-//        // Record information about all the computation steps.
-//        
-//        stepType.resize(numSteps);
-//        vector<string> variable(numSteps);
-//        vector<Lepton::ParsedExpression> expression(numSteps);
-//        vector<string> forceGroupName;
-//        vector<string> energyGroupName;
-//        for (int i = 0; i < 32; i++) {
-//            stringstream fname;
-//            fname << "f" << i;
-//            forceGroupName.push_back(fname.str());
-//            stringstream ename;
-//            ename << "energy" << i;
-//            energyGroupName.push_back(ename.str());
-//        }
-//        vector<string> forceName(numSteps, "f");
-//        vector<string> energyName(numSteps, "energy");
-//        for (int step = 0; step < numSteps; step++) {
-//            string expr;
-//            integrator.getComputationStep(step, stepType[step], variable[step], expr);
-//            if (expr.size() > 0) {
-//                expression[step] = Lepton::Parser::parse(expr).optimize();
-//                if (usesVariable(expression[step], "f")) {
-//                    needsForces[step] = true;
-//                    forceGroup[step] = -1;
-//                }
-//                if (usesVariable(expression[step], "energy")) {
-//                    needsEnergy[step] = true;
-//                    forceGroup[step] = -1;
-//                }
-//                for (int i = 0; i < 32; i++) {
-//                    if (usesVariable(expression[step], forceGroupName[i])) {
-//                        if (forceGroup[step] != -2)
-//                            throw OpenMMException("A single computation step cannot depend on multiple force groups");
-//                        needsForces[step] = true;
-//                        forceGroup[step] = 1<<i;
-//                        forceName[step] = forceGroupName[i];
-//                    }
-//                    if (usesVariable(expression[step], energyGroupName[i])) {
-//                        if (forceGroup[step] != -2)
-//                            throw OpenMMException("A single computation step cannot depend on multiple force groups");
-//                        needsEnergy[step] = true;
-//                        forceGroup[step] = 1<<i;
-//                        energyName[step] = energyGroupName[i];
-//                    }
-//                }
-//            }
-//            invalidatesForces[step] = (stepType[step] == CustomIntegrator::ConstrainPositions || affectsForce.find(variable[step]) != affectsForce.end());
-//            if (forceGroup[step] == -2 && step > 0)
-//                forceGroup[step] = forceGroup[step-1];
-//        }
-//        
-//        // Determine how each step will represent the position (as just a value, or a value plus a delta).
-//        
-//        vector<bool> storePosAsDelta(numSteps, false);
-//        vector<bool> loadPosAsDelta(numSteps, false);
-//        bool beforeConstrain = false;
-//        for (int step = numSteps-1; step >= 0; step--) {
-//            if (stepType[step] == CustomIntegrator::ConstrainPositions)
-//                beforeConstrain = true;
-//            else if (stepType[step] == CustomIntegrator::ComputePerDof && variable[step] == "x" && beforeConstrain)
-//                storePosAsDelta[step] = true;
-//        }
-//        bool storedAsDelta = false;
-//        for (int step = 0; step < numSteps; step++) {
-//            loadPosAsDelta[step] = storedAsDelta;
-//            if (storePosAsDelta[step] == true)
-//                storedAsDelta = true;
-//            if (stepType[step] == CustomIntegrator::ConstrainPositions)
-//                storedAsDelta = false;
-//        }
-//        
-//        // Identify steps that can be merged into a single kernel.
-//        
-//        for (int step = 1; step < numSteps; step++) {
-//            if (needsForces[step] || needsEnergy[step])
-//                continue;
-//            if (stepType[step-1] == CustomIntegrator::ComputeGlobal && stepType[step] == CustomIntegrator::ComputeGlobal)
-//                merged[step] = true;
-//            if (stepType[step-1] == CustomIntegrator::ComputePerDof && stepType[step] == CustomIntegrator::ComputePerDof &&
-//                    !usesVariable(expression[step], "uniform"))
-//                merged[step] = true;
-//        }
-//        
-//        // Loop over all steps and create the kernels for them.
-//        
-//        for (int step = 0; step < numSteps; step++) {
-//            if ((stepType[step] == CustomIntegrator::ComputePerDof || stepType[step] == CustomIntegrator::ComputeSum) && !merged[step]) {
-//                // Compute a per-DOF value.
-//                
-//                stringstream compute;
-//                for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) {
-//                    const CudaNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i];
-//                    compute << buffer.getType()<<" perDofx"<<cu.intToString(i+1)<<" = perDofValues"<<cu.intToString(i+1)<<"[3*index];\n";
-//                    compute << buffer.getType()<<" perDofy"<<cu.intToString(i+1)<<" = perDofValues"<<cu.intToString(i+1)<<"[3*index+1];\n";
-//                    compute << buffer.getType()<<" perDofz"<<cu.intToString(i+1)<<" = perDofValues"<<cu.intToString(i+1)<<"[3*index+2];\n";
-//                }
-//                string convert = (cu.getSupportsDoublePrecision() ? "convert_float4(" : "(");
-//                int numGaussian = 0, numUniform = 0;
-//                for (int j = step; j < numSteps && (j == step || merged[j]); j++) {
-//                    compute << "{\n";
-//                    for (int i = 0; i < 3; i++)
-//                        compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j], i, integrator, forceName[j], energyName[j]);
-//                    if (variable[j] == "x") {
-//                        if (storePosAsDelta[j]) {
-//                            if (cu.getSupportsDoublePrecision())
-//                                compute << "posDelta[index] = convert_float4(position-convert_double4(posq[index]));\n";
-//                            else
-//                                compute << "posDelta[index] = position-posq[index];\n";
-//                        }
-//                        else
-//                            compute << "posq[index] = " << convert << "position);\n";
-//                    }
-//                    else if (variable[j] == "v")
-//                        compute << "velm[index] = " << convert << "velocity);\n";
-//                    else {
-//                        for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) {
-//                            const CudaNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i];
-//                            compute << "perDofValues"<<cu.intToString(i+1)<<"[3*index] = perDofx"<<cu.intToString(i+1)<<";\n";
-//                            compute << "perDofValues"<<cu.intToString(i+1)<<"[3*index+1] = perDofy"<<cu.intToString(i+1)<<";\n";
-//                            compute << "perDofValues"<<cu.intToString(i+1)<<"[3*index+2] = perDofz"<<cu.intToString(i+1)<<";\n";
-//                        }
-//                    }
-//                    compute << "}\n";
-//                    numGaussian += numAtoms*usesVariable(expression[j], "gaussian");
-//                    numUniform += numAtoms*usesVariable(expression[j], "uniform");
-//                }
-//                map<string, string> replacements;
-//                replacements["COMPUTE_STEP"] = compute.str();
-//                stringstream args;
-//                for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) {
-//                    const CudaNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i];
-//                    string valueName = "perDofValues"+cu.intToString(i+1);
-//                    args << ", __global " << buffer.getType() << "* restrict " << valueName;
-//                }
-//                replacements["PARAMETER_ARGUMENTS"] = args.str();
-//                if (loadPosAsDelta[step])
-//                    defines["LOAD_POS_AS_DELTA"] = "1";
-//                else if (defines.find("LOAD_POS_AS_DELTA") != defines.end())
-//                    defines.erase("LOAD_POS_AS_DELTA");
-//                CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customIntegratorPerDof, replacements), defines);
-//                cu::Kernel kernel = cu.getKernel(module, "computePerDof");
-//                kernels[step].push_back(kernel);
-//                requiredGaussian[step] = numGaussian;
-//                requiredUniform[step] = numUniform;
-//                int index = 0;
-//                kernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
-//                kernel.setArg<cu::Buffer>(index++, integration.getPosDelta().getDevicePointer());
-//                kernel.setArg<cu::Buffer>(index++, cu.getVelm().getDevicePointer());
-//                kernel.setArg<cu::Buffer>(index++, cu.getForce().getDevicePointer());
-//                kernel.setArg<cu::Buffer>(index++, integration.getStepSize().getDevicePointer());
-//                kernel.setArg<cu::Buffer>(index++, globalValues->getDevicePointer());
-//                kernel.setArg<cu::Buffer>(index++, contextParameterValues->getDevicePointer());
-//                kernel.setArg<cu::Buffer>(index++, sumBuffer->getDevicePointer());
-//                kernel.setArg<cu::Buffer>(index++, integration.getRandom().getDevicePointer());
-//                index++;
-//                kernel.setArg<cu::Buffer>(index++, uniformRandoms->getDevicePointer());
-//                kernel.setArg<cu::Buffer>(index++, energy->getDevicePointer());
-//                for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
-//                    kernel.setArg<cu::Memory>(index++, perDofValues->getBuffers()[i].getMemory());
-//                if (stepType[step] == CustomIntegrator::ComputeSum) {
-//                    // Create a second kernel for this step that sums the values.
-//
-//                    module = cu.createModule(CudaKernelSources::customIntegrator, defines);
-//                    kernel = cu.getKernel(module, "computeSum");
-//                    kernels[step].push_back(kernel);
-//                    index = 0;
-//                    kernel.setArg<cu::Buffer>(index++, sumBuffer->getDevicePointer());
-//                    bool found = false;
-//                    for (int j = 0; j < integrator.getNumGlobalVariables() && !found; j++)
-//                        if (variable[step] == integrator.getGlobalVariableName(j)) {
-//                            kernel.setArg<cu::Buffer>(index++, globalValues->getDevicePointer());
-//                            kernel.setArg<cl_uint>(index++, j);
-//                            found = true;
-//                        }
-//                    for (int j = 0; j < (int) parameterNames.size() && !found; j++)
-//                        if (variable[step] == parameterNames[j]) {
-//                            kernel.setArg<cu::Buffer>(index++, contextParameterValues->getDevicePointer());
-//                            kernel.setArg<cl_uint>(index++, j);
-//                            found = true;
-//                            modifiesParameters = true;
-//                        }
-//                    if (!found)
-//                        throw OpenMMException("Unknown global variable: "+variable[step]);
-//                    kernel.setArg<cl_int>(index++, 3*numAtoms);
-//                }
-//            }
-//            else if (stepType[step] == CustomIntegrator::ComputeGlobal && !merged[step]) {
-//                // Compute a global value.
-//
-//                stringstream compute;
-//                for (int i = step; i < numSteps && (i == step || merged[i]); i++)
-//                    compute << "{\n" << createGlobalComputation(variable[i], expression[i], integrator, energyName[i]) << "}\n";
-//                map<string, string> replacements;
-//                replacements["COMPUTE_STEP"] = compute.str();
-//                CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customIntegratorGlobal, replacements), defines);
-//                cu::Kernel kernel = cu.getKernel(module, "computeGlobal");
-//                kernels[step].push_back(kernel);
-//                int index = 0;
-//                kernel.setArg<cu::Buffer>(index++, integration.getStepSize().getDevicePointer());
-//                kernel.setArg<cu::Buffer>(index++, globalValues->getDevicePointer());
-//                kernel.setArg<cu::Buffer>(index++, contextParameterValues->getDevicePointer());
-//                index += 2;
-//                kernel.setArg<cu::Buffer>(index++, energy->getDevicePointer());
-//            }
-//            else if (stepType[step] == CustomIntegrator::ConstrainPositions) {
-//                // Apply position constraints.
-//
-//                CUmodule module = cu.createModule(CudaKernelSources::customIntegrator, defines);
-//                cu::Kernel kernel = cu.getKernel(module, "applyPositionDeltas");
-//                kernels[step].push_back(kernel);
-//                int index = 0;
-//                kernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
-//                kernel.setArg<cu::Buffer>(index++, integration.getPosDelta().getDevicePointer());
-//            }
-//        }
-//        
-//        // Create the kernel for summing energy.
-//
-//        CUmodule module = cu.createModule(CudaKernelSources::customIntegrator, defines);
-//        sumEnergyKernel = cu.getKernel(module, "computeSum");
-//        int index = 0;
-//        sumEnergyKernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
-//        sumEnergyKernel.setArg<cu::Buffer>(index++, energy->getDevicePointer());
-//        sumEnergyKernel.setArg<cl_int>(index++, 0);
-//        sumEnergyKernel.setArg<cl_int>(index++, cu.getEnergyBuffer().getSize());
-//    }
-//    
-//    // Make sure all values (variables, parameters, etc.) stored on the device are up to date.
-//    
-//    if (!deviceValuesAreCurrent) {
-//        perDofValues->setParameterValues(localPerDofValues);
-//        deviceValuesAreCurrent = true;
-//    }
-//    localValuesAreCurrent = false;
-//    double stepSize = integrator.getStepSize();
-//    if (stepSize != prevStepSize) {
-//        integration.getStepSize()[0].y = (cl_float) stepSize;
-//        integration.getStepSize().upload();
-//        prevStepSize = stepSize;
-//    }
-//    bool paramsChanged = false;
-//    for (int i = 0; i < (int) parameterNames.size(); i++) {
-//        float value = (float) context.getParameter(parameterNames[i]);
-//        if (value != contextParameterValues->get(i)) {
-//            contextParameterValues->set(i, value);
-//            paramsChanged = true;
-//        }
-//    }
-//    if (paramsChanged)
-//        contextParameterValues->upload();
-//
-//    // Loop over computation steps in the integrator and execute them.
-//
-//    for (int i = 0; i < numSteps; i++) {
-//        if ((needsForces[i] || needsEnergy[i]) && (!forcesAreValid || context.getLastForceGroups() != forceGroup[i])) {
-//            // Recompute forces and/or energy.  Figure out what is actually needed
-//            // between now and the next time they get invalidated again.
-//            
-//            bool computeForce = false, computeEnergy = false;
-//            for (int j = i; ; j++) {
-//                if (needsForces[j])
-//                    computeForce = true;
-//                if (needsEnergy[j])
-//                    computeEnergy = true;
-//                if (invalidatesForces[j])
-//                    break;
-//                if (j == numSteps-1)
-//                    j = -1;
-//                if (j == i-1)
-//                    break;
-//            }
-//            recordChangedParameters(context);
-//            context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroup[i]);
-//            if (computeEnergy)
-//                cu.executeKernel(sumEnergyKernel, CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
-//            forcesAreValid = true;
-//        }
-//        if (stepType[i] == CustomIntegrator::ComputePerDof && !merged[i]) {
-//            kernels[i][0].setArg<cl_uint>(9, integration.prepareRandomNumbers(requiredGaussian[i]));
-//            if (requiredUniform[i] > 0)
-//                cu.executeKernel(randomKernel, numAtoms);
-//            cu.executeKernel(kernels[i][0], numAtoms);
-//        }
-//        else if (stepType[i] == CustomIntegrator::ComputeGlobal && !merged[i]) {
-//            kernels[i][0].setArg<cl_float>(3, SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber());
-//            kernels[i][0].setArg<cl_float>(4, SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
-//            cu.executeKernel(kernels[i][0], 1, 1);
-//        }
-//        else if (stepType[i] == CustomIntegrator::ComputeSum) {
-//            kernels[i][0].setArg<cl_uint>(9, integration.prepareRandomNumbers(requiredGaussian[i]));
-//            if (requiredUniform[i] > 0)
-//                cu.executeKernel(randomKernel, numAtoms);
-//            cu.executeKernel(kernels[i][0], numAtoms);
-//            cu.executeKernel(kernels[i][1], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
-//        }
-//        else if (stepType[i] == CustomIntegrator::UpdateContextState) {
-//            recordChangedParameters(context);
-//            context.updateContextState();
-//        }
-//        else if (stepType[i] == CustomIntegrator::ConstrainPositions) {
-//            cu.getIntegrationUtilities().applyConstraints(integrator.getConstraintTolerance());
-//            cu.executeKernel(kernels[i][0], numAtoms);
-//            cu.getIntegrationUtilities().computeVirtualSites();
-//        }
-//        else if (stepType[i] == CustomIntegrator::ConstrainVelocities) {
-//            cu.getIntegrationUtilities().applyVelocityConstraints(integrator.getConstraintTolerance());
-//        }
-//        if (invalidatesForces[i])
-//            forcesAreValid = false;
-//    }
-//    recordChangedParameters(context);
-//
-//    // Update the time and step count.
-//
-//    cu.setTime(cu.getTime()+stepSize);
-//    cu.setStepCount(cu.getStepCount()+1);
-//}
-//
-//void CudaIntegrateCustomStepKernel::recordChangedParameters(ContextImpl& context) {
-//    if (!modifiesParameters)
-//        return;
-//    contextParameterValues->download();
-//    for (int i = 0; i < (int) parameterNames.size(); i++) {
-//        float value = (float) context.getParameter(parameterNames[i]);
-//        if (value != contextParameterValues->get(i))
-//            context.setParameter(parameterNames[i], contextParameterValues->get(i));
-//    }
-//}
-//
-//void CudaIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, vector<double>& values) const {
-//    globalValues->download();
-//    values.resize(numGlobalVariables);
-//    for (int i = 0; i < numGlobalVariables; i++)
-//        values[i] = globalValues->get(i);
-//}
-//
-//void CudaIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, const vector<double>& values) {
-//    for (int i = 0; i < numGlobalVariables; i++)
-//        globalValues->set(i, (float) values[i]);
-//    globalValues->upload();
-//}
-//
-//void CudaIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int variable, vector<Vec3>& values) const {
-//    if (!localValuesAreCurrent) {
-//        perDofValues->getParameterValues(localPerDofValues);
-//        localValuesAreCurrent = true;
-//    }
-//    values.resize(perDofValues->getNumObjects()/3);
-//    CudaArray<cl_int>& order = cu.getAtomIndex();
-//    for (int i = 0; i < (int) values.size(); i++)
-//        for (int j = 0; j < 3; j++)
-//            values[order[i]][j] = localPerDofValues[3*i+j][variable];
-//}
-//
-//void CudaIntegrateCustomStepKernel::setPerDofVariable(ContextImpl& context, int variable, const vector<Vec3>& values) {
-//    if (!localValuesAreCurrent) {
-//        perDofValues->getParameterValues(localPerDofValues);
-//        localValuesAreCurrent = true;
-//    }
-//    CudaArray<cl_int>& order = cu.getAtomIndex();
-//    for (int i = 0; i < (int) values.size(); i++)
-//        for (int j = 0; j < 3; j++)
-//            localPerDofValues[3*i+j][variable] = (float) values[order[i]][j];
-//    deviceValuesAreCurrent = false;
-//}
+void CudaIntegrateCustomStepKernel::setPerDofVariable(ContextImpl& context, int variable, const vector<Vec3>& values) {
+    const vector<int>& order = cu.getAtomIndex();
+    if (cu.getUseDoublePrecision()) {
+        if (!localValuesAreCurrent) {
+            perDofValues->getParameterValues(localPerDofValuesDouble);
+            localValuesAreCurrent = true;
+        }
+        for (int i = 0; i < (int) values.size(); i++)
+            for (int j = 0; j < 3; j++)
+                localPerDofValuesDouble[3*i+j][variable] = values[order[i]][j];
+    }
+    else {
+        if (!localValuesAreCurrent) {
+            perDofValues->getParameterValues(localPerDofValuesFloat);
+            localValuesAreCurrent = true;
+        }
+        for (int i = 0; i < (int) values.size(); i++)
+            for (int j = 0; j < 3; j++)
+                localPerDofValuesFloat[3*i+j][variable] = (float) values[order[i]][j];
+    }
+    deviceValuesAreCurrent = false;
+}

 CudaApplyAndersenThermostatKernel::~CudaApplyAndersenThermostatKernel() {
    cu.setAsCurrent();

--- a/platforms/cuda2/src/CudaKernels.h
+++ b/platforms/cuda2/src/CudaKernels.h
@@ -623,50 +623,49 @@ private:
    static const int PmeOrder = 5;
 };

-///**
-// * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
-// */
-//class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
-//public:
-//    CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomNonbondedForceKernel(name, platform),
-//            hasInitializedKernel(false), cu(cu), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
-//    }
-//    ~CudaCalcCustomNonbondedForceKernel();
-//    /**
-//     * Initialize the kernel.
-//     *
-//     * @param system     the System this kernel will be applied to
-//     * @param force      the CustomNonbondedForce this kernel will be used for
-//     */
-//    void initialize(const System& system, const CustomNonbondedForce& force);
-//    /**
-//     * Execute the kernel to calculate the forces and/or energy.
-//     *
-//     * @param context        the context in which to execute this kernel
-//     * @param includeForces  true if forces should be calculated
-//     * @param includeEnergy  true if the energy should be calculated
-//     * @return the potential energy due to the force
-//     */
-//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-//    /**
-//     * Copy changed parameters over to a context.
-//     *
-//     * @param context    the context to copy parameters to
-//     * @param force      the CustomNonbondedForce to copy the parameters from
-//     */
-//    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
-//private:
-//    bool hasInitializedKernel;
-//    CudaContext& cu;
-//    CudaParameterSet* params;
-//    CudaArray<cl_float>* globals;
-//    CudaArray<mm_float4>* tabulatedFunctionParams;
-//    std::vector<std::string> globalParamNames;
-//    std::vector<cl_float> globalParamValues;
-//    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
-//    System& system;
-//};
-//
+/**
+ * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
+ */
+class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
+public:
+    CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomNonbondedForceKernel(name, platform),
+            cu(cu), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
+    }
+    ~CudaCalcCustomNonbondedForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomNonbondedForce this kernel will be used for
+     */
+    void initialize(const System& system, const CustomNonbondedForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the CustomNonbondedForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
+private:
+    CudaContext& cu;
+    CudaParameterSet* params;
+    CudaArray* globals;
+    CudaArray* tabulatedFunctionParams;
+    std::vector<std::string> globalParamNames;
+    std::vector<float> globalParamValues;
+    std::vector<CudaArray*> tabulatedFunctions;
+    System& system;
+};
+
 ///**
 // * This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
 // */
@@ -814,60 +813,58 @@ private:
    std::vector<float> globalParamValues;
 };

-///**
-// * This kernel is invoked by CustomHbondForce to calculate the forces acting on the system.
-// */
-//class CudaCalcCustomHbondForceKernel : public CalcCustomHbondForceKernel {
-//public:
-//    CudaCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomHbondForceKernel(name, platform),
-//            hasInitializedKernel(false), cu(cu), donorParams(NULL), acceptorParams(NULL), donors(NULL), acceptors(NULL),
-//            donorBufferIndices(NULL), acceptorBufferIndices(NULL), globals(NULL), donorExclusions(NULL), acceptorExclusions(NULL),
-//            tabulatedFunctionParams(NULL), system(system) {
-//    }
-//    ~CudaCalcCustomHbondForceKernel();
-//    /**
-//     * Initialize the kernel.
-//     *
-//     * @param system     the System this kernel will be applied to
-//     * @param force      the CustomHbondForce this kernel will be used for
-//     */
-//    void initialize(const System& system, const CustomHbondForce& force);
-//    /**
-//     * Execute the kernel to calculate the forces and/or energy.
-//     *
-//     * @param context        the context in which to execute this kernel
-//     * @param includeForces  true if forces should be calculated
-//     * @param includeEnergy  true if the energy should be calculated
-//     * @return the potential energy due to the force
-//     */
-//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-//    /**
-//     * Copy changed parameters over to a context.
-//     *
-//     * @param context    the context to copy parameters to
-//     * @param force      the CustomHbondForce to copy the parameters from
-//     */
-//    void copyParametersToContext(ContextImpl& context, const CustomHbondForce& force);
-//private:
-//    int numDonors, numAcceptors;
-//    bool hasInitializedKernel;
-//    CudaContext& cu;
-//    CudaParameterSet* donorParams;
-//    CudaParameterSet* acceptorParams;
-//    CudaArray<cl_float>* globals;
-//    CudaArray<mm_int4>* donors;
-//    CudaArray<mm_int4>* acceptors;
-//    CudaArray<mm_int4>* donorBufferIndices;
-//    CudaArray<mm_int4>* acceptorBufferIndices;
-//    CudaArray<mm_int4>* donorExclusions;
-//    CudaArray<mm_int4>* acceptorExclusions;
-//    CudaArray<mm_float4>* tabulatedFunctionParams;
-//    std::vector<std::string> globalParamNames;
-//    std::vector<cl_float> globalParamValues;
-//    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
-//    System& system;
-//    CUfunction donorKernel, acceptorKernel;
-//};
+/**
+ * This kernel is invoked by CustomHbondForce to calculate the forces acting on the system.
+ */
+class CudaCalcCustomHbondForceKernel : public CalcCustomHbondForceKernel {
+public:
+    CudaCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomHbondForceKernel(name, platform),
+            hasInitializedKernel(false), cu(cu), donorParams(NULL), acceptorParams(NULL), donors(NULL), acceptors(NULL),
+            globals(NULL), donorExclusions(NULL), acceptorExclusions(NULL), tabulatedFunctionParams(NULL), system(system) {
+    }
+    ~CudaCalcCustomHbondForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomHbondForce this kernel will be used for
+     */
+    void initialize(const System& system, const CustomHbondForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the CustomHbondForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const CustomHbondForce& force);
+private:
+    int numDonors, numAcceptors;
+    bool hasInitializedKernel;
+    CudaContext& cu;
+    CudaParameterSet* donorParams;
+    CudaParameterSet* acceptorParams;
+    CudaArray* globals;
+    CudaArray* donors;
+    CudaArray* acceptors;
+    CudaArray* donorExclusions;
+    CudaArray* acceptorExclusions;
+    CudaArray* tabulatedFunctionParams;
+    std::vector<std::string> globalParamNames;
+    std::vector<float> globalParamValues;
+    std::vector<CudaArray*> tabulatedFunctions;
+    std::vector<void*> donorArgs, acceptorArgs;
+    System& system;
+    CUfunction donorKernel, acceptorKernel;
+};

 /**
 * This kernel is invoked by CustomCompoundBondForce to calculate the forces acting on the system.
@@ -1062,94 +1059,98 @@ private:
    double prevTemp, prevFriction, prevErrorTol;
 };

-///**
-// * This kernel is invoked by CustomIntegrator to take one time step.
-// */
-//class CudaIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
-//public:
-//    CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateCustomStepKernel(name, platform), cu(cu),
-//            hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), contextParameterValues(NULL), sumBuffer(NULL), energy(NULL),
-//            uniformRandoms(NULL), randomSeed(NULL), perDofValues(NULL) {
-//    }
-//    ~CudaIntegrateCustomStepKernel();
-//    /**
-//     * Initialize the kernel.
-//     * 
-//     * @param system     the System this kernel will be applied to
-//     * @param integrator the CustomIntegrator this kernel will be used for
-//     */
-//    void initialize(const System& system, const CustomIntegrator& integrator);
-//    /**
-//     * Execute the kernel.
-//     * 
-//     * @param context    the context in which to execute this kernel
-//     * @param integrator the CustomIntegrator this kernel is being used for
-//     * @param forcesAreValid if the context has been modified since the last time step, this will be
-//     *                       false to show that cached forces are invalid and must be recalculated.
-//     *                       On exit, this should specify whether the cached forces are valid at the
-//     *                       end of the step.
-//     */
-//    void execute(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
-//    /**
-//     * Get the values of all global variables.
-//     *
-//     * @param context   the context in which to execute this kernel
-//     * @param values    on exit, this contains the values
-//     */
-//    void getGlobalVariables(ContextImpl& context, std::vector<double>& values) const;
-//    /**
-//     * Set the values of all global variables.
-//     *
-//     * @param context   the context in which to execute this kernel
-//     * @param values    a vector containing the values
-//     */
-//    void setGlobalVariables(ContextImpl& context, const std::vector<double>& values);
-//    /**
-//     * Get the values of a per-DOF variable.
-//     *
-//     * @param context   the context in which to execute this kernel
-//     * @param variable  the index of the variable to get
-//     * @param values    on exit, this contains the values
-//     */
-//    void getPerDofVariable(ContextImpl& context, int variable, std::vector<Vec3>& values) const;
-//    /**
-//     * Set the values of a per-DOF variable.
-//     *
-//     * @param context   the context in which to execute this kernel
-//     * @param variable  the index of the variable to get
-//     * @param values    a vector containing the values
-//     */
-//    void setPerDofVariable(ContextImpl& context, int variable, const std::vector<Vec3>& values);
-//private:
-//    class ReorderListener;
-//    std::string createGlobalComputation(const std::string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator, const std::string& energyName);
-//    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
-//    void recordChangedParameters(ContextImpl& context);
-//    CudaContext& cu;
-//    double prevStepSize;
-//    int numGlobalVariables;
-//    bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters;
-//    mutable bool localValuesAreCurrent;
-//    CudaArray<cl_float>* globalValues;
-//    CudaArray<cl_float>* contextParameterValues;
-//    CudaArray<cl_float>* sumBuffer;
-//    CudaArray<cl_float>* energy;
-//    CudaArray<mm_float4>* uniformRandoms;
-//    CudaArray<mm_int4>* randomSeed;
-//    CudaParameterSet* perDofValues;
-//    mutable std::vector<std::vector<cl_float> > localPerDofValues;
-//    std::vector<std::vector<CUfunction> > kernels;
-//    CUfunction sumEnergyKernel, randomKernel;
-//    std::vector<CustomIntegrator::ComputationType> stepType;
-//    std::vector<bool> needsForces;
-//    std::vector<bool> needsEnergy;
-//    std::vector<bool> invalidatesForces;
-//    std::vector<bool> merged;
-//    std::vector<int> forceGroup;
-//    std::vector<int> requiredGaussian;
-//    std::vector<int> requiredUniform;
-//    std::vector<std::string> parameterNames;
-//};
+/**
+ * This kernel is invoked by CustomIntegrator to take one time step.
+ */
+class CudaIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
+public:
+    CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateCustomStepKernel(name, platform), cu(cu),
+            hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), contextParameterValues(NULL), sumBuffer(NULL), energy(NULL),
+            uniformRandoms(NULL), randomSeed(NULL), perDofValues(NULL) {
+    }
+    ~CudaIntegrateCustomStepKernel();
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param integrator the CustomIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const CustomIntegrator& integrator);
+    /**
+     * Execute the kernel.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the CustomIntegrator this kernel is being used for
+     * @param forcesAreValid if the context has been modified since the last time step, this will be
+     *                       false to show that cached forces are invalid and must be recalculated.
+     *                       On exit, this should specify whether the cached forces are valid at the
+     *                       end of the step.
+     */
+    void execute(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
+    /**
+     * Get the values of all global variables.
+     *
+     * @param context   the context in which to execute this kernel
+     * @param values    on exit, this contains the values
+     */
+    void getGlobalVariables(ContextImpl& context, std::vector<double>& values) const;
+    /**
+     * Set the values of all global variables.
+     *
+     * @param context   the context in which to execute this kernel
+     * @param values    a vector containing the values
+     */
+    void setGlobalVariables(ContextImpl& context, const std::vector<double>& values);
+    /**
+     * Get the values of a per-DOF variable.
+     *
+     * @param context   the context in which to execute this kernel
+     * @param variable  the index of the variable to get
+     * @param values    on exit, this contains the values
+     */
+    void getPerDofVariable(ContextImpl& context, int variable, std::vector<Vec3>& values) const;
+    /**
+     * Set the values of a per-DOF variable.
+     *
+     * @param context   the context in which to execute this kernel
+     * @param variable  the index of the variable to get
+     * @param values    a vector containing the values
+     */
+    void setPerDofVariable(ContextImpl& context, int variable, const std::vector<Vec3>& values);
+private:
+    class ReorderListener;
+    std::string createGlobalComputation(const std::string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator, const std::string& energyName);
+    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
+    void recordChangedParameters(ContextImpl& context);
+    CudaContext& cu;
+    double prevStepSize;
+    int numGlobalVariables;
+    bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters;
+    mutable bool localValuesAreCurrent;
+    CudaArray* globalValues;
+    CudaArray* contextParameterValues;
+    CudaArray* sumBuffer;
+    CudaArray* energy;
+    CudaArray* uniformRandoms;
+    CudaArray* randomSeed;
+    CudaParameterSet* perDofValues;
+    mutable std::vector<std::vector<float> > localPerDofValuesFloat;
+    mutable std::vector<std::vector<double> > localPerDofValuesDouble;
+    std::vector<float> contextValuesFloat;
+    std::vector<double> contextValuesDouble;
+    std::vector<std::vector<CUfunction> > kernels;
+    std::vector<std::vector<std::vector<void*> > > kernelArgs;
+    CUfunction sumEnergyKernel, randomKernel;
+    std::vector<CustomIntegrator::ComputationType> stepType;
+    std::vector<bool> needsForces;
+    std::vector<bool> needsEnergy;
+    std::vector<bool> invalidatesForces;
+    std::vector<bool> merged;
+    std::vector<int> forceGroup;
+    std::vector<int> requiredGaussian;
+    std::vector<int> requiredUniform;
+    std::vector<std::string> parameterNames;
+};

 /**
 * This kernel is invoked by AndersenThermostat at the start of each time step to adjust the particle velocities.

--- a/platforms/cuda2/src/CudaParameterSet.cpp
+++ b/platforms/cuda2/src/CudaParameterSet.cpp
@@ -39,11 +39,12 @@ using namespace std;
        throw OpenMMException(m.str());\
    }

-CudaParameterSet::CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter) :
+CudaParameterSet::CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter, bool useDoublePrecision) :
            context(context), numParameters(numParameters), numObjects(numObjects), name(name) {
    int params = numParameters;
    int bufferCount = 0;
-    int elementSize = 4;
+    elementSize = (useDoublePrecision ? sizeof(double) : sizeof(float));
+    string elementType = (useDoublePrecision ? "double" : "float");
    CUdeviceptr pointer;
    string errorMessage = "Error creating parameter set "+name;
    if (!bufferPerParameter) {
@@ -51,14 +52,14 @@ CudaParameterSet::CudaParameterSet(CudaContext& context, int numParameters, int
            CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize*4));
            std::stringstream name;
            name << "param" << (++bufferCount);
-            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), "float", 4, elementSize*4, pointer));
+            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), elementType, 4, elementSize*4, pointer));
            params -= 4;
        }
        if (params > 1) {
            CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize*2));
            std::stringstream name;
            name << "param" << (++bufferCount);
-            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), "float", 2, elementSize*2, pointer));
+            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), elementType, 2, elementSize*2, pointer));
            params -= 2;
        }
    }
@@ -66,50 +67,55 @@ CudaParameterSet::CudaParameterSet(CudaContext& context, int numParameters, int
            CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize));
        std::stringstream name;
        name << "param" << (++bufferCount);
-        buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), "float", 1, elementSize, pointer));
+        buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), elementType, 1, elementSize, pointer));
        params--;
    }
 }

 CudaParameterSet::~CudaParameterSet() {
-    string errorMessage = "Error freeing device memory";
-    for (int i = 0; i < (int) buffers.size(); i++)
-        CHECK_RESULT(cuMemFree(buffers[i].getMemory()));
+    if (context.getContextIsValid()) {
+        string errorMessage = "Error freeing device memory";
+        for (int i = 0; i < (int) buffers.size(); i++)
+            CHECK_RESULT(cuMemFree(buffers[i].getMemory()));
+    }
 }

-void CudaParameterSet::getParameterValues(vector<vector<float> >& values) {
+template <class T>
+void CudaParameterSet::getParameterValues(vector<vector<T> >& values) {
+    if (sizeof(T) != elementSize)
+        throw OpenMMException("Called getParameterValues() with vector of wrong type");
    values.resize(numObjects);
    for (int i = 0; i < numObjects; i++)
        values[i].resize(numParameters);
    int base = 0;
    string errorMessage = "Error downloading parameter set "+name;
    for (int i = 0; i < (int) buffers.size(); i++) {
-        if (buffers[i].getType() == "float4") {
-            vector<float4> data(numObjects);
+        if (buffers[i].getSize() == 4*elementSize) {
+            vector<T> data(4*numObjects);
            CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
            for (int j = 0; j < numObjects; j++) {
-                values[j][base] = data[j].x;
+                values[j][base] = data[4*j];
                if (base+1 < numParameters)
-                    values[j][base+1] = data[j].y;
+                    values[j][base+1] = data[4*j+1];
                if (base+2 < numParameters)
-                    values[j][base+2] = data[j].z;
+                    values[j][base+2] = data[4*j+2];
                if (base+3 < numParameters)
-                    values[j][base+3] = data[j].w;
+                    values[j][base+3] = data[4*j+3];
            }
            base += 4;
        }
-        else if (buffers[i].getType() == "float2") {
-            vector<float2> data(numObjects);
+        else if (buffers[i].getSize() == 2*elementSize) {
+            vector<T> data(2*numObjects);
            CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
            for (int j = 0; j < numObjects; j++) {
-                values[j][base] = data[j].x;
+                values[j][base] = data[2*j];
                if (base+1 < numParameters)
-                    values[j][base+1] = data[j].y;
+                    values[j][base+1] = data[2*j+1];
            }
            base += 2;
        }
-        else if (buffers[i].getType() == "float") {
-            vector<float> data(numObjects);
+        else if (buffers[i].getSize() == elementSize) {
+            vector<T> data(numObjects);
            CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
            for (int j = 0; j < numObjects; j++)
                values[j][base] = data[j];
@@ -120,36 +126,39 @@ void CudaParameterSet::getParameterValues(vector<vector<float> >& values) {
    }
 }

-void CudaParameterSet::setParameterValues(const vector<vector<float> >& values) {
+template <class T>
+void CudaParameterSet::setParameterValues(const vector<vector<T> >& values) {
+    if (sizeof(T) != elementSize)
+        throw OpenMMException("Called setParameterValues() with vector of wrong type");
    int base = 0;
    string errorMessage = "Error uploading parameter set "+name;
    for (int i = 0; i < (int) buffers.size(); i++) {
-        if (buffers[i].getType() == "float4") {
-            vector<float4> data(numObjects);
+        if (buffers[i].getSize() == 4*elementSize) {
+            vector<T> data(4*numObjects);
            for (int j = 0; j < numObjects; j++) {
-                data[j].x = values[j][base];
+                data[4*j] = values[j][base];
                if (base+1 < numParameters)
-                    data[j].y = values[j][base+1];
+                    data[4*j+1] = values[j][base+1];
                if (base+2 < numParameters)
-                    data[j].z = values[j][base+2];
+                    data[4*j+2] = values[j][base+2];
                if (base+3 < numParameters)
-                    data[j].w = values[j][base+3];
+                    data[4*j+3] = values[j][base+3];
            }
            CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
            base += 4;
        }
-        else if (buffers[i].getType() == "float2") {
-            vector<float2> data(numObjects);
+        else if (buffers[i].getSize() == 2*elementSize) {
+            vector<T> data(2*numObjects);
            for (int j = 0; j < numObjects; j++) {
-                data[j].x = values[j][base];
+                data[2*j] = values[j][base];
                if (base+1 < numParameters)
-                    data[j].y = values[j][base+1];
+                    data[2*j+1] = values[j][base+1];
            }
            CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
            base += 2;
        }
-        else if (buffers[i].getType() == "float") {
-            vector<float> data(numObjects);
+        else if (buffers[i].getSize() == elementSize) {
+            vector<T> data(numObjects);
            for (int j = 0; j < numObjects; j++)
                data[j] = values[j][base];
            CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
@@ -164,16 +173,26 @@ string CudaParameterSet::getParameterSuffix(int index, const std::string& extraS
    const string suffixes[] = {".x", ".y", ".z", ".w"};
    int buffer = -1;
    for (int i = 0; buffer == -1 && i < (int) buffers.size(); i++) {
-        if (index*sizeof(float) < buffers[i].getSize())
+        if (index*elementSize < buffers[i].getSize())
            buffer = i;
        else
-            index -= buffers[i].getSize()/sizeof(float);
+            index -= buffers[i].getSize()/elementSize;
    }
    if (buffer == -1)
        throw OpenMMException("Internal error: Illegal argument to CudaParameterSet::getParameterSuffix() ("+name+")");
    stringstream suffix;
    suffix << (buffer+1) << extraSuffix;
-    if (buffers[buffer].getType() != "float")
+    if (buffers[buffer].getSize() != elementSize)
        suffix << suffixes[index];
    return suffix.str();
 }
+
+/**
+ * Define template instantiations for float and double versions of getParameterValues() and setParameterValues().
+ */
+namespace OpenMM {
+template void CudaParameterSet::getParameterValues<float>(vector<vector<float> >& values);
+template void CudaParameterSet::setParameterValues<float>(const vector<vector<float> >& values);
+template void CudaParameterSet::getParameterValues<double>(vector<vector<double> >& values);
+template void CudaParameterSet::setParameterValues<double>(const vector<vector<double> >& values);
+}
\ No newline at end of file
--- a/platforms/cuda2/src/CudaParameterSet.h
+++ b/platforms/cuda2/src/CudaParameterSet.h
@@ -51,8 +51,9 @@ public:
     * @param name             the name of the parameter set
     * @param bufferPerParameter  if true, a separate buffer is created for each parameter.  If false,
     *                            multiple parameters may be combined into a single buffer.
+     * @param useDoublePrecision  whether values should be stored as single or double precision
     */
-    CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false);
+    CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false, bool useDoublePrecision=false);
    ~CudaParameterSet();
    /**
     * Get the number of parameters.
@@ -71,13 +72,15 @@ public:
     *
     * @param values on exit, values[i][j] contains the value of parameter j for object i
     */
-    void getParameterValues(std::vector<std::vector<float> >& values);
+    template <class T>
+    void getParameterValues(std::vector<std::vector<T> >& values);
    /**
     * Set the values of all parameters.
     *
     * @param values values[i][j] contains the value of parameter j for object i
     */
-    void setParameterValues(const std::vector<std::vector<float> >& values);
+    template <class T>
+    void setParameterValues(const std::vector<std::vector<T> >& values);
    /**
     * Get a set of CudaNonbondedUtilities::ParameterInfo objects which describe the Buffers
     * containing the data.
@@ -95,8 +98,7 @@ public:
    std::string getParameterSuffix(int index, const std::string& extraSuffix = "") const;
 private:
    CudaContext& context;
-    int numParameters;
-    int numObjects;
+    int numParameters, numObjects, elementSize;
    std::string name;
    std::vector<CudaNonbondedUtilities::ParameterInfo> buffers;
 };

--- a/platforms/cuda2/src/kernels/customHbondForce.cu
+++ b/platforms/cuda2/src/kernels/customHbondForce.cu
+/**
+ * Convert a real4 to a real3 by removing its last element.
+ */
+inline __device__ real3 trim(real4 v) {
+    return make_real3(v.x, v.y, v.z);
+}
+
+/**
+ * This does nothing, and just exists to simply the code generation.
+ */
+inline __device__ real3 trim(real3 v) {
+    return v;
+}
+
+/**
+ * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
+ */
+inline __device__ real4 delta(real4 vec1, real4 vec2) {
+    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
+    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
+    return result;
+}
+
+/**
+ * Compute the difference between two vectors, taking periodic boundary conditions into account
+ * and setting the fourth component to the squared magnitude.
+ */
+inline __device__ real4 deltaPeriodic(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
+#ifdef USE_PERIODIC
+    result.x -= floor(result.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+    result.y -= floor(result.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+    result.z -= floor(result.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
+    return result;
+}
+
+/**
+ * Compute the angle between two vectors.  The w component of each vector should contain the squared magnitude.
+ */
+inline __device__ real computeAngle(real4 vec1, real4 vec2) {
+    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
+    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
+    real angle;
+    if (cosine > 0.99f || cosine < -0.99f) {
+        // We're close to the singularity in acos(), so take the cross product and use asin() instead.
+
+        real3 crossProduct = cross(vec1, vec2);
+        real scale = vec1.w*vec2.w;
+        angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
+        if (cosine < 0.0f)
+            angle = M_PI-angle;
+    }
+    else
+       angle = acos(cosine);
+    return angle;
+}
+
+/**
+ * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
+ */
+inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
+    real3 result = cross(vec1, vec2);
+    return make_real4(result.x, result.y, result.z, result.x*result.x + result.y*result.y + result.z*result.z);
+}
+
+/**
+ * Compute forces on donors.
+ */
+extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize
+        PARAMETER_ARGUMENTS) {
+    extern __shared__ real4 posBuffer[];
+    real energy = 0;
+    real3 f1 = make_real3(0);
+    real3 f2 = make_real3(0);
+    real3 f3 = make_real3(0);
+    for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x*gridDim.x) {
+        // Load information about the donor this thread will compute forces on.
+
+        int donorIndex = donorStart+blockIdx.x*blockDim.x+threadIdx.x;
+        int4 atoms, exclusionIndices;
+        real4 d1, d2, d3;
+        if (donorIndex < NUM_DONORS) {
+            atoms = donorAtoms[donorIndex];
+            d1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
+            d2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
+            d3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
+#ifdef USE_EXCLUSIONS
+            exclusionIndices = exclusions[donorIndex];
+#endif
+        }
+        else
+            atoms = make_int4(-1, -1, -1, -1);
+        for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x) {
+            // Load the next block of acceptors into local memory.
+
+            int blockSize = min((int) blockDim.x, NUM_ACCEPTORS-acceptorStart);
+            if (threadIdx.x < blockSize) {
+                int4 atoms2 = acceptorAtoms[acceptorStart+threadIdx.x];
+                posBuffer[3*threadIdx.x] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
+                posBuffer[3*threadIdx.x+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
+                posBuffer[3*threadIdx.x+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
+            }
+            __syncthreads();
+            if (donorIndex < NUM_DONORS) {
+                for (int index = 0; index < blockSize; index++) {
+#ifdef USE_EXCLUSIONS
+                    int acceptorIndex = acceptorStart+index;
+                    if (acceptorIndex == exclusionIndices.x || acceptorIndex == exclusionIndices.y || acceptorIndex == exclusionIndices.z || acceptorIndex == exclusionIndices.w)
+                        continue;
+#endif
+                    // Compute the interaction between a donor and an acceptor.
+
+                    real4 a1 = posBuffer[3*index];
+                    real4 a2 = posBuffer[3*index+1];
+                    real4 a3 = posBuffer[3*index+2];
+                    real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
+#ifdef USE_CUTOFF
+                    if (deltaD1A1.w < CUTOFF_SQUARED) {
+#endif
+                        COMPUTE_DONOR_FORCE
+#ifdef USE_CUTOFF
+                    }
+#endif
+                }
+            }
+        }
+
+        // Write results
+
+        if (donorIndex < NUM_DONORS) {
+            if (atoms.x > -1) {
+                atomicAdd(&force[atoms.x], static_cast<unsigned long long>((long long) (f1.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+            if (atoms.y > -1) {
+                atomicAdd(&force[atoms.y], static_cast<unsigned long long>((long long) (f2.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+            if (atoms.z > -1) {
+                atomicAdd(&force[atoms.z], static_cast<unsigned long long>((long long) (f3.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.z+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.z+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+        }
+    }
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
+/**
+ * Compute forces on acceptors.
+ */
+extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize
+        PARAMETER_ARGUMENTS) {
+    extern __shared__ real4 posBuffer[];
+    real3 f1 = make_real3(0);
+    real3 f2 = make_real3(0);
+    real3 f3 = make_real3(0);
+    for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x*gridDim.x) {
+        // Load information about the acceptor this thread will compute forces on.
+
+        int acceptorIndex = acceptorStart+blockIdx.x*blockDim.x+threadIdx.x;
+        int4 atoms, exclusionIndices;
+        real4 a1, a2, a3;
+        if (acceptorIndex < NUM_ACCEPTORS) {
+            atoms = acceptorAtoms[acceptorIndex];
+            a1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
+            a2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
+            a3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
+#ifdef USE_EXCLUSIONS
+            exclusionIndices = exclusions[acceptorIndex];
+#endif
+        }
+        else
+            atoms = make_int4(-1, -1, -1, -1);
+        for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x) {
+            // Load the next block of donors into local memory.
+
+            int blockSize = min((int) blockDim.x, NUM_DONORS-donorStart);
+            if (threadIdx.x < blockSize) {
+                int4 atoms2 = donorAtoms[donorStart+threadIdx.x];
+                posBuffer[3*threadIdx.x] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
+                posBuffer[3*threadIdx.x+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
+                posBuffer[3*threadIdx.x+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
+            }
+            __syncthreads();
+            if (acceptorIndex < NUM_ACCEPTORS) {
+                for (int index = 0; index < blockSize; index++) {
+#ifdef USE_EXCLUSIONS
+                    int donorIndex = donorStart+index;
+                    if (donorIndex == exclusionIndices.x || donorIndex == exclusionIndices.y || donorIndex == exclusionIndices.z || donorIndex == exclusionIndices.w)
+                        continue;
+#endif
+                    // Compute the interaction between a donor and an acceptor.
+
+                    real4 d1 = posBuffer[3*index];
+                    real4 d2 = posBuffer[3*index+1];
+                    real4 d3 = posBuffer[3*index+2];
+                    real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
+#ifdef USE_CUTOFF
+                    if (deltaD1A1.w < CUTOFF_SQUARED) {
+#endif
+                        COMPUTE_ACCEPTOR_FORCE
+#ifdef USE_CUTOFF
+                    }
+#endif
+                }
+            }
+        }
+
+        // Write results
+
+        if (acceptorIndex < NUM_ACCEPTORS) {
+            if (atoms.x > -1) {
+                atomicAdd(&force[atoms.x], static_cast<unsigned long long>((long long) (f1.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+            if (atoms.y > -1) {
+                atomicAdd(&force[atoms.y], static_cast<unsigned long long>((long long) (f2.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+            if (atoms.z > -1) {
+                atomicAdd(&force[atoms.z], static_cast<unsigned long long>((long long) (f3.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.z+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.z+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+        }
+    }
+}
--- a/platforms/cuda2/src/kernels/customIntegrator.cu
+++ b/platforms/cuda2/src/kernels/customIntegrator.cu
+extern "C" __global__ void computeSum(const real* __restrict__ sumBuffer, real* result) {
+    __shared__ real tempBuffer[WORK_GROUP_SIZE];
+    const unsigned int thread = threadIdx.x;
+    real sum = 0;
+    for (unsigned int index = thread; index < SUM_BUFFER_SIZE; index += blockDim.x)
+        sum += sumBuffer[index];
+    tempBuffer[thread] = sum;
+    for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
+        __syncthreads();
+        if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
+            tempBuffer[thread] += tempBuffer[thread+i];
+    }
+    if (thread == 0)
+        result[SUM_OUTPUT_INDEX] = tempBuffer[0];
+}
+
+extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posDelta) {
+    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        real4 position = posq[index];
+        position.x += posDelta[index].x;
+        position.y += posDelta[index].y;
+        position.z += posDelta[index].z;
+        posq[index] = position;
+        posDelta[index] = make_real4(0, 0, 0, 0);
+    }
+}
+
+extern "C" __global__ void generateRandomNumbers(float4* __restrict__ random, uint4* __restrict__ seed) {
+    uint4 state = seed[blockIdx.x*blockDim.x+threadIdx.x];
+    unsigned int carry = 0;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        // Generate three uniform random numbers.
+
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        unsigned int m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x1 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x2 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x3 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+
+        // Record the values.
+
+        random[index] = make_float4(x1, x2, x3, 0.0f);
+    }
+    seed[blockIdx.x*blockDim.x+threadIdx.x] = state;
+}
--- a/platforms/cuda2/src/kernels/customIntegratorGlobal.cu
+++ b/platforms/cuda2/src/kernels/customIntegratorGlobal.cu
+extern "C" __global__ void computeGlobal(real2* __restrict__ dt, real* __restrict__ globals, real* __restrict__ params,
+        float uniform, float gaussian, const real* __restrict__ energy) {
+    COMPUTE_STEP
+}
--- a/platforms/cuda2/src/kernels/customIntegratorPerDof.cu
+++ b/platforms/cuda2/src/kernels/customIntegratorPerDof.cu
+inline __device__ double4 convertToDouble4(real4 a) {
+    return make_double4(a.x, a.y, a.z, a.w);
+}
+
+inline __device__ real4 convertFromDouble4(double4 a) {
+    return make_real4(a.x, a.y, a.z, a.w);
+}
+
+extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posDelta, real4* __restrict__ velm,
+        const long long* __restrict__ force, const real2* __restrict__ dt, const real* __restrict__ globals,
+        const real* __restrict__ params, real* __restrict__ sum, const float4* __restrict__ gaussianValues,
+        unsigned int randomIndex, const float4* __restrict__ uniformValues, const real* __restrict__ energy
+        PARAMETER_ARGUMENTS) {
+    real stepSize = dt[0].y;
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    randomIndex += index;
+    const double forceScale = 1.0/0xFFFFFFFF;
+    while (index < NUM_ATOMS) {
+#ifdef LOAD_POS_AS_DELTA
+        double4 position = convertToDouble4(posq[index]+posDelta[index]);
+#else
+        double4 position = convertToDouble4(posq[index]);
+#endif
+        double4 velocity = convertToDouble4(velm[index]);
+        double4 f = make_double4(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2], 0.0);
+        double mass = 1.0/velocity.w;
+        if (velocity.w != 0.0) {
+            float4 gaussian = gaussianValues[randomIndex];
+            float4 uniform = uniformValues[index];
+            COMPUTE_STEP
+        }
+        randomIndex += blockDim.x*gridDim.x;
+        index += blockDim.x*gridDim.x;
+    }
+}
--- a/platforms/cuda2/src/kernels/customNonbonded.cu
+++ b/platforms/cuda2/src/kernels/customNonbonded.cu
+#ifdef USE_CUTOFF
+if (!isExcluded && r2 < CUTOFF_SQUARED) {
+#else
+if (!isExcluded) {
+#endif
+    real tempForce = 0;
+    COMPUTE_FORCE
+    dEdR += tempForce*invR;
+}
--- a/platforms/cuda2/tests/TestCudaCustomHbondForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomHbondForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CUDA implementation of CustomHbondForce.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/CustomHbondForce.h"
+#include "openmm/HarmonicAngleForce.h"
+#include "openmm/HarmonicBondForce.h"
+#include "openmm/PeriodicTorsionForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+const double TOL = 1e-5;
+
+void testHbond() {
+    CudaPlatform platform;
+
+    // Create a system using a CustomHbondForce.
+
+    System customSystem;
+    customSystem.addParticle(1.0);
+    customSystem.addParticle(1.0);
+    customSystem.addParticle(1.0);
+    customSystem.addParticle(1.0);
+    customSystem.addParticle(1.0);
+    CustomHbondForce* custom = new CustomHbondForce("0.5*kr*(distance(d1,a1)-r0)^2 + 0.5*ktheta*(angle(a1,d1,d2)-theta0)^2 + 0.5*kpsi*(angle(d1,a1,a2)-psi0)^2 + kchi*(1+cos(n*dihedral(a3,a2,a1,d1)-chi0))");
+    custom->addPerDonorParameter("r0");
+    custom->addPerDonorParameter("theta0");
+    custom->addPerDonorParameter("psi0");
+    custom->addPerAcceptorParameter("chi0");
+    custom->addPerAcceptorParameter("n");
+    custom->addGlobalParameter("kr", 0.4);
+    custom->addGlobalParameter("ktheta", 0.5);
+    custom->addGlobalParameter("kpsi", 0.6);
+    custom->addGlobalParameter("kchi", 0.7);
+    vector<double> parameters(3);
+    parameters[0] = 1.5;
+    parameters[1] = 1.7;
+    parameters[2] = 1.9;
+    custom->addDonor(1, 0, -1, parameters);
+    parameters.resize(2);
+    parameters[0] = 2.1;
+    parameters[1] = 2;
+    custom->addAcceptor(2, 3, 4, parameters);
+    custom->setCutoffDistance(10.0);
+    customSystem.addForce(custom);
+
+    // Create an identical system using HarmonicBondForce, HarmonicAngleForce, and PeriodicTorsionForce.
+
+    System standardSystem;
+    standardSystem.addParticle(1.0);
+    standardSystem.addParticle(1.0);
+    standardSystem.addParticle(1.0);
+    standardSystem.addParticle(1.0);
+    standardSystem.addParticle(1.0);
+    HarmonicBondForce* bond = new HarmonicBondForce();
+    bond->addBond(1, 2, 1.5, 0.4);
+    standardSystem.addForce(bond);
+    HarmonicAngleForce* angle = new HarmonicAngleForce();
+    angle->addAngle(0, 1, 2, 1.7, 0.5);
+    angle->addAngle(1, 2, 3, 1.9, 0.6);
+    standardSystem.addForce(angle);
+    PeriodicTorsionForce* torsion = new PeriodicTorsionForce();
+    torsion->addTorsion(1, 2, 3, 4, 2, 2.1, 0.7);
+    standardSystem.addForce(torsion);
+
+    // Set the atoms in various positions, and verify that both systems give identical forces and energy.
+
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    vector<Vec3> positions(5);
+    VerletIntegrator integrator1(0.01);
+    VerletIntegrator integrator2(0.01);
+    Context c1(customSystem, integrator1, platform);
+    Context c2(standardSystem, integrator2, platform);
+    for (int i = 0; i < 10; i++) {
+        for (int j = 0; j < (int) positions.size(); j++)
+            positions[j] = Vec3(2.0*genrand_real2(sfmt), 2.0*genrand_real2(sfmt), 2.0*genrand_real2(sfmt));
+        c1.setPositions(positions);
+        c2.setPositions(positions);
+        State s1 = c1.getState(State::Forces | State::Energy);
+        State s2 = c2.getState(State::Forces | State::Energy);
+        for (int i = 0; i < customSystem.getNumParticles(); i++)
+            ASSERT_EQUAL_VEC(s2.getForces()[i], s1.getForces()[i], TOL);
+        ASSERT_EQUAL_TOL(s2.getPotentialEnergy(), s1.getPotentialEnergy(), TOL);
+    }
+    
+    // Try changing the parameters and make sure it's still correct.
+    
+    parameters.resize(3);
+    parameters[0] = 1.4;
+    parameters[1] = 1.7;
+    parameters[2] = 1.9;
+    custom->setDonorParameters(0, 1, 0, -1, parameters);
+    parameters.resize(2);
+    parameters[0] = 2.2;
+    parameters[1] = 2;
+    custom->setAcceptorParameters(0, 2, 3, 4, parameters);
+    bond->setBondParameters(0, 1, 2, 1.4, 0.4);
+    torsion->setTorsionParameters(0, 1, 2, 3, 4, 2, 2.2, 0.7);
+    custom->updateParametersInContext(c1);
+    bond->updateParametersInContext(c2);
+    torsion->updateParametersInContext(c2);
+    State s1 = c1.getState(State::Forces | State::Energy);
+    State s2 = c2.getState(State::Forces | State::Energy);
+    for (int i = 0; i < customSystem.getNumParticles(); i++)
+        ASSERT_EQUAL_VEC(s2.getForces()[i], s1.getForces()[i], TOL);
+    ASSERT_EQUAL_TOL(s2.getPotentialEnergy(), s1.getPotentialEnergy(), TOL);
+}
+
+void testExclusions() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomHbondForce* custom = new CustomHbondForce("(distance(d1,a1)-1)^2");
+    custom->addDonor(0, 1, -1, vector<double>());
+    custom->addDonor(1, 0, -1, vector<double>());
+    custom->addAcceptor(2, 0, -1, vector<double>());
+    custom->addExclusion(1, 0);
+    system.addForce(custom);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(3);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(0, 2, 0);
+    positions[2] = Vec3(2, 0, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(2, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-2, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(1.0, state.getPotentialEnergy(), TOL);
+}
+
+void testCutoff() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomHbondForce* custom = new CustomHbondForce("(distance(d1,a1)-1)^2");
+    custom->addDonor(0, 1, -1, vector<double>());
+    custom->addDonor(1, 0, -1, vector<double>());
+    custom->addAcceptor(2, 0, -1, vector<double>());
+    custom->setNonbondedMethod(CustomHbondForce::CutoffNonPeriodic);
+    custom->setCutoffDistance(2.5);
+    system.addForce(custom);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(3);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(0, 3, 0);
+    positions[2] = Vec3(2, 0, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(2, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-2, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(1.0, state.getPotentialEnergy(), TOL);
+}
+
+void testCustomFunctions() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomHbondForce* custom = new CustomHbondForce("foo(distance(d1,a1))");
+    custom->addDonor(1, 0, -1, vector<double>());
+    custom->addDonor(2, 0, -1, vector<double>());
+    custom->addAcceptor(0, 1, -1, vector<double>());
+    vector<double> function(2);
+    function[0] = 0;
+    function[1] = 1;
+    custom->addFunction("foo", function, 0, 10);
+    system.addForce(custom);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(3);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(0, 2, 0);
+    positions[2] = Vec3(2, 0, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(0.1, 0.1, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, -0.1, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-0.1, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(0.1*2+0.1*2, state.getPotentialEnergy(), TOL);
+}
+
+int main() {
+    try {
+        testHbond();
+        testExclusions();
+        testCutoff();
+        testCustomFunctions();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
--- a/platforms/cuda2/tests/TestCudaCustomIntegrator.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CUDA implementation of CustomIntegrator.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/AndersenThermostat.h"
+#include "openmm/HarmonicBondForce.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/CustomIntegrator.h"
+#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+const double TOL = 1e-5;
+
+/**
+ * Test a simple leapfrog integrator on a single bond.
+ */
+void testSingleBond() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(2.0);
+    system.addParticle(2.0);
+    CustomIntegrator integrator(0.01);
+    integrator.addComputePerDof("v", "v+dt*f/m");
+    integrator.addComputePerDof("x", "x+dt*v");
+    HarmonicBondForce* forceField = new HarmonicBondForce();
+    forceField->addBond(0, 1, 1.5, 1);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(-1, 0, 0);
+    positions[1] = Vec3(1, 0, 0);
+    context.setPositions(positions);
+    
+    // This is simply a harmonic oscillator, so compare it to the analytical solution.
+    
+    const double freq = 1.0;;
+    State state = context.getState(State::Energy);
+    const double initialEnergy = state.getKineticEnergy()+state.getPotentialEnergy();
+    for (int i = 0; i < 1000; ++i) {
+        state = context.getState(State::Positions | State::Velocities | State::Energy);
+        double time = state.getTime();
+        double expectedDist = 1.5+0.5*std::cos(freq*time);
+        ASSERT_EQUAL_VEC(Vec3(-0.5*expectedDist, 0, 0), state.getPositions()[0], 0.02);
+        ASSERT_EQUAL_VEC(Vec3(0.5*expectedDist, 0, 0), state.getPositions()[1], 0.02);
+        double expectedSpeed = -0.5*freq*std::sin(freq*time);
+        ASSERT_EQUAL_VEC(Vec3(-0.5*expectedSpeed, 0, 0), state.getVelocities()[0], 0.02);
+        ASSERT_EQUAL_VEC(Vec3(0.5*expectedSpeed, 0, 0), state.getVelocities()[1], 0.02);
+        double energy = state.getKineticEnergy()+state.getPotentialEnergy();
+        ASSERT_EQUAL_TOL(initialEnergy, energy, 0.01);
+        integrator.step(1);
+    }
+}
+
+/**
+ * Test an integrator that enforces constraints.
+ */
+void testConstraints() {
+    const int numParticles = 8;
+    const double temp = 500.0;
+    CudaPlatform platform;
+    System system;
+    CustomIntegrator integrator(0.002);
+    integrator.addPerDofVariable("oldx", 0);
+    integrator.addComputePerDof("v", "v+dt*f/m");
+    integrator.addComputePerDof("oldx", "x");
+    integrator.addComputePerDof("x", "x+dt*v");
+    integrator.addConstrainPositions();
+    integrator.addComputePerDof("v", "(x-oldx)/dt");
+    integrator.setConstraintTolerance(1e-5);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(i%2 == 0 ? 5.0 : 10.0);
+        forceField->addParticle((i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
+    }
+    for (int i = 0; i < numParticles-1; ++i)
+        system.addConstraint(i, i+1, 1.0);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    for (int i = 0; i < numParticles; ++i) {
+        positions[i] = Vec3(i/2, (i+1)/2, 0);
+        velocities[i] = Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
+    }
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    
+    // Simulate it and see whether the constraints remain satisfied.
+    
+    double initialEnergy = 0.0;
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Positions | State::Energy);
+        for (int j = 0; j < system.getNumConstraints(); ++j) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(j, particle1, particle2, distance);
+            Vec3 p1 = state.getPositions()[particle1];
+            Vec3 p2 = state.getPositions()[particle2];
+            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
+            ASSERT_EQUAL_TOL(distance, dist, 2e-5);
+        }
+        double energy = state.getKineticEnergy()+state.getPotentialEnergy();
+        if (i == 1)
+            initialEnergy = energy;
+        else if (i > 1)
+            ASSERT_EQUAL_TOL(initialEnergy, energy, 0.01);
+        integrator.step(1);
+    }
+}
+
+/**
+ * Test an integrator that applies constraints directly to velocities.
+ */
+void testVelocityConstraints() {
+    const int numParticles = 10;
+    CudaPlatform platform;
+    System system;
+    CustomIntegrator integrator(0.002);
+    integrator.addPerDofVariable("x1", 0);
+    integrator.addComputePerDof("v", "v+0.5*dt*f/m");
+    integrator.addComputePerDof("x", "x+dt*v");
+    integrator.addComputePerDof("x1", "x");
+    integrator.addConstrainPositions();
+    integrator.addComputePerDof("v", "v+0.5*dt*f/m+(x-x1)/dt");
+    integrator.addConstrainVelocities();
+    integrator.setConstraintTolerance(1e-5);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(i%2 == 0 ? 5.0 : 10.0);
+        forceField->addParticle((i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
+    }
+    
+    // Constrain the first three particles with SHAKE.
+    
+    system.addConstraint(0, 1, 1.0);
+    system.addConstraint(1, 2, 1.0);
+    
+    // Constrain the next three with SETTLE.
+    
+    system.addConstraint(3, 4, 1.0);
+    system.addConstraint(5, 4, 1.0);
+    system.addConstraint(3, 5, sqrt(2.0));
+    
+    // Constraint the rest with CCMA.
+    
+    for (int i = 6; i < numParticles-1; ++i)
+        system.addConstraint(i, i+1, 1.0);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    for (int i = 0; i < numParticles; ++i) {
+        positions[i] = Vec3(i/2, (i+1)/2, 0);
+        velocities[i] = Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
+    }
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    
+    // Simulate it and see whether the constraints remain satisfied.
+    
+    double initialEnergy = 0.0;
+    for (int i = 0; i < 1000; ++i) {
+        integrator.step(2);
+        State state = context.getState(State::Positions | State::Velocities | State::Energy);
+        for (int j = 0; j < system.getNumConstraints(); ++j) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(j, particle1, particle2, distance);
+            Vec3 p1 = state.getPositions()[particle1];
+            Vec3 p2 = state.getPositions()[particle2];
+            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
+            ASSERT_EQUAL_TOL(distance, dist, 2e-5);
+            if (i > 0) {
+                Vec3 v1 = state.getVelocities()[particle1];
+                Vec3 v2 = state.getVelocities()[particle2];
+                double vel = (v1-v2).dot(p1-p2);
+                ASSERT_EQUAL_TOL(0.0, vel, 2e-5);
+            }
+        }
+        double energy = state.getKineticEnergy()+state.getPotentialEnergy();
+        if (i == 0)
+            initialEnergy = energy;
+        else if (i > 0)
+            ASSERT_EQUAL_TOL(initialEnergy, energy, 0.01);
+    }
+}
+
+/**
+ * Test an integrator with an AndersenThermostat to see if updateContextState()
+ * is being handled correctly.
+ */
+void testWithThermostat() {
+    const int numParticles = 8;
+    const double temp = 100.0;
+    const double collisionFreq = 10.0;
+    const int numSteps = 10000;
+    CudaPlatform platform;
+    System system;
+    CustomIntegrator integrator(0.005);
+    integrator.addUpdateContextState();
+    integrator.addComputePerDof("v", "v+dt*f/m");
+    integrator.addComputePerDof("x", "x+dt*v");
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(2.0);
+        forceField->addParticle((i%2 == 0 ? 1.0 : -1.0), 1.0, 5.0);
+    }
+    system.addForce(forceField);
+    AndersenThermostat* thermostat = new AndersenThermostat(temp, collisionFreq);
+    system.addForce(thermostat);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numParticles; ++i)
+        positions[i] = Vec3((i%2 == 0 ? 2 : -2), (i%4 < 2 ? 2 : -2), (i < 4 ? 2 : -2));
+    context.setPositions(positions);
+    
+    // Let it equilibrate.
+    
+    integrator.step(10000);
+    
+    // Now run it for a while and see if the temperature is correct.
+    
+    double ke = 0.0;
+    for (int i = 0; i < numSteps; ++i) {
+        State state = context.getState(State::Energy);
+        ke += state.getKineticEnergy();
+        integrator.step(1);
+    }
+    ke /= numSteps;
+    double expected = 0.5*numParticles*3*BOLTZ*temp;
+    ASSERT_USUALLY_EQUAL_TOL(expected, ke, 6/std::sqrt((double) numSteps));
+}
+
+/**
+ * Test a Monte Carlo integrator that uses global variables and depends on energy.
+ */
+void testMonteCarlo() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    CustomIntegrator integrator(0.1);
+    const double kT = BOLTZ*300.0;
+    integrator.addGlobalVariable("kT", kT);
+    integrator.addGlobalVariable("oldE", 0);
+    integrator.addGlobalVariable("accept", 0);
+    integrator.addPerDofVariable("oldx", 0);
+    integrator.addComputeGlobal("oldE", "energy");
+    integrator.addComputePerDof("oldx", "x");
+    integrator.addComputePerDof("x", "x+dt*gaussian");
+    integrator.addComputeGlobal("accept", "step(exp((oldE-energy)/kT)-uniform)");
+    integrator.addComputePerDof("x", "accept*x + (1-accept)*oldx");
+    HarmonicBondForce* forceField = new HarmonicBondForce();
+    forceField->addBond(0, 1, 2.0, 10.0);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(-1, 0, 0);
+    positions[1] = Vec3(1, 0, 0);
+    context.setPositions(positions);
+    
+    // Compute the histogram of distances and see if it satisfies a Boltzmann distribution.
+    
+    const int numBins = 100;
+    const double maxDist = 4.0;
+    const int numIterations = 5000;
+    vector<int> counts(numBins, 0);
+    for (int i = 0; i < numIterations; ++i) {
+        integrator.step(10);
+        State state = context.getState(State::Positions);
+        Vec3 delta = state.getPositions()[0]-state.getPositions()[1];
+        double dist = sqrt(delta.dot(delta));
+        if (dist < maxDist)
+            counts[(int) (numBins*dist/maxDist)]++;
+    }
+    vector<double> expected(numBins, 0);
+    double sum = 0;
+    for (int i = 0; i < numBins; i++) {
+        double dist = (i+0.5)*maxDist/numBins;
+        expected[i] = dist*dist*exp(-5.0*(dist-2)*(dist-2)/kT);
+        sum += expected[i];
+    }
+    for (int i = 0; i < numBins; i++)
+        ASSERT_USUALLY_EQUAL_TOL((double) counts[i]/numIterations, expected[i]/sum, 0.01);
+}
+
+/**
+ * Test the ComputeSum operation.
+ */
+void testSum() {
+    const int numParticles = 200;
+    const double boxSize = 10;
+    CudaPlatform platform;
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    NonbondedForce* nb = new NonbondedForce();
+    system.addForce(nb);
+    vector<Vec3> positions(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.5);
+        nb->addParticle(i%2 == 0 ? 1 : -1, 0.1, 1);
+        bool close = true;
+        while (close) {
+            positions[i] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
+            close = false;
+            for (int j = 0; j < i; ++j) {
+                Vec3 delta = positions[i]-positions[j];
+                if (delta.dot(delta) < 0.1)
+                    close = true;
+            }
+        }
+    }
+    CustomIntegrator integrator(0.01);
+    integrator.addGlobalVariable("ke", 0);
+    integrator.addComputePerDof("v", "v+dt*f/m");
+    integrator.addComputePerDof("x", "x+dt*v");
+    integrator.addComputeSum("ke", "m*v*v/2");
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    
+    // See if the sum is being computed correctly.
+    
+    State state = context.getState(State::Energy);
+    const double initialEnergy = state.getKineticEnergy()+state.getPotentialEnergy();
+    for (int i = 0; i < 100; ++i) {
+        state = context.getState(State::Energy);
+        ASSERT_EQUAL_TOL(state.getKineticEnergy(), integrator.getGlobalVariable(0), 1e-5);
+        integrator.step(1);
+    }
+}
+
+/**
+ * Test an integrator that both uses and modifies a context parameter.
+ */
+void testParameter() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    AndersenThermostat* thermostat = new AndersenThermostat(0.1, 0.1);
+    system.addForce(thermostat);
+    CustomIntegrator integrator(0.1);
+    integrator.addGlobalVariable("temp", 0);
+    integrator.addComputeGlobal("temp", "AndersenTemperature");
+    integrator.addComputeGlobal("AndersenTemperature", "temp*2");
+    Context context(system, integrator, platform);
+    
+    // See if the parameter is being used correctly.
+    
+    for (int i = 0; i < 10; i++) {
+        integrator.step(1);
+        ASSERT_EQUAL_TOL(context.getParameter("AndersenTemperature"), 0.1*(1<<(i+1)), 1e-5);
+    }
+}
+
+/**
+ * Test random number distributions.
+ */
+void testRandomDistributions() {
+    const int numParticles = 100;
+    const int numBins = 20;
+    const int numSteps = 100;
+    CudaPlatform platform;
+    System system;
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(1.0);
+    CustomIntegrator integrator(0.1);
+    integrator.addPerDofVariable("a", 0);
+    integrator.addPerDofVariable("b", 0);
+    integrator.addComputePerDof("a", "uniform");
+    integrator.addComputePerDof("b", "gaussian");
+    Context context(system, integrator, platform);
+    
+    // See if the random numbers are distributed correctly.
+    
+    vector<int> bins(numBins);
+    double mean = 0.0;
+    double var = 0.0;
+    double skew = 0.0;
+    double kurtosis = 0.0;
+    vector<Vec3> values;
+    for (int i = 0; i < numSteps; i++) {
+        integrator.step(1);
+        integrator.getPerDofVariable(0, values);
+        for (int i = 0; i < numParticles; i++)
+            for (int j = 0; j < 3; j++) {
+                double v = values[i][j];
+                ASSERT(v >= 0 && v < 1);
+                bins[(int) (v*numBins)]++;
+            }
+        integrator.getPerDofVariable(1, values);
+        for (int i = 0; i < numParticles; i++)
+            for (int j = 0; j < 3; j++) {
+                double v = values[i][j];
+                mean += v;
+                var += v*v;
+                skew += v*v*v;
+                kurtosis += v*v*v*v;
+            }
+    }
+    
+    // Check the distribution of uniform randoms.
+    
+    int numValues = numParticles*numSteps*3;
+    double expected = numValues/(double) numBins;
+    double tol = 4*sqrt(expected);
+    for (int i = 0; i < numBins; i++)
+        ASSERT(bins[i] >= expected-tol && bins[i] <= expected+tol);
+    
+    // Check the distribution of gaussian randoms.
+    
+    mean /= numValues;
+    var /= numValues;
+    skew /= numValues;
+    kurtosis /= numValues;
+    double c2 = var-mean*mean;
+    double c3 = skew-3*var*mean+2*mean*mean*mean;
+    double c4 = kurtosis-4*skew*mean-3*var*var+12*var*mean*mean-6*mean*mean*mean*mean;
+    ASSERT_EQUAL_TOL(0.0, mean, 3.0/sqrt((double) numValues));
+    ASSERT_EQUAL_TOL(1.0, c2, 3.0/pow(numValues, 1.0/3.0));
+    ASSERT_EQUAL_TOL(0.0, c3, 3.0/pow(numValues, 1.0/4.0));
+    ASSERT_EQUAL_TOL(0.0, c4, 3.0/pow(numValues, 1.0/4.0));
+}
+
+/**
+ * Test getting and setting per-DOF variables.
+ */
+void testPerDofVariables() {
+    const int numParticles = 200;
+    const double boxSize = 10;
+    CudaPlatform platform;
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    NonbondedForce* nb = new NonbondedForce();
+    system.addForce(nb);
+    nb->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
+    vector<Vec3> positions(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.5);
+        nb->addParticle(i%2 == 0 ? 1 : -1, 0.1, 1);
+        bool close = true;
+        while (close) {
+            positions[i] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
+            close = false;
+            for (int j = 0; j < i; ++j) {
+                Vec3 delta = positions[i]-positions[j];
+                if (delta.dot(delta) < 0.1)
+                    close = true;
+            }
+        }
+    }
+    CustomIntegrator integrator(0.01);
+    integrator.addPerDofVariable("temp", 0);
+    integrator.addPerDofVariable("pos", 0);
+    integrator.addComputePerDof("v", "v+dt*f/m");
+    integrator.addComputePerDof("x", "x+dt*v");
+    integrator.addComputePerDof("pos", "x");
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    vector<Vec3> initialValues(numParticles);
+    for (int i = 0; i < numParticles; i++)
+        initialValues[i] = Vec3(i+0.1, i+0.2, i+0.3);
+    integrator.setPerDofVariable(0, initialValues);
+    
+    // Run a simulation, then query per-DOF values and see if they are correct.
+    
+    vector<Vec3> values;
+    context.getState(State::Forces); // Cause atom reordering to happen before the first step
+    for (int i = 0; i < 200; ++i) {
+        integrator.step(1);
+        State state = context.getState(State::Positions);
+        integrator.getPerDofVariable(0, values);
+        for (int j = 0; j < numParticles; j++)
+            ASSERT_EQUAL_VEC(initialValues[j], values[j], 1e-5);
+        integrator.getPerDofVariable(1, values);
+        for (int j = 0; j < numParticles; j++)
+            ASSERT_EQUAL_VEC(state.getPositions()[j], values[j], 1e-5);
+    }
+}
+
+/**
+ * Test evaluating force groups separately.
+ */
+void testForceGroups() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(2.0);
+    system.addParticle(2.0);
+    CustomIntegrator integrator(0.01);
+    integrator.addPerDofVariable("outf", 0);
+    integrator.addPerDofVariable("outf1", 0);
+    integrator.addPerDofVariable("outf2", 0);
+    integrator.addGlobalVariable("oute", 0);
+    integrator.addGlobalVariable("oute1", 0);
+    integrator.addGlobalVariable("oute2", 0);
+    integrator.addComputePerDof("outf", "f");
+    integrator.addComputePerDof("outf1", "f1");
+    integrator.addComputePerDof("outf2", "f2");
+    integrator.addComputeGlobal("oute", "energy");
+    integrator.addComputeGlobal("oute1", "energy1");
+    integrator.addComputeGlobal("oute2", "energy2");
+    HarmonicBondForce* bonds = new HarmonicBondForce();
+    bonds->addBond(0, 1, 1.5, 1.1);
+    bonds->setForceGroup(1);
+    system.addForce(bonds);
+    NonbondedForce* nb = new NonbondedForce();
+    nb->addParticle(0.2, 1, 0);
+    nb->addParticle(0.2, 1, 0);
+    nb->setForceGroup(2);
+    system.addForce(nb);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(-1, 0, 0);
+    positions[1] = Vec3(1, 0, 0);
+    context.setPositions(positions);
+    
+    // See if the various forces are computed correctly.
+    
+    integrator.step(1);
+    vector<Vec3> f, f1, f2;
+    double e1 = 0.5*1.1*0.5*0.5;
+    double e2 = 138.935456*0.2*0.2/2.0;
+    integrator.getPerDofVariable(0, f);
+    integrator.getPerDofVariable(1, f1);
+    integrator.getPerDofVariable(2, f2);
+    ASSERT_EQUAL_VEC(Vec3(1.1*0.5, 0, 0), f1[0], 1e-5);
+    ASSERT_EQUAL_VEC(Vec3(-1.1*0.5, 0, 0), f1[1], 1e-5);
+    ASSERT_EQUAL_VEC(Vec3(-138.935456*0.2*0.2/4.0, 0, 0), f2[0], 1e-5);
+    ASSERT_EQUAL_VEC(Vec3(138.935456*0.2*0.2/4.0, 0, 0), f2[1], 1e-5);
+    ASSERT_EQUAL_VEC(f1[0]+f2[0], f[0], 1e-5);
+    ASSERT_EQUAL_VEC(f1[1]+f2[1], f[1], 1e-5);
+    ASSERT_EQUAL_TOL(e1, integrator.getGlobalVariable(1), 1e-5);
+    ASSERT_EQUAL_TOL(e2, integrator.getGlobalVariable(2), 1e-5);
+    ASSERT_EQUAL_TOL(e1+e2, integrator.getGlobalVariable(0), 1e-5);
+    
+    // Make sure they also match the values returned by the Context.
+    
+    State s = context.getState(State::Forces | State::Energy, false);
+    State s1 = context.getState(State::Forces | State::Energy, false, 2);
+    State s2 = context.getState(State::Forces | State::Energy, false, 4);
+    vector<Vec3> c, c1, c2;
+    c = context.getState(State::Forces, false).getForces();
+    c1 = context.getState(State::Forces, false, 2).getForces();
+    c2 = context.getState(State::Forces, false, 4).getForces();
+    ASSERT_EQUAL_VEC(f[0], c[0], 1e-5);
+    ASSERT_EQUAL_VEC(f[1], c[1], 1e-5);
+    ASSERT_EQUAL_VEC(f1[0], c1[0], 1e-5);
+    ASSERT_EQUAL_VEC(f1[1], c1[1], 1e-5);
+    ASSERT_EQUAL_VEC(f2[0], c2[0], 1e-5);
+    ASSERT_EQUAL_VEC(f2[1], c2[1], 1e-5);
+    ASSERT_EQUAL_TOL(s.getPotentialEnergy(), integrator.getGlobalVariable(0), 1e-5);
+    ASSERT_EQUAL_TOL(s1.getPotentialEnergy(), integrator.getGlobalVariable(1), 1e-5);
+    ASSERT_EQUAL_TOL(s2.getPotentialEnergy(), integrator.getGlobalVariable(2), 1e-5);
+}
+
+/**
+ * Test a multiple time step r-RESPA integrator.
+ */
+void testRespa() {
+    const int numParticles = 8;
+    CudaPlatform platform;
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(4, 0, 0), Vec3(0, 4, 0), Vec3(0, 0, 4));
+    CustomIntegrator integrator(0.002);
+    integrator.addComputePerDof("v", "v+0.5*dt*f1/m");
+    for (int i = 0; i < 2; i++) {
+        integrator.addComputePerDof("v", "v+0.5*(dt/2)*f0/m");
+        integrator.addComputePerDof("x", "x+(dt/2)*v");
+        integrator.addComputePerDof("v", "v+0.5*(dt/2)*f0/m");
+    }
+    integrator.addComputePerDof("v", "v+0.5*dt*f1/m");
+    HarmonicBondForce* bonds = new HarmonicBondForce();
+    for (int i = 0; i < numParticles-2; i++)
+        bonds->addBond(i, i+1, 1.0, 0.5);
+    system.addForce(bonds);
+    NonbondedForce* nb = new NonbondedForce();
+    nb->setCutoffDistance(2.0);
+    nb->setNonbondedMethod(NonbondedForce::Ewald);
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(i%2 == 0 ? 5.0 : 10.0);
+        nb->addParticle((i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
+    }
+    nb->setForceGroup(1);
+    nb->setReciprocalSpaceForceGroup(0);
+    system.addForce(nb);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    for (int i = 0; i < numParticles; ++i) {
+        positions[i] = Vec3(i/2, (i+1)/2, 0);
+        velocities[i] = Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
+    }
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    
+    // Simulate it and monitor energy conservations.
+    
+    double initialEnergy = 0.0;
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Energy);
+        double energy = state.getKineticEnergy()+state.getPotentialEnergy();
+        if (i == 1)
+            initialEnergy = energy;
+        else if (i > 1)
+            ASSERT_EQUAL_TOL(initialEnergy, energy, 0.05);
+        integrator.step(2);
+    }
+}
+
+int main() {
+    try {
+        testSingleBond();
+        testConstraints();
+        testVelocityConstraints();
+        testWithThermostat();
+        testMonteCarlo();
+        testSum();
+        testParameter();
+        testRandomDistributions();
+        testPerDofVariables();
+        testForceGroups();
+        testRespa();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
--- a/platforms/cuda2/tests/TestCudaCustomNonbondedForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomNonbondedForce.cpp
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests all the different force terms in the CUDA implementation of CustomNonbondedForce.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "sfmt/SFMT.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/CustomNonbondedForce.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+const double TOL = 1e-5;
+
+void testSimpleExpression() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* forceField = new CustomNonbondedForce("-0.1*r^3");
+    forceField->addParticle(vector<double>());
+    forceField->addParticle(vector<double>());
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(2, 0, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    double force = 0.1*3*(2*2);
+    ASSERT_EQUAL_VEC(Vec3(-force, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(force, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_TOL(-0.1*(2*2*2), state.getPotentialEnergy(), TOL);
+}
+
+void testParameters() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* forceField = new CustomNonbondedForce("scale*a*(r*b)^3; a=a1*a2; b=c+b1+b2");
+    forceField->addPerParticleParameter("a");
+    forceField->addPerParticleParameter("b");
+    forceField->addGlobalParameter("scale", 3.0);
+    forceField->addGlobalParameter("c", -1.0);
+    vector<double> params(2);
+    params[0] = 1.5;
+    params[1] = 2.0;
+    forceField->addParticle(params);
+    params[0] = 2.0;
+    params[1] = 3.0;
+    forceField->addParticle(params);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(2, 0, 0);
+    context.setPositions(positions);
+    context.setParameter("scale", 1.0);
+    context.setParameter("c", 0.0);
+    State state = context.getState(State::Forces | State::Energy);
+    vector<Vec3> forces = state.getForces();
+    double force = -3.0*3*5.0*(10*10);
+    ASSERT_EQUAL_VEC(Vec3(-force, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(force, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_TOL(3.0*(10*10*10), state.getPotentialEnergy(), TOL);
+    
+    // Try changing the global parameters and make sure it's still correct.
+    
+    context.setParameter("scale", 1.5);
+    context.setParameter("c", 1.0);
+    state = context.getState(State::Forces | State::Energy);
+    forces = state.getForces();
+    force = -1.5*3.0*3*6.0*(12*12);
+    ASSERT_EQUAL_VEC(Vec3(-force, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(force, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_TOL(1.5*3.0*(12*12*12), state.getPotentialEnergy(), TOL);
+    
+    // Try changing the per-particle parameters and make sure it's still correct.
+    
+    params[0] = 1.6;
+    params[1] = 2.1;
+    forceField->setParticleParameters(0, params);
+    params[0] = 1.9;
+    params[1] = 2.8;
+    forceField->setParticleParameters(1, params);
+    forceField->updateParametersInContext(context);
+    state = context.getState(State::Forces | State::Energy);
+    forces = state.getForces();
+    force = -1.5*1.6*1.9*3*5.9*(11.8*11.8);
+    ASSERT_EQUAL_VEC(Vec3(-force, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(force, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_TOL(1.5*1.6*1.9*(11.8*11.8*11.8), state.getPotentialEnergy(), TOL);
+}
+
+void testManyParameters() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* forceField = new CustomNonbondedForce("(a1*a2+b1*b2+c1*c2+d1*d2+e1*e2)*r");
+    forceField->addPerParticleParameter("a");
+    forceField->addPerParticleParameter("b");
+    forceField->addPerParticleParameter("c");
+    forceField->addPerParticleParameter("d");
+    forceField->addPerParticleParameter("e");
+    vector<double> params(5);
+    params[0] = 1.0;
+    params[1] = 2.0;
+    params[2] = 3.0;
+    params[3] = 4.0;
+    params[4] = 5.0;
+    forceField->addParticle(params);
+    params[0] = 1.1;
+    params[1] = 1.2;
+    params[2] = 1.3;
+    params[3] = 1.4;
+    params[4] = 1.5;
+    forceField->addParticle(params);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(2, 0, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    vector<Vec3> forces = state.getForces();
+    double force = 1*1.1 + 2*1.2 + 3*1.3 + 4*1.4 + 5*1.5;
+    ASSERT_EQUAL_VEC(Vec3(force, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-force, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_TOL(2*force, state.getPotentialEnergy(), TOL);
+}
+
+void testExclusions() {
+    CudaPlatform platform;
+    System system;
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* nonbonded = new CustomNonbondedForce("a*r; a=a1+a2");
+    nonbonded->addPerParticleParameter("a");
+    vector<double> params(1);
+    vector<Vec3> positions(4);
+    for (int i = 0; i < 4; i++) {
+        system.addParticle(1.0);
+        params[0] = i+1;
+        nonbonded->addParticle(params);
+        positions[i] = Vec3(i, 0, 0);
+    }
+    nonbonded->addExclusion(0, 1);
+    nonbonded->addExclusion(1, 2);
+    nonbonded->addExclusion(2, 3);
+    nonbonded->addExclusion(0, 2);
+    nonbonded->addExclusion(1, 3);
+    system.addForce(nonbonded);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(1+4, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-(1+4), 0, 0), forces[3], TOL);
+    ASSERT_EQUAL_TOL((1+4)*3.0, state.getPotentialEnergy(), TOL);
+}
+
+void testCutoff() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* forceField = new CustomNonbondedForce("r");
+    forceField->addParticle(vector<double>());
+    forceField->addParticle(vector<double>());
+    forceField->addParticle(vector<double>());
+    forceField->setNonbondedMethod(CustomNonbondedForce::CutoffNonPeriodic);
+    forceField->setCutoffDistance(2.5);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(3);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(0, 2, 0);
+    positions[2] = Vec3(0, 3, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(0, 1, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, -1, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(2.0+1.0, state.getPotentialEnergy(), TOL);
+}
+
+void testPeriodic() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* forceField = new CustomNonbondedForce("r");
+    forceField->addParticle(vector<double>());
+    forceField->addParticle(vector<double>());
+    forceField->addParticle(vector<double>());
+    forceField->setNonbondedMethod(CustomNonbondedForce::CutoffPeriodic);
+    forceField->setCutoffDistance(2.0);
+    system.setDefaultPeriodicBoxVectors(Vec3(4, 0, 0), Vec3(0, 4, 0), Vec3(0, 0, 4));
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(3);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(0, 2.1, 0);
+    positions[2] = Vec3(0, 3, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(0, -2, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 2, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(1.9+1+0.9, state.getPotentialEnergy(), TOL);
+}
+
+void testTabulatedFunction() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* forceField = new CustomNonbondedForce("fn(r)+1");
+    forceField->addParticle(vector<double>());
+    forceField->addParticle(vector<double>());
+    vector<double> table;
+    for (int i = 0; i < 21; i++)
+        table.push_back(std::sin(0.25*i));
+    forceField->addFunction("fn", table, 1.0, 6.0);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+    double tol = 0.01;
+    for (int i = 1; i < 30; i++) {
+        double x = (7.0/30.0)*i;
+        positions[1] = Vec3(x, 0, 0);
+        context.setPositions(positions);
+        State state = context.getState(State::Forces | State::Energy);
+        const vector<Vec3>& forces = state.getForces();
+        double force = (x < 1.0 || x > 6.0 ? 0.0 : -std::cos(x-1.0));
+        double energy = (x < 1.0 || x > 6.0 ? 0.0 : std::sin(x-1.0))+1.0;
+        ASSERT_EQUAL_VEC(Vec3(-force, 0, 0), forces[0], 0.1);
+        ASSERT_EQUAL_VEC(Vec3(force, 0, 0), forces[1], 0.1);
+        ASSERT_EQUAL_TOL(energy, state.getPotentialEnergy(), 0.02);
+    }
+    for (int i = 1; i < 20; i++) {
+        double x = 0.25*i+1.0;
+        positions[1] = Vec3(x, 0, 0);
+        context.setPositions(positions);
+        State state = context.getState(State::Energy);
+        double energy = (x < 1.0 || x > 6.0 ? 0.0 : std::sin(x-1.0))+1.0;
+        ASSERT_EQUAL_TOL(energy, state.getPotentialEnergy(), 1e-4);
+    }
+}
+
+void testCoulombLennardJones() {
+    const int numMolecules = 300;
+    const int numParticles = numMolecules*2;
+    const double boxSize = 20.0;
+    CudaPlatform platform;
+
+    // Create two systems: one with a NonbondedForce, and one using a CustomNonbondedForce to implement the same interaction.
+
+    System standardSystem;
+    System customSystem;
+    for (int i = 0; i < numParticles; i++) {
+        standardSystem.addParticle(1.0);
+        customSystem.addParticle(1.0);
+    }
+    NonbondedForce* standardNonbonded = new NonbondedForce();
+    CustomNonbondedForce* customNonbonded = new CustomNonbondedForce("4*eps*((sigma/r)^12-(sigma/r)^6)+138.935456*q/r; q=q1*q2; sigma=0.5*(sigma1+sigma2); eps=sqrt(eps1*eps2)");
+    customNonbonded->addPerParticleParameter("q");
+    customNonbonded->addPerParticleParameter("sigma");
+    customNonbonded->addPerParticleParameter("eps");
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    vector<double> params(3);
+    for (int i = 0; i < numMolecules; i++) {
+        if (i < numMolecules/2) {
+            standardNonbonded->addParticle(1.0, 0.2, 0.1);
+            params[0] = 1.0;
+            params[1] = 0.2;
+            params[2] = 0.1;
+            customNonbonded->addParticle(params);
+            standardNonbonded->addParticle(-1.0, 0.1, 0.1);
+            params[0] = -1.0;
+            params[1] = 0.1;
+            customNonbonded->addParticle(params);
+        }
+        else {
+            standardNonbonded->addParticle(1.0, 0.2, 0.2);
+            params[0] = 1.0;
+            params[1] = 0.2;
+            params[2] = 0.2;
+            customNonbonded->addParticle(params);
+            standardNonbonded->addParticle(-1.0, 0.1, 0.2);
+            params[0] = -1.0;
+            params[1] = 0.1;
+            customNonbonded->addParticle(params);
+        }
+        positions[2*i] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
+        positions[2*i+1] = Vec3(positions[2*i][0]+1.0, positions[2*i][1], positions[2*i][2]);
+        velocities[2*i] = Vec3(genrand_real2(sfmt), genrand_real2(sfmt), genrand_real2(sfmt));
+        velocities[2*i+1] = Vec3(genrand_real2(sfmt), genrand_real2(sfmt), genrand_real2(sfmt));
+        standardNonbonded->addException(2*i, 2*i+1, 0.0, 1.0, 0.0);
+        customNonbonded->addExclusion(2*i, 2*i+1);
+    }
+    standardNonbonded->setNonbondedMethod(NonbondedForce::NoCutoff);
+    customNonbonded->setNonbondedMethod(CustomNonbondedForce::NoCutoff);
+    standardSystem.addForce(standardNonbonded);
+    customSystem.addForce(customNonbonded);
+    VerletIntegrator integrator1(0.01);
+    VerletIntegrator integrator2(0.01);
+    Context context1(standardSystem, integrator1, platform);
+    Context context2(customSystem, integrator2, platform);
+    context1.setPositions(positions);
+    context2.setPositions(positions);
+    context1.setVelocities(velocities);
+    context2.setVelocities(velocities);
+    State state1 = context1.getState(State::Forces | State::Energy);
+    State state2 = context2.getState(State::Forces | State::Energy);
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-4);
+    for (int i = 0; i < numParticles; i++) {
+        ASSERT_EQUAL_VEC(state1.getForces()[i], state2.getForces()[i], 1e-4);
+    }
+}
+
+void testParallelComputation() {
+    CudaPlatform platform;
+    System system;
+    const int numParticles = 200;
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(1.0);
+    CustomNonbondedForce* force = new CustomNonbondedForce("4*eps*((sigma/r)^12-(sigma/r)^6); sigma=0.5; eps=1");
+    vector<double> params;
+    for (int i = 0; i < numParticles; i++)
+        force->addParticle(params);
+    system.addForce(force);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numParticles; i++)
+        positions[i] = Vec3(5*genrand_real2(sfmt), 5*genrand_real2(sfmt), 5*genrand_real2(sfmt));
+    for (int i = 0; i < numParticles; ++i)
+        for (int j = 0; j < i; ++j) {
+            Vec3 delta = positions[i]-positions[j];
+            if (delta.dot(delta) < 0.1)
+                force->addExclusion(i, j);
+        }
+    VerletIntegrator integrator1(0.01);
+    Context context1(system, integrator1, platform);
+    context1.setPositions(positions);
+    State state1 = context1.getState(State::Forces | State::Energy);
+    VerletIntegrator integrator2(0.01);
+    string deviceIndex = platform.getPropertyValue(context1, CudaPlatform::CudaDeviceIndex());
+    map<string, string> props;
+    props[CudaPlatform::CudaDeviceIndex()] = deviceIndex+","+deviceIndex;
+    Context context2(system, integrator2, platform, props);
+    context2.setPositions(positions);
+    State state2 = context2.getState(State::Forces | State::Energy);
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-5);
+    for (int i = 0; i < numParticles; i++)
+        ASSERT_EQUAL_VEC(state1.getForces()[i], state2.getForces()[i], 1e-5);
+}
+
+int main() {
+    try {
+        testSimpleExpression();
+        testParameters();
+        testManyParameters();
+        testExclusions();
+        testCutoff();
+        testPeriodic();
+        testTabulatedFunction();
+        testCoulombLennardJones();
+//        testParallelComputation();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}