Continuing to implement new CUDA platform: CustomGBForce

f6346776 · Peter Eastman · 5feaa943 · f6346776 · f6346776 · f6346776
Commit f6346776 authored Jun 27, 2012 by Peter Eastman
13 changed files
--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -123,6 +123,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    int major, minor;
    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
    gpuArchitecture = intToString(major)+intToString(minor);
+    computeCapability = major+0.1*minor;
    defaultOptimizationOptions = "--use_fast_math";
    unsigned int flags = CU_CTX_MAP_HOST;
    if (useBlockingSync)

--- a/platforms/cuda2/src/CudaContext.h
+++ b/platforms/cuda2/src/CudaContext.h
@@ -105,10 +105,16 @@ public:
    CUdevice getDevice() {
        return device;
    }
+    /**
+     * Get the compute capability of the device associated with this object, encoded as major+0.1*minor (the value stored by the CudaContext constructor).
+     */
+    double getComputeCapability() const {
+        return computeCapability;
+    }
    /**
     * Get the index of the CUdevice associated with this object.
     */
-    int getDeviceIndex() {
+    int getDeviceIndex() const { // const-qualified by this change so it can be called on a const CudaContext
        return deviceIndex;
    }
    /**
@@ -444,7 +450,7 @@ private:
    void validateMolecules();
    static bool hasInitializedCuda;
    const System& system;
-    double time;
+    double time, computeCapability;
    CudaPlatform::PlatformData& platformData;
    int deviceIndex;
    int contextIndex;

--- a/platforms/cuda2/src/CudaKernelFactory.cpp
+++ b/platforms/cuda2/src/CudaKernelFactory.cpp
@@ -98,8 +98,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcGBSAOBCForceKernel::Name())
        return new CudaCalcGBSAOBCForceKernel(name, platform, cu);
-//    if (name == CalcCustomGBForceKernel::Name())
+    if (name == CalcCustomGBForceKernel::Name())
-//        return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
+        return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomExternalForceKernel::Name())
        return new CudaCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomHbondForceKernel::Name())

--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
@@ -2068,940 +2068,835 @@ void CudaCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, c
    cu.invalidateMolecules();
 }
-//class CudaCustomGBForceInfo : public CudaForceInfo {
+class CudaCustomGBForceInfo : public CudaForceInfo { // CudaForceInfo subclass wrapping a CustomGBForce; answers particle/group identity queries from its parameters and exclusions
-//public:
+public:
-//    CudaCustomGBForceInfo(int requiredBuffers, const CustomGBForce& force) : CudaForceInfo(requiredBuffers), force(force) {
+    CudaCustomGBForceInfo(const CustomGBForce& force) : force(force) { // stores a reference to the force; nothing is copied
-//    }
+    }
-//    bool areParticlesIdentical(int particle1, int particle2) {
+    bool areParticlesIdentical(int particle1, int particle2) { // true when the two particles have equal per-particle parameter values
-//        vector<double> params1;
+        vector<double> params1;
-//        vector<double> params2;
+        vector<double> params2;
-//        force.getParticleParameters(particle1, params1);
+        force.getParticleParameters(particle1, params1);
-//        force.getParticleParameters(particle2, params2);
+        force.getParticleParameters(particle2, params2);
-//        for (int i = 0; i < (int) params1.size(); i++) // NOTE(review): indexes params2 by params1's size; presumably both vectors have the same length - confirm
+        for (int i = 0; i < (int) params1.size(); i++) // NOTE(review): indexes params2 by params1's size; presumably both vectors have the same length - confirm
-//            if (params1[i] != params2[i])
+            if (params1[i] != params2[i]) // exact floating-point comparison, no tolerance
-//                return false;
+                return false;
-//        return true;
+        return true;
-//    }
+    }
-//    int getNumParticleGroups() {
+    int getNumParticleGroups() { // one group per exclusion in the force
-//        return force.getNumExclusions();
+        return force.getNumExclusions();
-//    }
+    }
-//    void getParticlesInGroup(int index, vector<int>& particles) {
+    void getParticlesInGroup(int index, vector<int>& particles) { // fills `particles` with the two particles of exclusion `index`
-//        int particle1, particle2;
+        int particle1, particle2;
-//        force.getExclusionParticles(index, particle1, particle2);
+        force.getExclusionParticles(index, particle1, particle2);
-//        particles.resize(2);
+        particles.resize(2); // any previous contents of the output vector are replaced
-//        particles[0] = particle1;
+        particles[0] = particle1;
-//        particles[1] = particle2;
+        particles[1] = particle2;
-//    }
+    }
-//    bool areGroupsIdentical(int group1, int group2) {
+    bool areGroupsIdentical(int group1, int group2) { // both arguments are ignored
-//        return true;
+        return true; // every group is reported identical (presumably because an exclusion is only a particle pair with no parameters - confirm)
-//    }
+    }
-//private:
+private:
-//    const CustomGBForce& force;
+    const CustomGBForce& force; // held by reference, so the referenced force must stay alive while this object is used
-//};
+};
-//
-//CudaCalcCustomGBForceKernel::~CudaCalcCustomGBForceKernel() {
+CudaCalcCustomGBForceKernel::~CudaCalcCustomGBForceKernel() { // deletes the parameter sets and arrays this kernel holds pointers to
-//    cu.setAsCurrent();
+    cu.setAsCurrent(); // make this context current before deleting (presumably needed to free its CUDA resources - confirm)
-//    if (params != NULL)
+    if (params != NULL) // NOTE(review): the NULL guards assume members start as NULL before initialize() runs - confirm in the constructor
-//        delete params;
+        delete params;
-//    if (computedValues != NULL)
+    if (computedValues != NULL)
-//        delete computedValues;
+        delete computedValues;
-//    if (energyDerivs != NULL)
+    if (energyDerivs != NULL)
-//        delete energyDerivs;
+        delete energyDerivs;
-//    if (longEnergyDerivs != NULL)
+    if (longEnergyDerivs != NULL)
-//        delete longEnergyDerivs;
+        delete longEnergyDerivs;
-//    if (globals != NULL)
+    if (globals != NULL) // initialize() only creates globals when the force has global parameters
-//        delete globals;
+        delete globals;
-//    if (valueBuffers != NULL)
+    if (valueBuffers != NULL)
-//        delete valueBuffers;
+        delete valueBuffers;
-//    if (longValueBuffers != NULL)
+    if (tabulatedFunctionParams != NULL) // initialize() only creates this when the force has tabulated functions
-//        delete longValueBuffers;
+        delete tabulatedFunctionParams;
-//    if (tabulatedFunctionParams != NULL)
+    for (int i = 0; i < (int) tabulatedFunctions.size(); i++) // one array per tabulated function, pushed back in initialize()
-//        delete tabulatedFunctionParams;
+        delete tabulatedFunctions[i];
-//    for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
+}
-//        delete tabulatedFunctions[i];
-//}
+void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomGBForce& force) {
-//
+    cu.setAsCurrent();
-//void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomGBForce& force) {
+    if (cu.getPlatformData().contexts.size() > 1)
-//    cu.setAsCurrent();
+        throw OpenMMException("CustomGBForce does not support using multiple CUDA devices");
-//    if (cu.getPlatformData().contexts.size() > 1)
+    bool useExclusionsForValue = false;
-//        throw OpenMMException("CustomGBForce does not support using multiple CUDA devices");
+    numComputedValues = force.getNumComputedValues();
-//    bool useExclusionsForValue = false;
+    vector<string> computedValueNames(force.getNumComputedValues());
-//    numComputedValues = force.getNumComputedValues();
+    vector<string> computedValueExpressions(force.getNumComputedValues());
-//    vector<string> computedValueNames(force.getNumComputedValues());
+    if (force.getNumComputedValues() > 0) {
-//    vector<string> computedValueExpressions(force.getNumComputedValues());
+        CustomGBForce::ComputationType type;
-//    if (force.getNumComputedValues() > 0) {
+        force.getComputedValueParameters(0, computedValueNames[0], computedValueExpressions[0], type);
-//        CustomGBForce::ComputationType type;
+        if (type == CustomGBForce::SingleParticle)
-//        force.getComputedValueParameters(0, computedValueNames[0], computedValueExpressions[0], type);
+            throw OpenMMException("CudaPlatform requires that the first computed value for a CustomGBForce be of type ParticlePair or ParticlePairNoExclusions.");
-//        if (type == CustomGBForce::SingleParticle)
+        useExclusionsForValue = (type == CustomGBForce::ParticlePair);
-//            throw OpenMMException("CudaPlatform requires that the first computed value for a CustomGBForce be of type ParticlePair or ParticlePairNoExclusions.");
+        for (int i = 1; i < force.getNumComputedValues(); i++) {
-//        useExclusionsForValue = (type == CustomGBForce::ParticlePair);
+            force.getComputedValueParameters(i, computedValueNames[i], computedValueExpressions[i], type);
-//        for (int i = 1; i < force.getNumComputedValues(); i++) {
+            if (type != CustomGBForce::SingleParticle)
-//            force.getComputedValueParameters(i, computedValueNames[i], computedValueExpressions[i], type);
+                throw OpenMMException("CudaPlatform requires that a CustomGBForce only have one computed value of type ParticlePair or ParticlePairNoExclusions.");
-//            if (type != CustomGBForce::SingleParticle)
+        }
-//                throw OpenMMException("CudaPlatform requires that a CustomGBForce only have one computed value of type ParticlePair or ParticlePairNoExclusions.");
+    }
-//        }
+    int forceIndex;
-//    }
+    for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
-//    int forceIndex;
+        ;
-//    for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
+    string prefix = "custom"+cu.intToString(forceIndex)+"_";
-//        ;
-//    string prefix = "custom"+cu.intToString(forceIndex)+"_";
+    // Record parameters and exclusions.
-//
-//    // Record parameters and exclusions.
+    int numParticles = force.getNumParticles();
-//
+    params = new CudaParameterSet(cu, force.getNumPerParticleParameters(), numParticles, "customGBParameters", true);
-//    int numParticles = force.getNumParticles();
+    computedValues = new CudaParameterSet(cu, force.getNumComputedValues(), numParticles, "customGBComputedValues", true);
-//    params = new CudaParameterSet(cu, force.getNumPerParticleParameters(), numParticles, "customGBParameters", true);
+    if (force.getNumGlobalParameters() > 0)
-//    computedValues = new CudaParameterSet(cu, force.getNumComputedValues(), numParticles, "customGBComputedValues", true);
+        globals = CudaArray::create<float>(cu, force.getNumGlobalParameters(), "customGBGlobals");
-//    if (force.getNumGlobalParameters() > 0)
+    vector<vector<float> > paramVector(numParticles);
-//        globals = new CudaArray<cl_float>(cu, force.getNumGlobalParameters(), "customGBGlobals", false, CL_MEM_READ_ONLY);
+    vector<vector<int> > exclusionList(numParticles);
-//    vector<vector<cl_float> > paramVector(numParticles);
+    for (int i = 0; i < numParticles; i++) {
-//    vector<vector<int> > exclusionList(numParticles);
+        vector<double> parameters;
-//    for (int i = 0; i < numParticles; i++) {
+        force.getParticleParameters(i, parameters);
-//        vector<double> parameters;
+        paramVector[i].resize(parameters.size());
-//        force.getParticleParameters(i, parameters);
+        for (int j = 0; j < (int) parameters.size(); j++)
-//        paramVector[i].resize(parameters.size());
+            paramVector[i][j] = (float) parameters[j];
-//        for (int j = 0; j < (int) parameters.size(); j++)
+        exclusionList[i].push_back(i);
-//            paramVector[i][j] = (cl_float) parameters[j];
+    }
-//        exclusionList[i].push_back(i);
+    for (int i = 0; i < force.getNumExclusions(); i++) {
-//    }
+        int particle1, particle2;
-//    for (int i = 0; i < force.getNumExclusions(); i++) {
+        force.getExclusionParticles(i, particle1, particle2);
-//        int particle1, particle2;
+        exclusionList[particle1].push_back(particle2);
-//        force.getExclusionParticles(i, particle1, particle2);
+        exclusionList[particle2].push_back(particle1);
-//        exclusionList[particle1].push_back(particle2);
+    }
-//        exclusionList[particle2].push_back(particle1);
+    params->setParameterValues(paramVector);
-//    }
-//    params->setParameterValues(paramVector);
+    // Record the tabulated functions.
-//
-//    // Record the tabulated functions.
+    CudaExpressionUtilities::FunctionPlaceholder fp;
-//
+    map<string, Lepton::CustomFunction*> functions;
-//    CudaExpressionUtilities::FunctionPlaceholder fp;
+    vector<pair<string, string> > functionDefinitions;
-//    map<string, Lepton::CustomFunction*> functions;
+    vector<float4> tabulatedFunctionParamsVec(force.getNumFunctions());
-//    vector<pair<string, string> > functionDefinitions;
+    stringstream tableArgs;
-//    vector<mm_float4> tabulatedFunctionParamsVec(force.getNumFunctions());
+    for (int i = 0; i < force.getNumFunctions(); i++) {
-//    stringstream tableArgs;
+        string name;
-//    for (int i = 0; i < force.getNumFunctions(); i++) {
+        vector<double> values;
-//        string name;
+        double min, max;
-//        vector<double> values;
+        force.getFunctionParameters(i, name, values, min, max);
-//        double min, max;
+        string arrayName = prefix+"table"+cu.intToString(i);
-//        force.getFunctionParameters(i, name, values, min, max);
+        functionDefinitions.push_back(make_pair(name, arrayName));
-//        string arrayName = prefix+"table"+cu.intToString(i);
+        functions[name] = &fp;
-//        functionDefinitions.push_back(make_pair(name, arrayName));
+        tabulatedFunctionParamsVec[i] = make_float4((float) min, (float) max, (float) ((values.size()-1)/(max-min)), (float) values.size()-2);
-//        functions[name] = &fp;
+        vector<float4> f = cu.getExpressionUtilities().computeFunctionCoefficients(values, min, max);
-//        tabulatedFunctionParamsVec[i] = mm_float4((float) min, (float) max, (float) ((values.size()-1)/(max-min)), (float) values.size()-2);
+        tabulatedFunctions.push_back(CudaArray::create<float4>(cu, values.size()-1, "TabulatedFunction"));
-//        vector<mm_float4> f = cu.getExpressionUtilities().computeFunctionCoefficients(values, min, max);
+        tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
-//        tabulatedFunctions.push_back(new CudaArray<mm_float4>(cu, values.size()-1, "TabulatedFunction"));
+        cu.getNonbondedUtilities().addArgument(CudaNonbondedUtilities::ParameterInfo(arrayName, "float", 4, sizeof(float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDevicePointer()));
-//        tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
+        tableArgs << ", const float4* __restrict__ " << arrayName;
-//        cu.getNonbondedUtilities().addArgument(CudaNonbondedUtilities::ParameterInfo(arrayName, "float", 4, sizeof(cl_float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDevicePointer()));
+    }
-//        tableArgs << ", __global const float4* restrict " << arrayName;
+    if (force.getNumFunctions() > 0) {
-//    }
+        tabulatedFunctionParams = CudaArray::create<float4>(cu, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters");
-//    if (force.getNumFunctions() > 0) {
+        tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
-//        tabulatedFunctionParams = new CudaArray<mm_float4>(cu, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY);
+        cu.getNonbondedUtilities().addArgument(CudaNonbondedUtilities::ParameterInfo(prefix+"functionParams", "float", 4, sizeof(float4), tabulatedFunctionParams->getDevicePointer()));
-//        tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
+        tableArgs << ", const float4* " << prefix << "functionParams";
-//        cu.getNonbondedUtilities().addArgument(CudaNonbondedUtilities::ParameterInfo(prefix+"functionParams", "float", 4, sizeof(cl_float4), tabulatedFunctionParams->getDevicePointer()));
+    }
-//        tableArgs << ", __global const float4* " << prefix << "functionParams";
-//    }
+    // Record the global parameters.
-//
-//    // Record the global parameters.
+    globalParamNames.resize(force.getNumGlobalParameters());
-//
+    globalParamValues.resize(force.getNumGlobalParameters());
-//    globalParamNames.resize(force.getNumGlobalParameters());
+    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
-//    globalParamValues.resize(force.getNumGlobalParameters());
+        globalParamNames[i] = force.getGlobalParameterName(i);
-//    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
+        globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
-//        globalParamNames[i] = force.getGlobalParameterName(i);
+    }
-//        globalParamValues[i] = (cl_float) force.getGlobalParameterDefaultValue(i);
+    if (globals != NULL)
-//    }
+        globals->upload(globalParamValues);
-//    if (globals != NULL)
-//        globals->upload(globalParamValues);
+    // Record derivatives of expressions needed for the chain rule terms.
-//
-//    // Record derivatives of expressions needed for the chain rule terms.
+    vector<vector<Lepton::ParsedExpression> > valueGradientExpressions(force.getNumComputedValues());
-//
+    vector<vector<Lepton::ParsedExpression> > valueDerivExpressions(force.getNumComputedValues());
-//    vector<vector<Lepton::ParsedExpression> > valueGradientExpressions(force.getNumComputedValues());
+    needParameterGradient = false;
-//    vector<vector<Lepton::ParsedExpression> > valueDerivExpressions(force.getNumComputedValues());
+    for (int i = 1; i < force.getNumComputedValues(); i++) {
-//    needParameterGradient = false;
+        Lepton::ParsedExpression ex = Lepton::Parser::parse(computedValueExpressions[i], functions).optimize();
-//    for (int i = 1; i < force.getNumComputedValues(); i++) {
+        valueGradientExpressions[i].push_back(ex.differentiate("x").optimize());
-//        Lepton::ParsedExpression ex = Lepton::Parser::parse(computedValueExpressions[i], functions).optimize();
+        valueGradientExpressions[i].push_back(ex.differentiate("y").optimize());
-//        valueGradientExpressions[i].push_back(ex.differentiate("x").optimize());
+        valueGradientExpressions[i].push_back(ex.differentiate("z").optimize());
-//        valueGradientExpressions[i].push_back(ex.differentiate("y").optimize());
+        if (!isZeroExpression(valueGradientExpressions[i][0]) || !isZeroExpression(valueGradientExpressions[i][1]) || !isZeroExpression(valueGradientExpressions[i][2]))
-//        valueGradientExpressions[i].push_back(ex.differentiate("z").optimize());
+            needParameterGradient = true;
-//        if (!isZeroExpression(valueGradientExpressions[i][0]) || !isZeroExpression(valueGradientExpressions[i][1]) || !isZeroExpression(valueGradientExpressions[i][2]))
+         for (int j = 0; j < i; j++)
-//            needParameterGradient = true;
+            valueDerivExpressions[i].push_back(ex.differentiate(computedValueNames[j]).optimize());
-//         for (int j = 0; j < i; j++)
+    }
-//            valueDerivExpressions[i].push_back(ex.differentiate(computedValueNames[j]).optimize());
+    vector<vector<Lepton::ParsedExpression> > energyDerivExpressions(force.getNumEnergyTerms());
-//    }
+    vector<bool> needChainForValue(force.getNumComputedValues(), false);
-//    vector<vector<Lepton::ParsedExpression> > energyDerivExpressions(force.getNumEnergyTerms());
+    for (int i = 0; i < force.getNumEnergyTerms(); i++) {
-//    vector<bool> needChainForValue(force.getNumComputedValues(), false);
+        string expression;
-//    for (int i = 0; i < force.getNumEnergyTerms(); i++) {
+        CustomGBForce::ComputationType type;
-//        string expression;
+        force.getEnergyTermParameters(i, expression, type);
-//        CustomGBForce::ComputationType type;
+        Lepton::ParsedExpression ex = Lepton::Parser::parse(expression, functions).optimize();
-//        force.getEnergyTermParameters(i, expression, type);
+        for (int j = 0; j < force.getNumComputedValues(); j++) {
-//        Lepton::ParsedExpression ex = Lepton::Parser::parse(expression, functions).optimize();
+            if (type == CustomGBForce::SingleParticle) {
-//        for (int j = 0; j < force.getNumComputedValues(); j++) {
+                energyDerivExpressions[i].push_back(ex.differentiate(computedValueNames[j]).optimize());
-//            if (type == CustomGBForce::SingleParticle) {
+                if (!isZeroExpression(energyDerivExpressions[i].back()))
-//                energyDerivExpressions[i].push_back(ex.differentiate(computedValueNames[j]).optimize());
+                    needChainForValue[j] = true;
-//                if (!isZeroExpression(energyDerivExpressions[i].back()))
+            }
-//                    needChainForValue[j] = true;
+            else {
-//            }
+                energyDerivExpressions[i].push_back(ex.differentiate(computedValueNames[j]+"1").optimize());
-//            else {
+                if (!isZeroExpression(energyDerivExpressions[i].back()))
-//                energyDerivExpressions[i].push_back(ex.differentiate(computedValueNames[j]+"1").optimize());
+                    needChainForValue[j] = true;
-//                if (!isZeroExpression(energyDerivExpressions[i].back()))
+                energyDerivExpressions[i].push_back(ex.differentiate(computedValueNames[j]+"2").optimize());
-//                    needChainForValue[j] = true;
+                if (!isZeroExpression(energyDerivExpressions[i].back()))
-//                energyDerivExpressions[i].push_back(ex.differentiate(computedValueNames[j]+"2").optimize());
+                    needChainForValue[j] = true;
-//                if (!isZeroExpression(energyDerivExpressions[i].back()))
+            }
-//                    needChainForValue[j] = true;
+        }
-//            }
+    }
-//        }
+    longEnergyDerivs = CudaArray::create<long long>(cu, force.getNumComputedValues()*cu.getPaddedNumAtoms(), "customGBLongEnergyDerivatives");
-//    }
+    energyDerivs = new CudaParameterSet(cu, force.getNumComputedValues(), cu.getPaddedNumAtoms(), "customGBEnergyDerivatives", true);
-//    bool deviceIsCpu = (cu.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
-//    bool useLong = (cu.getSupports64BitGlobalAtomics() && !deviceIsCpu);
+    // Create the kernels.
-//    if (useLong) {
-//        longEnergyDerivs = new CudaArray<cl_long>(cu, force.getNumComputedValues()*cu.getPaddedNumAtoms(), "customGBLongEnergyDerivatives");
+    bool useCutoff = (force.getNonbondedMethod() != CustomGBForce::NoCutoff);
-//        energyDerivs = new CudaParameterSet(cu, force.getNumComputedValues(), cu.getPaddedNumAtoms(), "customGBEnergyDerivatives", true);
+    bool usePeriodic = (force.getNonbondedMethod() != CustomGBForce::NoCutoff && force.getNonbondedMethod() != CustomGBForce::CutoffNonPeriodic);
-//    }
+    {
-//    else
+        // Create the N2 value kernel.
-//        energyDerivs = new CudaParameterSet(cu, force.getNumComputedValues(), cu.getPaddedNumAtoms()*cu.getNonbondedUtilities().getNumForceBuffers(), "customGBEnergyDerivatives", true);
-// 
+        vector<pair<ExpressionTreeNode, string> > variables;
-//    // Create the kernels.
+        map<string, string> rename;
-//
+        ExpressionTreeNode rnode(new Operation::Variable("r"));
-//    bool useCutoff = (force.getNonbondedMethod() != CustomGBForce::NoCutoff);
+        variables.push_back(make_pair(rnode, "r"));
-//    bool usePeriodic = (force.getNonbondedMethod() != CustomGBForce::NoCutoff && force.getNonbondedMethod() != CustomGBForce::CutoffNonPeriodic);
+        variables.push_back(make_pair(ExpressionTreeNode(new Operation::Square(), rnode), "r2"));
-//    {
+        variables.push_back(make_pair(ExpressionTreeNode(new Operation::Reciprocal(), rnode), "invR"));
-//        // Create the N2 value kernel.
+        for (int i = 0; i < force.getNumPerParticleParameters(); i++) {
-//
+            const string& name = force.getPerParticleParameterName(i);
-//        vector<pair<ExpressionTreeNode, string> > variables;
+            variables.push_back(makeVariable(name+"1", "params"+params->getParameterSuffix(i, "1")));
-//        map<string, string> rename;
+            variables.push_back(makeVariable(name+"2", "params"+params->getParameterSuffix(i, "2")));
-//        ExpressionTreeNode rnode(new Operation::Variable("r"));
+            rename[name+"1"] = name+"2";
-//        variables.push_back(make_pair(rnode, "r"));
+            rename[name+"2"] = name+"1";
-//        variables.push_back(make_pair(ExpressionTreeNode(new Operation::Square(), rnode), "r2"));
+        }
-//        variables.push_back(make_pair(ExpressionTreeNode(new Operation::Reciprocal(), rnode), "invR"));
+        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
-//        for (int i = 0; i < force.getNumPerParticleParameters(); i++) {
+            const string& name = force.getGlobalParameterName(i);
-//            const string& name = force.getPerParticleParameterName(i);
+            string value = "globals["+cu.intToString(i)+"]";
-//            variables.push_back(makeVariable(name+"1", "params"+params->getParameterSuffix(i, "1")));
+            variables.push_back(makeVariable(name, value));
-//            variables.push_back(makeVariable(name+"2", "params"+params->getParameterSuffix(i, "2")));
+        }
-//            rename[name+"1"] = name+"2";
+        map<string, Lepton::ParsedExpression> n2ValueExpressions;
-//            rename[name+"2"] = name+"1";
+        stringstream n2ValueSource;
-//        }
+        Lepton::ParsedExpression ex = Lepton::Parser::parse(computedValueExpressions[0], functions).optimize();
-//        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
+        n2ValueExpressions["tempValue1 = "] = ex;
-//            const string& name = force.getGlobalParameterName(i);
+        n2ValueExpressions["tempValue2 = "] = ex.renameVariables(rename);
-//            string value = "globals["+cu.intToString(i)+"]";
+        n2ValueSource << cu.getExpressionUtilities().createExpressions(n2ValueExpressions, variables, functionDefinitions, "temp", prefix+"functionParams");
-//            variables.push_back(makeVariable(name, value));
+        map<string, string> replacements;
-//        }
+        string n2ValueStr = n2ValueSource.str();
-//        map<string, Lepton::ParsedExpression> n2ValueExpressions;
+        replacements["COMPUTE_VALUE"] = n2ValueStr;
-//        stringstream n2ValueSource;
+        stringstream extraArgs, atomParams, loadLocal1, loadLocal2, load1, load2;
-//        Lepton::ParsedExpression ex = Lepton::Parser::parse(computedValueExpressions[0], functions).optimize();
+        if (force.getNumGlobalParameters() > 0)
-//        n2ValueExpressions["tempValue1 = "] = ex;
+            extraArgs << ", const float* globals";
-//        n2ValueExpressions["tempValue2 = "] = ex.renameVariables(rename);
+        pairValueUsesParam.resize(params->getBuffers().size(), false);
-//        n2ValueSource << cu.getExpressionUtilities().createExpressions(n2ValueExpressions, variables, functionDefinitions, "temp", prefix+"functionParams");
+        int atomParamSize = 6;
-//        map<string, string> replacements;
+        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-//        string n2ValueStr = n2ValueSource.str();
+            CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
-//        replacements["COMPUTE_VALUE"] = n2ValueStr;
+            string paramName = "params"+cu.intToString(i+1);
-//        stringstream extraArgs, loadLocal1, loadLocal2, load1, load2;
+            if (n2ValueStr.find(paramName+"1") != n2ValueStr.npos || n2ValueStr.find(paramName+"2") != n2ValueStr.npos) {
-//        if (force.getNumGlobalParameters() > 0)
+                extraArgs << ", const " << buffer.getType() << "* __restrict__ global_" << paramName;
-//            extraArgs << ", __global const float* globals";
+                atomParams << buffer.getType() << " " << paramName << ";\n";
-//        pairValueUsesParam.resize(params->getBuffers().size(), false);
+                loadLocal1 << "localData[localAtomIndex]." << paramName << " = " << paramName << "1;\n";
-//        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
+                loadLocal2 << "localData[localAtomIndex]." << paramName << " = global_" << paramName << "[j];\n";
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+                load1 << buffer.getType() << " " << paramName << "1 = global_" << paramName << "[atom1];\n";
-//            string paramName = "params"+cu.intToString(i+1);
+                load2 << buffer.getType() << " " << paramName << "2 = localData[atom2]." << paramName << ";\n";
-//            if (n2ValueStr.find(paramName+"1") != n2ValueStr.npos || n2ValueStr.find(paramName+"2") != n2ValueStr.npos) {
+                pairValueUsesParam[i] = true;
-//                extraArgs << ", __global const " << buffer.getType() << "* restrict global_" << paramName << ", __local " << buffer.getType() << "* restrict local_" << paramName;
+                atomParamSize += buffer.getNumComponents();
-//                loadLocal1 << "local_" << paramName << "[localAtomIndex] = " << paramName << "1;\n";
+            }
-//                loadLocal2 << "local_" << paramName << "[localAtomIndex] = global_" << paramName << "[j];\n";
+        }
-//                load1 << buffer.getType() << " " << paramName << "1 = global_" << paramName << "[atom1];\n";
+        replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
-//                load2 << buffer.getType() << " " << paramName << "2 = local_" << paramName << "[atom2];\n";
+        replacements["ATOM_PARAMETER_DATA"] = atomParams.str();
-//                pairValueUsesParam[i] = true;
+        replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
-//            }
+        replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
-//        }
+        replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
-//        replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
+        replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
-//        replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
+        map<string, string> defines;
-//        replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
+        if (useCutoff)
-//        replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
+            defines["USE_CUTOFF"] = "1";
-//        replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
+        if (usePeriodic)
-//        map<string, string> defines;
+            defines["USE_PERIODIC"] = "1";
-//        if (cu.getNonbondedUtilities().getForceBufferPerAtomBlock())
+        if (useExclusionsForValue)
-//            defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
+            defines["USE_EXCLUSIONS"] = "1";
-//        if (useCutoff)
+        if (atomParamSize%2 == 0 && !cu.getUseDoublePrecision())
-//            defines["USE_CUTOFF"] = "1";
+            defines["NEED_PADDING"] = "1";
-//        if (usePeriodic)
+        defines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
-//            defines["USE_PERIODIC"] = "1";
+        defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
-//        if (useExclusionsForValue)
+        defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
-//            defines["USE_EXCLUSIONS"] = "1";
+        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//        if (cu.getSIMDWidth() == 32)
+        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//            defines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
+        defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
-//        defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
+        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customGBValueN2, replacements), defines);
-//        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+        pairValueKernel = cu.getKernel(module, "computeN2Value");
-//        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+        if (useExclusionsForValue)
-//        defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
+            cu.getNonbondedUtilities().requestExclusions(exclusionList);
-//        string file;
+    }
-//        if (deviceIsCpu)
+    {
-//            file = CudaKernelSources::customGBValueN2_cpu;
+        // Create the kernel to reduce the N2 value and calculate other values.
-//        else if (cu.getSIMDWidth() == 32)
-//            file = CudaKernelSources::customGBValueN2_nvidia;
+        stringstream reductionSource, extraArgs;
-//        else
+        if (force.getNumGlobalParameters() > 0)
-//            file = CudaKernelSources::customGBValueN2_default;
+            extraArgs << ", const float* globals";
-//        CUmodule module = cu.createModule(cu.replaceStrings(file, replacements), defines);
+        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-//        pairValueKernel = cu.getKernel(module, "computeN2Value");
+            CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
-//        if (useExclusionsForValue)
+            string paramName = "params"+cu.intToString(i+1);
-//            cu.getNonbondedUtilities().requestExclusions(exclusionList);
+            extraArgs << ", const " << buffer.getType() << "* __restrict__ " << paramName;
-//    }
+        }
-//    {
+        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
-//        // Create the kernel to reduce the N2 value and calculate other values.
+            CudaNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
-//
+            string valueName = "values"+cu.intToString(i+1);
-//        stringstream reductionSource, extraArgs;
+            extraArgs << ", " << buffer.getType() << "* __restrict__ global_" << valueName;
-//        if (force.getNumGlobalParameters() > 0)
+            reductionSource << buffer.getType() << " local_" << valueName << ";\n";
-//            extraArgs << ", __global const float* globals";
+        }
-//        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
+        reductionSource << "local_values" << computedValues->getParameterSuffix(0) << " = sum;\n";
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+        map<string, string> variables;
-//            string paramName = "params"+cu.intToString(i+1);
+        variables["x"] = "pos.x";
-//            extraArgs << ", __global const " << buffer.getType() << "* restrict " << paramName;
+        variables["y"] = "pos.y";
-//        }
+        variables["z"] = "pos.z";
-//        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
+        for (int i = 0; i < force.getNumPerParticleParameters(); i++)
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
+            variables[force.getPerParticleParameterName(i)] = "params"+params->getParameterSuffix(i, "[index]");
-//            string valueName = "values"+cu.intToString(i+1);
+        for (int i = 0; i < force.getNumGlobalParameters(); i++)
-//            extraArgs << ", __global " << buffer.getType() << "* restrict global_" << valueName;
+            variables[force.getGlobalParameterName(i)] = "globals["+cu.intToString(i)+"]";
-//            reductionSource << buffer.getType() << " local_" << valueName << ";\n";
+        for (int i = 1; i < force.getNumComputedValues(); i++) {
-//        }
+            variables[computedValueNames[i-1]] = "local_values"+computedValues->getParameterSuffix(i-1);
-//        reductionSource << "local_values" << computedValues->getParameterSuffix(0) << " = sum;\n";
+            map<string, Lepton::ParsedExpression> valueExpressions;
-//        map<string, string> variables;
+            valueExpressions["local_values"+computedValues->getParameterSuffix(i)+" = "] = Lepton::Parser::parse(computedValueExpressions[i], functions).optimize();
-//        variables["x"] = "pos.x";
+            reductionSource << cu.getExpressionUtilities().createExpressions(valueExpressions, variables, functionDefinitions, "value"+cu.intToString(i)+"_temp", prefix+"functionParams");
-//        variables["y"] = "pos.y";
+        }
-//        variables["z"] = "pos.z";
+        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
-//        for (int i = 0; i < force.getNumPerParticleParameters(); i++)
+            string valueName = "values"+cu.intToString(i+1);
-//            variables[force.getPerParticleParameterName(i)] = "params"+params->getParameterSuffix(i, "[index]");
+            reductionSource << "global_" << valueName << "[index] = local_" << valueName << ";\n";
-//        for (int i = 0; i < force.getNumGlobalParameters(); i++)
+        }
-//            variables[force.getGlobalParameterName(i)] = "globals["+cu.intToString(i)+"]";
+        map<string, string> replacements;
-//        for (int i = 1; i < force.getNumComputedValues(); i++) {
+        replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
-//            variables[computedValueNames[i-1]] = "local_values"+computedValues->getParameterSuffix(i-1);
+        replacements["COMPUTE_VALUES"] = reductionSource.str();
-//            map<string, Lepton::ParsedExpression> valueExpressions;
+        map<string, string> defines;
-//            valueExpressions["local_values"+computedValues->getParameterSuffix(i)+" = "] = Lepton::Parser::parse(computedValueExpressions[i], functions).optimize();
+        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//            reductionSource << cu.getExpressionUtilities().createExpressions(valueExpressions, variables, functionDefinitions, "value"+cu.intToString(i)+"_temp", prefix+"functionParams");
+        CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customGBValuePerParticle, replacements), defines);
-//        }
+        perParticleValueKernel = cu.getKernel(module, "computePerParticleValues");
-//        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
+    }
-//            string valueName = "values"+cu.intToString(i+1);
+    {
-//            reductionSource << "global_" << valueName << "[index] = local_" << valueName << ";\n";
+        // Create the N2 energy kernel.
-//        }
-//        map<string, string> replacements;
+        vector<pair<ExpressionTreeNode, string> > variables;
-//        replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
+        ExpressionTreeNode rnode(new Operation::Variable("r"));
-//        replacements["COMPUTE_VALUES"] = reductionSource.str();
+        variables.push_back(make_pair(rnode, "r"));
-//        map<string, string> defines;
+        variables.push_back(make_pair(ExpressionTreeNode(new Operation::Square(), rnode), "r2"));
-//        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+        variables.push_back(make_pair(ExpressionTreeNode(new Operation::Reciprocal(), rnode), "invR"));
-//        CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customGBValuePerParticle, replacements), defines);
+        for (int i = 0; i < force.getNumPerParticleParameters(); i++) {
-//        perParticleValueKernel = cu.getKernel(module, "computePerParticleValues");
+            const string& name = force.getPerParticleParameterName(i);
-//    }
+            variables.push_back(makeVariable(name+"1", "params"+params->getParameterSuffix(i, "1")));
-//    {
+            variables.push_back(makeVariable(name+"2", "params"+params->getParameterSuffix(i, "2")));
-//        // Create the N2 energy kernel.
+        }
-//
+        for (int i = 0; i < force.getNumComputedValues(); i++) {
-//        vector<pair<ExpressionTreeNode, string> > variables;
+            variables.push_back(makeVariable(computedValueNames[i]+"1", "values"+computedValues->getParameterSuffix(i, "1")));
-//        ExpressionTreeNode rnode(new Operation::Variable("r"));
+            variables.push_back(makeVariable(computedValueNames[i]+"2", "values"+computedValues->getParameterSuffix(i, "2")));
-//        variables.push_back(make_pair(rnode, "r"));
+        }
-//        variables.push_back(make_pair(ExpressionTreeNode(new Operation::Square(), rnode), "r2"));
+        for (int i = 0; i < force.getNumGlobalParameters(); i++)
-//        variables.push_back(make_pair(ExpressionTreeNode(new Operation::Reciprocal(), rnode), "invR"));
+            variables.push_back(makeVariable(force.getGlobalParameterName(i), "globals["+cu.intToString(i)+"]"));
-//        for (int i = 0; i < force.getNumPerParticleParameters(); i++) {
+        stringstream n2EnergySource;
-//            const string& name = force.getPerParticleParameterName(i);
+        bool anyExclusions = (force.getNumExclusions() > 0);
-//            variables.push_back(makeVariable(name+"1", "params"+params->getParameterSuffix(i, "1")));
+        for (int i = 0; i < force.getNumEnergyTerms(); i++) {
-//            variables.push_back(makeVariable(name+"2", "params"+params->getParameterSuffix(i, "2")));
+            string expression;
-//        }
+            CustomGBForce::ComputationType type;
-//        for (int i = 0; i < force.getNumComputedValues(); i++) {
+            force.getEnergyTermParameters(i, expression, type);
-//            variables.push_back(makeVariable(computedValueNames[i]+"1", "values"+computedValues->getParameterSuffix(i, "1")));
+            if (type == CustomGBForce::SingleParticle)
-//            variables.push_back(makeVariable(computedValueNames[i]+"2", "values"+computedValues->getParameterSuffix(i, "2")));
+                continue;
-//        }
+            bool exclude = (anyExclusions && type == CustomGBForce::ParticlePair);
-//        for (int i = 0; i < force.getNumGlobalParameters(); i++)
+            map<string, Lepton::ParsedExpression> n2EnergyExpressions;
-//            variables.push_back(makeVariable(force.getGlobalParameterName(i), "globals["+cu.intToString(i)+"]"));
+            n2EnergyExpressions["tempEnergy += "] = Lepton::Parser::parse(expression, functions).optimize();
-//        stringstream n2EnergySource;
+            n2EnergyExpressions["dEdR += "] = Lepton::Parser::parse(expression, functions).differentiate("r").optimize();
-//        bool anyExclusions = (force.getNumExclusions() > 0);
+            for (int j = 0; j < force.getNumComputedValues(); j++) {
-//        for (int i = 0; i < force.getNumEnergyTerms(); i++) {
+                if (needChainForValue[j]) {
-//            string expression;
+                    string index = cu.intToString(j+1);
-//            CustomGBForce::ComputationType type;
+                    n2EnergyExpressions["/*"+cu.intToString(i+1)+"*/ deriv"+index+"_1 += "] = energyDerivExpressions[i][2*j];
-//            force.getEnergyTermParameters(i, expression, type);
+                    n2EnergyExpressions["/*"+cu.intToString(i+1)+"*/ deriv"+index+"_2 += "] = energyDerivExpressions[i][2*j+1];
-//            if (type == CustomGBForce::SingleParticle)
+                }
-//                continue;
+            }
-//            bool exclude = (anyExclusions && type == CustomGBForce::ParticlePair);
+            if (exclude)
-//            map<string, Lepton::ParsedExpression> n2EnergyExpressions;
+                n2EnergySource << "if (!isExcluded) {\n";
-//            n2EnergyExpressions["tempEnergy += "] = Lepton::Parser::parse(expression, functions).optimize();
+            n2EnergySource << cu.getExpressionUtilities().createExpressions(n2EnergyExpressions, variables, functionDefinitions, "temp", prefix+"functionParams");
-//            n2EnergyExpressions["dEdR += "] = Lepton::Parser::parse(expression, functions).differentiate("r").optimize();
+            if (exclude)
-//            if (useLong) {
+                n2EnergySource << "}\n";
-//                for (int j = 0; j < force.getNumComputedValues(); j++) {
+        }
-//                    if (needChainForValue[j]) {
+        map<string, string> replacements;
-//                        string index = cu.intToString(j+1);
+        string n2EnergyStr = n2EnergySource.str();
-//                        n2EnergyExpressions["/*"+cu.intToString(i+1)+"*/ deriv"+index+"_1 += "] = energyDerivExpressions[i][2*j];
+        replacements["COMPUTE_INTERACTION"] = n2EnergyStr;
-//                        n2EnergyExpressions["/*"+cu.intToString(i+1)+"*/ deriv"+index+"_2 += "] = energyDerivExpressions[i][2*j+1];
+        stringstream extraArgs, atomParams, loadLocal1, loadLocal2, clearLocal, load1, load2, declare1, recordDeriv, storeDerivs1, storeDerivs2, declareTemps, setTemps;
-//                    }
+        if (force.getNumGlobalParameters() > 0)
-//                }
+            extraArgs << ", const float* globals";
-//            }
+        pairEnergyUsesParam.resize(params->getBuffers().size(), false);
-//            else {
+        int atomParamSize = 7;
-//                for (int j = 0; j < force.getNumComputedValues(); j++) {
+        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-//                    if (needChainForValue[j]) {
+            CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
-//                        n2EnergyExpressions["/*"+cu.intToString(i+1)+"*/ deriv"+energyDerivs->getParameterSuffix(j, "_1")+" += "] = energyDerivExpressions[i][2*j];
+            string paramName = "params"+cu.intToString(i+1);
-//                        n2EnergyExpressions["/*"+cu.intToString(i+1)+"*/ deriv"+energyDerivs->getParameterSuffix(j, "_2")+" += "] = energyDerivExpressions[i][2*j+1];
+            if (n2EnergyStr.find(paramName+"1") != n2EnergyStr.npos || n2EnergyStr.find(paramName+"2") != n2EnergyStr.npos) {
-//                    }
+                extraArgs << ", const " << buffer.getType() << "* __restrict__ global_" << paramName;
-//                }
+                atomParams << buffer.getType() << " " << paramName << ";\n";
-//            }
+                loadLocal1 << "localData[localAtomIndex]." << paramName << " = " << paramName << "1;\n";
-//            if (exclude)
+                loadLocal2 << "localData[localAtomIndex]." << paramName << " = global_" << paramName << "[j];\n";
-//                n2EnergySource << "if (!isExcluded) {\n";
+                load1 << buffer.getType() << " " << paramName << "1 = global_" << paramName << "[atom1];\n";
-//            n2EnergySource << cu.getExpressionUtilities().createExpressions(n2EnergyExpressions, variables, functionDefinitions, "temp", prefix+"functionParams");
+                load2 << buffer.getType() << " " << paramName << "2 = localData[atom2]." << paramName << ";\n";
-//            if (exclude)
+                pairEnergyUsesParam[i] = true;
-//                n2EnergySource << "}\n";
+                atomParamSize += buffer.getNumComponents();
-//        }
+            }
-//        map<string, string> replacements;
+        }
-//        string n2EnergyStr = n2EnergySource.str();
+        pairEnergyUsesValue.resize(computedValues->getBuffers().size(), false);
-//        replacements["COMPUTE_INTERACTION"] = n2EnergyStr;
+        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
-//        stringstream extraArgs, loadLocal1, loadLocal2, clearLocal, load1, load2, declare1, recordDeriv, storeDerivs1, storeDerivs2, declareTemps, setTemps;
+            CudaNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
-//        if (force.getNumGlobalParameters() > 0)
+            string valueName = "values"+cu.intToString(i+1);
-//            extraArgs << ", __global const float* globals";
+            if (n2EnergyStr.find(valueName+"1") != n2EnergyStr.npos || n2EnergyStr.find(valueName+"2") != n2EnergyStr.npos) {
-//        pairEnergyUsesParam.resize(params->getBuffers().size(), false);
+                extraArgs << ", const " << buffer.getType() << "* __restrict__ global_" << valueName;
-//        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
+                atomParams << buffer.getType() << " " << valueName << ";\n";
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+                loadLocal1 << "localData[localAtomIndex]." << valueName << " = " << valueName << "1;\n";
-//            string paramName = "params"+cu.intToString(i+1);
+                loadLocal2 << "localData[localAtomIndex]." << valueName << " = global_" << valueName << "[j];\n";
-//            if (n2EnergyStr.find(paramName+"1") != n2EnergyStr.npos || n2EnergyStr.find(paramName+"2") != n2EnergyStr.npos) {
+                load1 << buffer.getType() << " " << valueName << "1 = global_" << valueName << "[atom1];\n";
-//                extraArgs << ", __global const " << buffer.getType() << "* restrict global_" << paramName << ", __local " << buffer.getType() << "* restrict local_" << paramName;
+                load2 << buffer.getType() << " " << valueName << "2 = localData[atom2]." << valueName << ";\n";
-//                loadLocal1 << "local_" << paramName << "[localAtomIndex] = " << paramName << "1;\n";
+                pairEnergyUsesValue[i] = true;
-//                loadLocal2 << "local_" << paramName << "[localAtomIndex] = global_" << paramName << "[j];\n";
+                atomParamSize += buffer.getNumComponents();
-//                load1 << buffer.getType() << " " << paramName << "1 = global_" << paramName << "[atom1];\n";
+            }
-//                load2 << buffer.getType() << " " << paramName << "2 = local_" << paramName << "[atom2];\n";
+        }
-//                pairEnergyUsesParam[i] = true;
+        extraArgs << ", unsigned long long* __restrict__ derivBuffers";
-//            }
+        for (int i = 0; i < force.getNumComputedValues(); i++) {
-//        }
+            string index = cu.intToString(i+1);
-//        pairEnergyUsesValue.resize(computedValues->getBuffers().size(), false);
+            atomParams << "real deriv" << index << ";\n";
-//        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
+            clearLocal << "localData[localAtomIndex].deriv" << index << " = 0;\n";
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
+            declare1 << "real deriv" << index << "_1 = 0;\n";
-//            string valueName = "values"+cu.intToString(i+1);
+            load2 << "real deriv" << index << "_2 = 0;\n";
-//            if (n2EnergyStr.find(valueName+"1") != n2EnergyStr.npos || n2EnergyStr.find(valueName+"2") != n2EnergyStr.npos) {
+            recordDeriv << "localData[atom2].deriv" << index << " += deriv" << index << "_2;\n";
-//                extraArgs << ", __global const " << buffer.getType() << "* restrict global_" << valueName << ", __local " << buffer.getType() << "* restrict local_" << valueName;
+            storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
-//                loadLocal1 << "local_" << valueName << "[localAtomIndex] = " << valueName << "1;\n";
+            storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
-//                loadLocal2 << "local_" << valueName << "[localAtomIndex] = global_" << valueName << "[j];\n";
+            // Emit the per-block scratch array used for this derivative. The generated code is
+            // CUDA, so the storage qualifier must be __shared__; __local is the OpenCL spelling.
+            // NOTE(review): the array holds 64 elements but SET_TEMP_BUFFERS indexes it with
+            // threadIdx.x -- confirm the kernel is never launched with more than 64 threads per block.
+            declareTemps << "__shared__ real tempDerivBuffer" << index << "[64];\n";
-//                load1 << buffer.getType() << " " << valueName << "1 = global_" << valueName << "[atom1];\n";
+            setTemps << "tempDerivBuffer" << index << "[threadIdx.x] = deriv" << index << "_1;\n";
-//                load2 << buffer.getType() << " " << valueName << "2 = local_" << valueName << "[atom2];\n";
+            atomParamSize++;
-//                pairEnergyUsesValue[i] = true;
+        }
-//            }
+        replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
-//        }
+        replacements["ATOM_PARAMETER_DATA"] = atomParams.str();
-//        if (useLong) {
+        replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
-//            extraArgs << ", __global long* restrict derivBuffers";
+        replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
-//            for (int i = 0; i < force.getNumComputedValues(); i++) {
+        replacements["CLEAR_LOCAL_DERIVATIVES"] = clearLocal.str();
-//                string index = cu.intToString(i+1);
+        replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
-//                extraArgs << ", __local float* restrict local_deriv" << index;
+        replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
-//                clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
+        replacements["DECLARE_ATOM1_DERIVATIVES"] = declare1.str();
-//                declare1 << "float deriv" << index << "_1 = 0.0f;\n";
+        replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str();
-//                load2 << "float deriv" << index << "_2 = 0.0f;\n";
+        replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str();
-//                recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
+        replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
-//                storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
+        replacements["DECLARE_TEMP_BUFFERS"] = declareTemps.str();
-//                storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
+        replacements["SET_TEMP_BUFFERS"] = setTemps.str();
-//                declareTemps << "__local float tempDerivBuffer" << index << "[64];\n";
+        map<string, string> defines;
-//                setTemps << "tempDerivBuffer" << index << "[get_local_id(0)] = deriv" << index << "_1;\n";
+        if (useCutoff)
-//            }
+            defines["USE_CUTOFF"] = "1";
-//        }
+        if (usePeriodic)
-//        else {
+            defines["USE_PERIODIC"] = "1";
-//            for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
+        if (anyExclusions)
-//                const CudaNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
+            defines["USE_EXCLUSIONS"] = "1";
-//                string index = cu.intToString(i+1);
+        if (atomParamSize%2 == 0 && !cu.getUseDoublePrecision())
-//                extraArgs << ", __global " << buffer.getType() << "* restrict derivBuffers" << index << ", __local " << buffer.getType() << "* restrict local_deriv" << index;
+            defines["NEED_PADDING"] = "1";
-//                clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
+        defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
-//                declare1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
+        defines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
-//                load2 << buffer.getType() << " deriv" << index << "_2 = 0.0f;\n";
+        defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
-//                recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
+        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//                storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
+        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//                storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
+        defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
-//                declareTemps << "__local " << buffer.getType() << " tempDerivBuffer" << index << "[64];\n";
+        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customGBEnergyN2, replacements), defines);
-//                setTemps << "tempDerivBuffer" << index << "[get_local_id(0)] = deriv" << index << "_1;\n";
+        pairEnergyKernel = cu.getKernel(module, "computeN2Energy");
-//            }
+    }
-//        }
+    {
-//        replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
+        // Create the kernel to reduce the derivatives and calculate per-particle energy terms.
-//        replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
-//        replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
+        stringstream compute, extraArgs, load;
-//        replacements["CLEAR_LOCAL_DERIVATIVES"] = clearLocal.str();
+        if (force.getNumGlobalParameters() > 0)
-//        replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
+            extraArgs << ", const float* globals";
-//        replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
+        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-//        replacements["DECLARE_ATOM1_DERIVATIVES"] = declare1.str();
+            CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
-//        replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str();
+            string paramName = "params"+cu.intToString(i+1);
-//        replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str();
+            extraArgs << ", const " << buffer.getType() << "* __restrict__ " << paramName;
-//        replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
+        }
-//        replacements["DECLARE_TEMP_BUFFERS"] = declareTemps.str();
+        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
-//        replacements["SET_TEMP_BUFFERS"] = setTemps.str();
+            CudaNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
-//        map<string, string> defines;
+            string valueName = "values"+cu.intToString(i+1);
-//        if (cu.getNonbondedUtilities().getForceBufferPerAtomBlock())
+            extraArgs << ", const " << buffer.getType() << "* __restrict__ " << valueName;
-//            defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
+        }
-//        if (useCutoff)
+        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
-//            defines["USE_CUTOFF"] = "1";
+            CudaNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
-//        if (usePeriodic)
+            string index = cu.intToString(i+1);
-//            defines["USE_PERIODIC"] = "1";
+            extraArgs << ", " << buffer.getType() << "* __restrict__ derivBuffers" << index;
-//        if (anyExclusions)
+            compute << buffer.getType() << " deriv" << index << " = derivBuffers" << index << "[index];\n";
-//            defines["USE_EXCLUSIONS"] = "1";
+        }
-//        if (cu.getSIMDWidth() == 32)
+        extraArgs << ", const long long* __restrict__ derivBuffersIn";
-//            defines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
+        for (int i = 0; i < energyDerivs->getNumParameters(); ++i)
-//        defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
+            load << "derivBuffers" << energyDerivs->getParameterSuffix(i, "[index]") <<
-//        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+                    " = RECIP(0xFFFFFFFF)*derivBuffersIn[index+PADDED_NUM_ATOMS*" << cu.intToString(i) << "];\n";
-//        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//        defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
+        // Compute the various expressions.
-//        string file;
-//        if (deviceIsCpu)
+        map<string, string> variables;
-//            file = CudaKernelSources::customGBEnergyN2_cpu;
+        variables["x"] = "pos.x";
-//        else if (cu.getSIMDWidth() == 32)
+        variables["y"] = "pos.y";
-//            file = CudaKernelSources::customGBEnergyN2_nvidia;
+        variables["z"] = "pos.z";
-//        else
+        for (int i = 0; i < force.getNumPerParticleParameters(); i++)
-//            file = CudaKernelSources::customGBEnergyN2_default;
+            variables[force.getPerParticleParameterName(i)] = "params"+params->getParameterSuffix(i, "[index]");
-//        CUmodule module = cu.createModule(cu.replaceStrings(file, replacements), defines);
+        for (int i = 0; i < force.getNumGlobalParameters(); i++)
-//        pairEnergyKernel = cu.getKernel(module, "computeN2Energy");
+            variables[force.getGlobalParameterName(i)] = "globals["+cu.intToString(i)+"]";
-//    }
+        for (int i = 0; i < force.getNumComputedValues(); i++)
-//    {
+            variables[computedValueNames[i]] = "values"+computedValues->getParameterSuffix(i, "[index]");
-//        // Create the kernel to reduce the derivatives and calculate per-particle energy terms.
+        map<string, Lepton::ParsedExpression> expressions;
-//
+        for (int i = 0; i < force.getNumEnergyTerms(); i++) {
-//        stringstream compute, extraArgs, reduce;
+            string expression;
-//        if (force.getNumGlobalParameters() > 0)
+            CustomGBForce::ComputationType type;
-//            extraArgs << ", __global const float* globals";
+            force.getEnergyTermParameters(i, expression, type);
-//        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
+            if (type != CustomGBForce::SingleParticle)
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+                continue;
-//            string paramName = "params"+cu.intToString(i+1);
+            Lepton::ParsedExpression parsed = Lepton::Parser::parse(expression, functions).optimize();
-//            extraArgs << ", __global const " << buffer.getType() << "* restrict " << paramName;
+            expressions["/*"+cu.intToString(i+1)+"*/ energy += "] = parsed;
-//        }
+            for (int j = 0; j < force.getNumComputedValues(); j++)
-//        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
+                expressions["/*"+cu.intToString(i+1)+"*/ deriv"+energyDerivs->getParameterSuffix(j)+" += "] = energyDerivExpressions[i][j];
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
+            Lepton::ParsedExpression gradx = parsed.differentiate("x").optimize();
-//            string valueName = "values"+cu.intToString(i+1);
+            Lepton::ParsedExpression grady = parsed.differentiate("y").optimize();
-//            extraArgs << ", __global const " << buffer.getType() << "* restrict " << valueName;
+            Lepton::ParsedExpression gradz = parsed.differentiate("z").optimize();
-//        }
+            if (!isZeroExpression(gradx))
-//        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
+                expressions["/*"+cu.intToString(i+1)+"*/ force.x -= "] = gradx;
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
+            if (!isZeroExpression(grady))
-//            string index = cu.intToString(i+1);
+                expressions["/*"+cu.intToString(i+1)+"*/ force.y -= "] = grady;
-//            extraArgs << ", __global " << buffer.getType() << "* restrict derivBuffers" << index;
+            if (!isZeroExpression(gradz))
-//            compute << buffer.getType() << " deriv" << index << " = derivBuffers" << index << "[index];\n";
+                expressions["/*"+cu.intToString(i+1)+"*/ force.z -= "] = gradz;
-//        }
+        }
-//        if (useLong) {
+        for (int i = 1; i < force.getNumComputedValues(); i++)
-//            extraArgs << ", __global const long* restrict derivBuffersIn";
+            for (int j = 0; j < i; j++)
-//            for (int i = 0; i < energyDerivs->getNumParameters(); ++i)
+                expressions["real dV"+cu.intToString(i)+"dV"+cu.intToString(j)+" = "] = valueDerivExpressions[i][j];
-//                reduce << "derivBuffers" << energyDerivs->getParameterSuffix(i, "[index]") <<
+        compute << cu.getExpressionUtilities().createExpressions(expressions, variables, functionDefinitions, "temp", prefix+"functionParams");
-//                        " = (1.0f/0xFFFFFFFF)*derivBuffersIn[index+PADDED_NUM_ATOMS*" << cu.intToString(i) << "];\n";
-//        }
+        // Record values.
-//        else {
-//            for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++)
+        compute << "forceBuffers[index] += (long long) (force.x*0xFFFFFFFF);\n";
-//                reduce << "REDUCE_VALUE(derivBuffers" << cu.intToString(i+1) << ", " << energyDerivs->getBuffers()[i].getType() << ")\n";
+        compute << "forceBuffers[index+PADDED_NUM_ATOMS] += (long long) (force.y*0xFFFFFFFF);\n";
-//        }
+        compute << "forceBuffers[index+PADDED_NUM_ATOMS*2] += (long long) (force.z*0xFFFFFFFF);\n";
-//        
+        for (int i = 1; i < force.getNumComputedValues(); i++) {
-//        // Compute the various expressions.
+            compute << "real totalDeriv"<<i<<" = dV"<<i<<"dV0";
-//        
+            for (int j = 1; j < i; j++)
-//        map<string, string> variables;
+                compute << " + totalDeriv"<<j<<"*dV"<<i<<"dV"<<j;
-//        variables["x"] = "pos.x";
+            compute << ";\n";
-//        variables["y"] = "pos.y";
+            compute << "deriv"<<(i+1)<<" *= totalDeriv"<<i<<";\n";
-//        variables["z"] = "pos.z";
+        }
-//        for (int i = 0; i < force.getNumPerParticleParameters(); i++)
+        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
-//            variables[force.getPerParticleParameterName(i)] = "params"+params->getParameterSuffix(i, "[index]");
+            string index = cu.intToString(i+1);
-//        for (int i = 0; i < force.getNumGlobalParameters(); i++)
+            compute << "derivBuffers" << index << "[index] = deriv" << index << ";\n";
-//            variables[force.getGlobalParameterName(i)] = "globals["+cu.intToString(i)+"]";
+        }
-//        for (int i = 0; i < force.getNumComputedValues(); i++)
+        map<string, string> replacements;
-//            variables[computedValueNames[i]] = "values"+computedValues->getParameterSuffix(i, "[index]");
+        replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
-//        map<string, Lepton::ParsedExpression> expressions;
+        replacements["LOAD_DERIVATIVES"] = load.str();
-//        for (int i = 0; i < force.getNumEnergyTerms(); i++) {
+        replacements["COMPUTE_ENERGY"] = compute.str();
-//            string expression;
+        map<string, string> defines;
-//            CustomGBForce::ComputationType type;
+        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//            force.getEnergyTermParameters(i, expression, type);
+        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//            if (type != CustomGBForce::SingleParticle)
+        CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customGBEnergyPerParticle, replacements), defines);
-//                continue;
+        perParticleEnergyKernel = cu.getKernel(module, "computePerParticleEnergy");
-//            Lepton::ParsedExpression parsed = Lepton::Parser::parse(expression, functions).optimize();
+    }
-//            expressions["/*"+cu.intToString(i+1)+"*/ energy += "] = parsed;
+    if (needParameterGradient) {
-//            for (int j = 0; j < force.getNumComputedValues(); j++)
+        // Create the kernel to compute chain rule terms for computed values that depend explicitly on particle coordinates.
-//                expressions["/*"+cu.intToString(i+1)+"*/ deriv"+energyDerivs->getParameterSuffix(j)+" += "] = energyDerivExpressions[i][j];
-//            Lepton::ParsedExpression gradx = parsed.differentiate("x").optimize();
+        stringstream compute, extraArgs;
-//            Lepton::ParsedExpression grady = parsed.differentiate("y").optimize();
+        if (force.getNumGlobalParameters() > 0)
-//            Lepton::ParsedExpression gradz = parsed.differentiate("z").optimize();
+            extraArgs << ", const float* globals";
-//            if (!isZeroExpression(gradx))
+        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-//                expressions["/*"+cu.intToString(i+1)+"*/ force.x -= "] = gradx;
+            CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
-//            if (!isZeroExpression(grady))
+            string paramName = "params"+cu.intToString(i+1);
-//                expressions["/*"+cu.intToString(i+1)+"*/ force.y -= "] = grady;
+            extraArgs << ", const " << buffer.getType() << "* __restrict__ " << paramName;
-//            if (!isZeroExpression(gradz))
+        }
-//                expressions["/*"+cu.intToString(i+1)+"*/ force.z -= "] = gradz;
+        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
-//        }
+            CudaNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
-//        for (int i = 1; i < force.getNumComputedValues(); i++)
+            string valueName = "values"+cu.intToString(i+1);
-//            for (int j = 0; j < i; j++)
+            extraArgs << ", const " << buffer.getType() << "* __restrict__ " << valueName;
-//                expressions["float dV"+cu.intToString(i)+"dV"+cu.intToString(j)+" = "] = valueDerivExpressions[i][j];
+        }
-//        compute << cu.getExpressionUtilities().createExpressions(expressions, variables, functionDefinitions, "temp", prefix+"functionParams");
+        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
-//        
+            CudaNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
-//        // Record values.
+            string index = cu.intToString(i+1);
-//        
+            extraArgs << ", " << buffer.getType() << "* __restrict__ derivBuffers" << index;
-//        compute << "forceBuffers[index] = forceBuffers[index]+force;\n";
+            compute << buffer.getType() << " deriv" << index << " = derivBuffers" << index << "[index];\n";
-//        for (int i = 1; i < force.getNumComputedValues(); i++) {
+        }
-//            compute << "float totalDeriv"<<i<<" = dV"<<i<<"dV0";
+        map<string, string> variables;
-//            for (int j = 1; j < i; j++)
+        variables["x"] = "pos.x";
-//                compute << " + totalDeriv"<<j<<"*dV"<<i<<"dV"<<j;
+        variables["y"] = "pos.y";
-//            compute << ";\n";
+        variables["z"] = "pos.z";
-//            compute << "deriv"<<(i+1)<<" *= totalDeriv"<<i<<";\n";
+        for (int i = 0; i < force.getNumPerParticleParameters(); i++)
-//        }
+            variables[force.getPerParticleParameterName(i)] = "params"+params->getParameterSuffix(i, "[index]");
-//        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
+        for (int i = 0; i < force.getNumGlobalParameters(); i++)
-//            string index = cu.intToString(i+1);
+            variables[force.getGlobalParameterName(i)] = "globals["+cu.intToString(i)+"]";
-//            compute << "derivBuffers" << index << "[index] = deriv" << index << ";\n";
+        for (int i = 0; i < force.getNumComputedValues(); i++)
-//        }
+            variables[computedValueNames[i]] = "values"+computedValues->getParameterSuffix(i, "[index]");
-//        map<string, string> replacements;
+        for (int i = 1; i < force.getNumComputedValues(); i++) {
-//        replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
+            string is = cu.intToString(i);
-//        replacements["REDUCE_DERIVATIVES"] = reduce.str();
+            compute << "real3 dV"<<is<<"dR = make_real3(0);\n";
-//        replacements["COMPUTE_ENERGY"] = compute.str();
+            for (int j = 1; j < i; j++) {
-//        map<string, string> defines;
+                if (!isZeroExpression(valueDerivExpressions[i][j])) {
-//        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+                    map<string, Lepton::ParsedExpression> derivExpressions;
-//        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+                    string js = cu.intToString(j);
-//        CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customGBEnergyPerParticle, replacements), defines);
+                    derivExpressions["real dV"+is+"dV"+js+" = "] = valueDerivExpressions[i][j];
-//        perParticleEnergyKernel = cu.getKernel(module, "computePerParticleEnergy");
+                    compute << cu.getExpressionUtilities().createExpressions(derivExpressions, variables, functionDefinitions, "temp_"+is+"_"+js, prefix+"functionParams");
-//    }
+                    compute << "dV"<<is<<"dR += dV"<<is<<"dV"<<js<<"*dV"<<js<<"dR;\n";
-//    if (needParameterGradient) {
+                }
-//        // Create the kernel to compute chain rule terms for computed values that depend explicitly on particle coordinates.
+            }
-//
+            map<string, Lepton::ParsedExpression> gradientExpressions;
-//        stringstream compute, extraArgs;
+            if (!isZeroExpression(valueGradientExpressions[i][0]))
-//        if (force.getNumGlobalParameters() > 0)
+                gradientExpressions["dV"+is+"dR.x += "] = valueGradientExpressions[i][0];
-//            extraArgs << ", __global const float* globals";
+            if (!isZeroExpression(valueGradientExpressions[i][1]))
-//        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
+                gradientExpressions["dV"+is+"dR.y += "] = valueGradientExpressions[i][1];
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+            if (!isZeroExpression(valueGradientExpressions[i][2]))
-//            string paramName = "params"+cu.intToString(i+1);
+                gradientExpressions["dV"+is+"dR.z += "] = valueGradientExpressions[i][2];
-//            extraArgs << ", __global const " << buffer.getType() << "* restrict " << paramName;
+            compute << cu.getExpressionUtilities().createExpressions(gradientExpressions, variables, functionDefinitions, "temp", prefix+"functionParams");
-//        }
+        }
-//        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
+        for (int i = 1; i < force.getNumComputedValues(); i++) {
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
+            string is = cu.intToString(i);
-//            string valueName = "values"+cu.intToString(i+1);
+            compute << "force -= deriv"<<energyDerivs->getParameterSuffix(i)<<"*dV"<<is<<"dR;\n";
-//            extraArgs << ", __global const " << buffer.getType() << "* restrict " << valueName;
+        }
-//        }
+        map<string, string> replacements;
-//        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
+        replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
+        replacements["COMPUTE_FORCES"] = compute.str();
-//            string index = cu.intToString(i+1);
+        map<string, string> defines;
-//            extraArgs << ", __global " << buffer.getType() << "* restrict derivBuffers" << index;
+        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//            compute << buffer.getType() << " deriv" << index << " = derivBuffers" << index << "[index];\n";
+        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//        }
+        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customGBGradientChainRule, replacements), defines);
-//        map<string, string> variables;
+        gradientChainRuleKernel = cu.getKernel(module, "computeGradientChainRuleTerms");
-//        variables["x"] = "pos.x";
+    }
-//        variables["y"] = "pos.y";
+    {
-//        variables["z"] = "pos.z";
+        // Create the code to calculate chain rule terms as part of the default nonbonded kernel.
-//        for (int i = 0; i < force.getNumPerParticleParameters(); i++)
-//            variables[force.getPerParticleParameterName(i)] = "params"+params->getParameterSuffix(i, "[index]");
+        vector<pair<ExpressionTreeNode, string> > globalVariables;
-//        for (int i = 0; i < force.getNumGlobalParameters(); i++)
+        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
-//            variables[force.getGlobalParameterName(i)] = "globals["+cu.intToString(i)+"]";
+            const string& name = force.getGlobalParameterName(i);
-//        for (int i = 0; i < force.getNumComputedValues(); i++)
+            string value = "globals["+cu.intToString(i)+"]";
-//            variables[computedValueNames[i]] = "values"+computedValues->getParameterSuffix(i, "[index]");
+            globalVariables.push_back(makeVariable(name, prefix+value));
-//        for (int i = 1; i < force.getNumComputedValues(); i++) {
+        }
-//            string is = cu.intToString(i);
+        vector<pair<ExpressionTreeNode, string> > variables = globalVariables;
-//            compute << "float4 dV"<<is<<"dR = (float4) 0;\n";
+        map<string, string> rename;
-//            for (int j = 1; j < i; j++) {
+        ExpressionTreeNode rnode(new Operation::Variable("r"));
-//                if (!isZeroExpression(valueDerivExpressions[i][j])) {
+        variables.push_back(make_pair(rnode, "r"));
-//                    map<string, Lepton::ParsedExpression> derivExpressions;
+        variables.push_back(make_pair(ExpressionTreeNode(new Operation::Square(), rnode), "r2"));
-//                    string js = cu.intToString(j);
+        variables.push_back(make_pair(ExpressionTreeNode(new Operation::Reciprocal(), rnode), "invR"));
-//                    derivExpressions["float dV"+is+"dV"+js+" = "] = valueDerivExpressions[i][j];
+        for (int i = 0; i < force.getNumPerParticleParameters(); i++) {
-//                    compute << cu.getExpressionUtilities().createExpressions(derivExpressions, variables, functionDefinitions, "temp_"+is+"_"+js, prefix+"functionParams");
+            const string& name = force.getPerParticleParameterName(i);
-//                    compute << "dV"<<is<<"dR += dV"<<is<<"dV"<<js<<"*dV"<<js<<"dR;\n";
+            variables.push_back(makeVariable(name+"1", prefix+"params"+params->getParameterSuffix(i, "1")));
-//                }
+            variables.push_back(makeVariable(name+"2", prefix+"params"+params->getParameterSuffix(i, "2")));
-//            }
+            rename[name+"1"] = name+"2";
-//            map<string, Lepton::ParsedExpression> gradientExpressions;
+            rename[name+"2"] = name+"1";
-//            if (!isZeroExpression(valueGradientExpressions[i][0]))
+        }
-//                gradientExpressions["dV"+is+"dR.x += "] = valueGradientExpressions[i][0];
+        map<string, Lepton::ParsedExpression> derivExpressions;
-//            if (!isZeroExpression(valueGradientExpressions[i][1]))
+        stringstream chainSource;
-//                gradientExpressions["dV"+is+"dR.y += "] = valueGradientExpressions[i][1];
+        Lepton::ParsedExpression dVdR = Lepton::Parser::parse(computedValueExpressions[0], functions).differentiate("r").optimize();
-//            if (!isZeroExpression(valueGradientExpressions[i][2]))
+        derivExpressions["real dV0dR1 = "] = dVdR;
-//                gradientExpressions["dV"+is+"dR.z += "] = valueGradientExpressions[i][2];
+        derivExpressions["real dV0dR2 = "] = dVdR.renameVariables(rename);
-//            compute << cu.getExpressionUtilities().createExpressions(gradientExpressions, variables, functionDefinitions, "temp", prefix+"functionParams");
+        chainSource << cu.getExpressionUtilities().createExpressions(derivExpressions, variables, functionDefinitions, prefix+"temp0_", prefix+"functionParams");
-//        }
+        if (needChainForValue[0]) {
-//        for (int i = 1; i < force.getNumComputedValues(); i++) {
+            if (useExclusionsForValue)
-//            string is = cu.intToString(i);
+                chainSource << "if (!isExcluded) {\n";
-//            compute << "force -= deriv"<<energyDerivs->getParameterSuffix(i)<<"*dV"<<is<<"dR;\n";
+            chainSource << "tempForce -= dV0dR1*" << prefix << "dEdV" << energyDerivs->getParameterSuffix(0, "1") << ";\n";
-//        }
+            chainSource << "tempForce -= dV0dR2*" << prefix << "dEdV" << energyDerivs->getParameterSuffix(0, "2") << ";\n";
-//        map<string, string> replacements;
+            if (useExclusionsForValue)
-//        replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
+                chainSource << "}\n";
-//        replacements["COMPUTE_FORCES"] = compute.str();
+        }
-//        map<string, string> defines;
+        for (int i = 1; i < force.getNumComputedValues(); i++) {
-//        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+            if (needChainForValue[i]) {
-//        CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customGBGradientChainRule, replacements), defines);
+                chainSource << "tempForce -= dV0dR1*" << prefix << "dEdV" << energyDerivs->getParameterSuffix(i, "1") << ";\n";
-//        gradientChainRuleKernel = cu.getKernel(module, "computeGradientChainRuleTerms");
+                chainSource << "tempForce -= dV0dR2*" << prefix << "dEdV" << energyDerivs->getParameterSuffix(i, "2") << ";\n";
-//    }
+            }
-//    {
+        }
-//        // Create the code to calculate chain rules terms as part of the default nonbonded kernel.
+        map<string, string> replacements;
-//
+        string chainStr = chainSource.str();
-//        vector<pair<ExpressionTreeNode, string> > globalVariables;
+        replacements["COMPUTE_FORCE"] = chainStr;
-//        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
+        string source = cu.replaceStrings(CudaKernelSources::customGBChainRule, replacements);
-//            const string& name = force.getGlobalParameterName(i);
+        vector<CudaNonbondedUtilities::ParameterInfo> parameters;
-//            string value = "globals["+cu.intToString(i)+"]";
+        vector<CudaNonbondedUtilities::ParameterInfo> arguments;
-//            globalVariables.push_back(makeVariable(name, prefix+value));
+        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-//        }
+            CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
-//        vector<pair<ExpressionTreeNode, string> > variables = globalVariables;
+            string paramName = prefix+"params"+cu.intToString(i+1);
-//        map<string, string> rename;
+            if (chainStr.find(paramName+"1") != chainStr.npos || chainStr.find(paramName+"2") != chainStr.npos)
-//        ExpressionTreeNode rnode(new Operation::Variable("r"));
+                parameters.push_back(CudaNonbondedUtilities::ParameterInfo(paramName, buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
-//        variables.push_back(make_pair(rnode, "r"));
+        }
-//        variables.push_back(make_pair(ExpressionTreeNode(new Operation::Square(), rnode), "r2"));
+        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
-//        variables.push_back(make_pair(ExpressionTreeNode(new Operation::Reciprocal(), rnode), "invR"));
+            CudaNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
-//        for (int i = 0; i < force.getNumPerParticleParameters(); i++) {
+            string paramName = prefix+"values"+cu.intToString(i+1);
-//            const string& name = force.getPerParticleParameterName(i);
+            if (chainStr.find(paramName+"1") != chainStr.npos || chainStr.find(paramName+"2") != chainStr.npos)
-//            variables.push_back(makeVariable(name+"1", prefix+"params"+params->getParameterSuffix(i, "1")));
+                parameters.push_back(CudaNonbondedUtilities::ParameterInfo(paramName, buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
-//            variables.push_back(makeVariable(name+"2", prefix+"params"+params->getParameterSuffix(i, "2")));
+        }
-//            rename[name+"1"] = name+"2";
+        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
-//            rename[name+"2"] = name+"1";
+            if (needChainForValue[i]) { 
-//        }
+                CudaNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
-//        map<string, Lepton::ParsedExpression> derivExpressions;
+                string paramName = prefix+"dEdV"+cu.intToString(i+1);
-//        stringstream chainSource;
+                parameters.push_back(CudaNonbondedUtilities::ParameterInfo(paramName, buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
-//        Lepton::ParsedExpression dVdR = Lepton::Parser::parse(computedValueExpressions[0], functions).differentiate("r").optimize();
+            }
-//        derivExpressions["float dV0dR1 = "] = dVdR;
+        }
-//        derivExpressions["float dV0dR2 = "] = dVdR.renameVariables(rename);
+        if (globals != NULL) {
-//        chainSource << cu.getExpressionUtilities().createExpressions(derivExpressions, variables, functionDefinitions, prefix+"temp0_", prefix+"functionParams");
+            globals->upload(globalParamValues);
-//        if (needChainForValue[0]) {
+            arguments.push_back(CudaNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(float), globals->getDevicePointer()));
-//            if (useExclusionsForValue)
+        }
-//                chainSource << "if (!isExcluded) {\n";
+        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, force.getNumExclusions() > 0, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
-//            chainSource << "tempForce -= dV0dR1*" << prefix << "dEdV" << energyDerivs->getParameterSuffix(0, "1") << ";\n";
+        for (int i = 0; i < (int) parameters.size(); i++)
-//            chainSource << "tempForce -= dV0dR2*" << prefix << "dEdV" << energyDerivs->getParameterSuffix(0, "2") << ";\n";
+            cu.getNonbondedUtilities().addParameter(parameters[i]);
-//            if (useExclusionsForValue)
+        for (int i = 0; i < (int) arguments.size(); i++)
-//                chainSource << "}\n";
+            cu.getNonbondedUtilities().addArgument(arguments[i]);
-//        }
+    }
-//        for (int i = 1; i < force.getNumComputedValues(); i++) {
+    cu.addForce(new CudaCustomGBForceInfo(force));
-//            if (needChainForValue[i]) {
+    cu.addAutoclearBuffer(longEnergyDerivs->getDevicePointer(), sizeof(long long)*longEnergyDerivs->getSize());
-//                chainSource << "tempForce -= dV0dR1*" << prefix << "dEdV" << energyDerivs->getParameterSuffix(i, "1") << ";\n";
+}
-//                chainSource << "tempForce -= dV0dR2*" << prefix << "dEdV" << energyDerivs->getParameterSuffix(i, "2") << ";\n";
-//            }
+double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-//        }
+    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
-//        map<string, string> replacements;
+    if (!hasInitializedKernels) {
-//        string chainStr = chainSource.str();
+        hasInitializedKernels = true;
-//        replacements["COMPUTE_FORCE"] = chainStr;
+        maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
-//        string source = cu.replaceStrings(CudaKernelSources::customGBChainRule, replacements);
+        valueBuffers = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "customGBValueBuffers");
-//        vector<CudaNonbondedUtilities::ParameterInfo> parameters;
+        cu.addAutoclearBuffer(valueBuffers->getDevicePointer(), sizeof(long long)*valueBuffers->getSize());
-//        vector<CudaNonbondedUtilities::ParameterInfo> arguments;
+        cu.clearBuffer(valueBuffers->getDevicePointer(), sizeof(long long)*valueBuffers->getSize());
-//        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
+        pairValueArgs.push_back(&cu.getPosq().getDevicePointer());
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+        pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusions().getDevicePointer());
-//            string paramName = prefix+"params"+cu.intToString(i+1);
+        pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusionIndices().getDevicePointer());
-//            if (chainStr.find(paramName+"1") != chainStr.npos || chainStr.find(paramName+"2") != chainStr.npos)
+        pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusionRowIndices().getDevicePointer());
-//                parameters.push_back(CudaNonbondedUtilities::ParameterInfo(paramName, buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
+        pairValueArgs.push_back(&valueBuffers->getDevicePointer());
-//        }
+        if (nb.getUseCutoff()) {
-//        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
+            pairValueArgs.push_back(&nb.getInteractingTiles().getDevicePointer());
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
+            pairValueArgs.push_back(&nb.getInteractionCount().getDevicePointer());
-//            string paramName = prefix+"values"+cu.intToString(i+1);
+            pairValueArgs.push_back(cu.getPeriodicBoxSizePointer());
-//            if (chainStr.find(paramName+"1") != chainStr.npos || chainStr.find(paramName+"2") != chainStr.npos)
+            pairValueArgs.push_back(cu.getInvPeriodicBoxSizePointer());
-//                parameters.push_back(CudaNonbondedUtilities::ParameterInfo(paramName, buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
+            pairValueArgs.push_back(&maxTiles);
-//        }
+            pairValueArgs.push_back(&nb.getInteractionFlags().getDevicePointer());
-//        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
+        }
-//            if (needChainForValue[i]) { 
+        else
-//                const CudaNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
+            pairValueArgs.push_back(&maxTiles);
-//                string paramName = prefix+"dEdV"+cu.intToString(i+1);
+        if (globals != NULL)
-//                parameters.push_back(CudaNonbondedUtilities::ParameterInfo(paramName, buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
+            pairValueArgs.push_back(&globals->getDevicePointer());
-//            }
+        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-//        }
+            if (pairValueUsesParam[i])
-//        if (globals != NULL) {
+                pairValueArgs.push_back(&params->getBuffers()[i].getMemory());
-//            globals->upload(globalParamValues);
+        }
-//            arguments.push_back(CudaNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(cl_float), globals->getDevicePointer()));
+        if (tabulatedFunctionParams != NULL) {
-//        }
+            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
-//        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, force.getNumExclusions() > 0, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
+                pairValueArgs.push_back(&tabulatedFunctions[i]->getDevicePointer());
-//        for (int i = 0; i < (int) parameters.size(); i++)
+            pairValueArgs.push_back(&tabulatedFunctionParams->getDevicePointer());
-//            cu.getNonbondedUtilities().addParameter(parameters[i]);
+        }
-//        for (int i = 0; i < (int) arguments.size(); i++)
+        perParticleValueArgs.push_back(&cu.getPosq().getDevicePointer());
-//            cu.getNonbondedUtilities().addArgument(arguments[i]);
+        perParticleValueArgs.push_back(&valueBuffers->getDevicePointer());
-//    }
+        if (globals != NULL)
-//    cu.addForce(new CudaCustomGBForceInfo(cu.getNonbondedUtilities().getNumForceBuffers(), force));
+            perParticleValueArgs.push_back(&globals->getDevicePointer());
-//    if (useLong)
+        for (int i = 0; i < (int) params->getBuffers().size(); i++)
-//        cu.addAutoclearBuffer(longEnergyDerivs->getDevicePointer(), 2*longEnergyDerivs->getSize());
+            perParticleValueArgs.push_back(&params->getBuffers()[i].getMemory());
-//    else {
+        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++)
-//        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
+            perParticleValueArgs.push_back(&computedValues->getBuffers()[i].getMemory());
-//            const CudaNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
+        if (tabulatedFunctionParams != NULL) {
-//            cu.addAutoclearBuffer(buffer.getMemory(), buffer.getSize()*energyDerivs->getNumObjects()/sizeof(cl_float));
+            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
-//        }
+                perParticleValueArgs.push_back(&tabulatedFunctions[i]->getDevicePointer());
-//    }
+            perParticleValueArgs.push_back(&tabulatedFunctionParams->getDevicePointer());
-//}
+        }
-//
+        pairEnergyArgs.push_back(&cu.getForce().getDevicePointer());
-//double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+        pairEnergyArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
-//    bool deviceIsCpu = (cu.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
+        pairEnergyArgs.push_back(&cu.getPosq().getDevicePointer());
-//    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
+        pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusions().getDevicePointer());
-//    if (!hasInitializedKernels) {
+        pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusionIndices().getDevicePointer());
-//        hasInitializedKernels = true;
+        pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusionRowIndices().getDevicePointer());
-//        maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0);
+        if (nb.getUseCutoff()) {
-//        bool useLong = (cu.getSupports64BitGlobalAtomics() && !deviceIsCpu);
+            pairEnergyArgs.push_back(&nb.getInteractingTiles().getDevicePointer());
-//        if (useLong) {
+            pairEnergyArgs.push_back(&nb.getInteractionCount().getDevicePointer());
-//            longValueBuffers = new CudaArray<cl_long>(cu, cu.getPaddedNumAtoms(), "customGBLongValueBuffers");
+            pairEnergyArgs.push_back(cu.getPeriodicBoxSizePointer());
-//            cu.addAutoclearBuffer(longValueBuffers->getDevicePointer(), 2*longValueBuffers->getSize());
+            pairEnergyArgs.push_back(cu.getInvPeriodicBoxSizePointer());
-//            cu.clearBuffer(longValueBuffers->getDevicePointer(), 2*longValueBuffers->getSize());
+            pairEnergyArgs.push_back(&maxTiles);
-//        }
+            pairEnergyArgs.push_back(&nb.getInteractionFlags().getDevicePointer());
-//        else {
+        }
-//            valueBuffers = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms()*nb.getNumForceBuffers(), "customGBValueBuffers");
+        else
-//            cu.addAutoclearBuffer(valueBuffers->getDevicePointer(), valueBuffers->getSize());
+            pairEnergyArgs.push_back(&maxTiles);
-//            cu.clearBuffer(*valueBuffers);
+        if (globals != NULL)
-//        }
+            pairEnergyArgs.push_back(&globals->getDevicePointer());
-//        int index = 0;
+        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-//        pairValueKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
+            if (pairEnergyUsesParam[i])
-//        pairValueKernel.setArg(index++, (deviceIsCpu ? CudaContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
+                pairEnergyArgs.push_back(&params->getBuffers()[i].getMemory());
-//        pairValueKernel.setArg<cu::Buffer>(index++, cu.getNonbondedUtilities().getExclusions().getDevicePointer());
+        }
-//        pairValueKernel.setArg<cu::Buffer>(index++, cu.getNonbondedUtilities().getExclusionIndices().getDevicePointer());
+        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
-//        pairValueKernel.setArg<cu::Buffer>(index++, cu.getNonbondedUtilities().getExclusionRowIndices().getDevicePointer());
+            if (pairEnergyUsesValue[i])
-//        pairValueKernel.setArg<cu::Buffer>(index++, useLong ? longValueBuffers->getDevicePointer() : valueBuffers->getDevicePointer());
+                pairEnergyArgs.push_back(&computedValues->getBuffers()[i].getMemory());
-//        pairValueKernel.setArg(index++, (deviceIsCpu ? CudaContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
+        }
-//        /// \todo Eliminate this argument and make local to the kernel. For *_default.cu kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cu.
+        pairEnergyArgs.push_back(&longEnergyDerivs->getDevicePointer());
-//        /// \todo Also make the previous __local argument local as was done with nonbonded_default.cu.
+        if (tabulatedFunctionParams != NULL) {
-//        pairValueKernel.setArg(index++, (deviceIsCpu ? CudaContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
+            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
-//        if (nb.getUseCutoff()) {
+                pairEnergyArgs.push_back(&tabulatedFunctions[i]->getDevicePointer());
-//            pairValueKernel.setArg<cu::Buffer>(index++, nb.getInteractingTiles().getDevicePointer());
+            pairEnergyArgs.push_back(&tabulatedFunctionParams->getDevicePointer());
-//            pairValueKernel.setArg<cu::Buffer>(index++, nb.getInteractionCount().getDevicePointer());
+        }
-//            index += 2; // Periodic box size arguments are set when the kernel is executed.
+        perParticleEnergyArgs.push_back(&cu.getForce().getDevicePointer());
-//            pairValueKernel.setArg<cl_uint>(index++, maxTiles);
+        perParticleEnergyArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
-//            if (cu.getSIMDWidth() == 32 || deviceIsCpu)
+        perParticleEnergyArgs.push_back(&cu.getPosq().getDevicePointer());
-//                pairValueKernel.setArg<cu::Buffer>(index++, nb.getInteractionFlags().getDevicePointer());
+        if (globals != NULL)
-//        }
+            perParticleEnergyArgs.push_back(&globals->getDevicePointer());
-//        else
+        for (int i = 0; i < (int) params->getBuffers().size(); i++)
-//            pairValueKernel.setArg<cl_uint>(index++, cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
+            perParticleEnergyArgs.push_back(&params->getBuffers()[i].getMemory());
-//        if (globals != NULL)
+        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++)
-//            pairValueKernel.setArg<cu::Buffer>(index++, globals->getDevicePointer());
+            perParticleEnergyArgs.push_back(&computedValues->getBuffers()[i].getMemory());
-//        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
+        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++)
-//            if (pairValueUsesParam[i]) {
+            perParticleEnergyArgs.push_back(&energyDerivs->getBuffers()[i].getMemory());
-//                const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+        perParticleEnergyArgs.push_back(&longEnergyDerivs->getDevicePointer());
-//                pairValueKernel.setArg<cu::Memory>(index++, buffer.getMemory());
+        if (tabulatedFunctionParams != NULL) {
-//                pairValueKernel.setArg(index++, (deviceIsCpu ? CudaContext::TileSize : nb.getForceThreadBlockSize())*buffer.getSize(), NULL);
+            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
-//            }
+                perParticleEnergyArgs.push_back(&tabulatedFunctions[i]->getDevicePointer());
-//        }
+            perParticleEnergyArgs.push_back(&tabulatedFunctionParams->getDevicePointer());
-//        if (tabulatedFunctionParams != NULL) {
+        }
-//            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
+        if (needParameterGradient) {
-//                pairValueKernel.setArg<cu::Buffer>(index++, tabulatedFunctions[i]->getDevicePointer());
+            gradientChainRuleArgs.push_back(&cu.getForce().getDevicePointer());
-//            pairValueKernel.setArg<cu::Buffer>(index++, tabulatedFunctionParams->getDevicePointer());
+            gradientChainRuleArgs.push_back(&cu.getPosq().getDevicePointer());
-//        }
+            if (globals != NULL)
-//        index = 0;
+                gradientChainRuleArgs.push_back(&globals->getDevicePointer());
-//        perParticleValueKernel.setArg<cl_int>(index++, cu.getPaddedNumAtoms());
+            for (int i = 0; i < (int) params->getBuffers().size(); i++)
-//        perParticleValueKernel.setArg<cl_int>(index++, nb.getNumForceBuffers());
+                gradientChainRuleArgs.push_back(&params->getBuffers()[i].getMemory());
-//        perParticleValueKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
+            for (int i = 0; i < (int) computedValues->getBuffers().size(); i++)
-//        perParticleValueKernel.setArg<cu::Buffer>(index++, useLong ? longValueBuffers->getDevicePointer() : valueBuffers->getDevicePointer());
+                gradientChainRuleArgs.push_back(&computedValues->getBuffers()[i].getMemory());
-//        if (globals != NULL)
+            for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++)
-//            perParticleValueKernel.setArg<cu::Buffer>(index++, globals->getDevicePointer());
+                gradientChainRuleArgs.push_back(&energyDerivs->getBuffers()[i].getMemory());
-//        for (int i = 0; i < (int) params->getBuffers().size(); i++)
+        }
-//            perParticleValueKernel.setArg<cu::Memory>(index++, params->getBuffers()[i].getMemory());
+    }
-//        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++)
+    if (globals != NULL) {
-//            perParticleValueKernel.setArg<cu::Memory>(index++, computedValues->getBuffers()[i].getMemory());
+        bool changed = false;
-//        if (tabulatedFunctionParams != NULL) {
+        for (int i = 0; i < (int) globalParamNames.size(); i++) {
-//            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
+            float value = (float) context.getParameter(globalParamNames[i]);
-//                perParticleValueKernel.setArg<cu::Buffer>(index++, tabulatedFunctions[i]->getDevicePointer());
+            if (value != globalParamValues[i])
-//            perParticleValueKernel.setArg<cu::Buffer>(index++, tabulatedFunctionParams->getDevicePointer());
+                changed = true;
-//        }
+            globalParamValues[i] = value;
-//        index = 0;
+        }
-//        pairEnergyKernel.setArg<cu::Buffer>(index++, useLong ? cu.getLongForceBuffer().getDevicePointer() : cu.getForce().getDevicePointer());
+        if (changed)
-//        pairEnergyKernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
+            globals->upload(globalParamValues);
-//        pairEnergyKernel.setArg(index++, (deviceIsCpu ? CudaContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
+    }
-//        pairEnergyKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
+    if (nb.getUseCutoff()) {
-//        pairEnergyKernel.setArg(index++, (deviceIsCpu ? CudaContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
+        if (maxTiles < nb.getInteractingTiles().getSize()) {
-//        pairEnergyKernel.setArg<cu::Buffer>(index++, cu.getNonbondedUtilities().getExclusions().getDevicePointer());
+            maxTiles = nb.getInteractingTiles().getSize();
-//        pairEnergyKernel.setArg<cu::Buffer>(index++, cu.getNonbondedUtilities().getExclusionIndices().getDevicePointer());
+            pairValueArgs[5] = &nb.getInteractingTiles().getDevicePointer();
-//        pairEnergyKernel.setArg<cu::Buffer>(index++, cu.getNonbondedUtilities().getExclusionRowIndices().getDevicePointer());
+            pairEnergyArgs[6] = &nb.getInteractingTiles().getDevicePointer();
-//        /// \todo Eliminate this argument and make local to the kernel. For *_default.cu kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cu.
+            pairValueArgs[10] = &nb.getInteractionFlags().getDevicePointer();
-//        /// \todo Also make the previous __local argument local as was done with nonbonded_default.cu.
+            pairEnergyArgs[11] = &nb.getInteractionFlags().getDevicePointer();
-//        pairEnergyKernel.setArg(index++, (deviceIsCpu ? CudaContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
+        }
-//        if (nb.getUseCutoff()) {
+    }
-//            pairEnergyKernel.setArg<cu::Buffer>(index++, nb.getInteractingTiles().getDevicePointer());
+    cu.executeKernel(pairValueKernel, &pairValueArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
-//            pairEnergyKernel.setArg<cu::Buffer>(index++, nb.getInteractionCount().getDevicePointer());
+    cu.executeKernel(perParticleValueKernel, &perParticleValueArgs[0], cu.getPaddedNumAtoms());
-//            index += 2; // Periodic box size arguments are set when the kernel is executed.
+    cu.executeKernel(pairEnergyKernel, &pairEnergyArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
-//            pairEnergyKernel.setArg<cl_uint>(index++, maxTiles);
+    cu.executeKernel(perParticleEnergyKernel, &perParticleEnergyArgs[0], cu.getPaddedNumAtoms());
-//            if (cu.getSIMDWidth() == 32 || deviceIsCpu)
+    if (needParameterGradient)
-//                pairEnergyKernel.setArg<cu::Buffer>(index++, nb.getInteractionFlags().getDevicePointer());
+        cu.executeKernel(gradientChainRuleKernel, &gradientChainRuleArgs[0], cu.getPaddedNumAtoms());
-//        }
+    return 0.0;
-//        else
+}
-//            pairEnergyKernel.setArg<cl_uint>(index++, cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
-//        if (globals != NULL)
+void CudaCalcCustomGBForceKernel::copyParametersToContext(ContextImpl& context, const CustomGBForce& force) { // re-reads every particle's parameters from force into params; the context argument is not read in this body
-//            pairEnergyKernel.setArg<cu::Buffer>(index++, globals->getDevicePointer());
+    cu.setAsCurrent(); // select cu as the current context before doing anything else
-//        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
+    int numParticles = force.getNumParticles();
-//            if (pairEnergyUsesParam[i]) {
+    if (numParticles != cu.getNumAtoms()) // the particle count must still equal cu.getNumAtoms(), otherwise an OpenMMException is thrown
-//                const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+        throw OpenMMException("updateParametersInContext: The number of particles has changed");
-//                pairEnergyKernel.setArg<cu::Memory>(index++, buffer.getMemory());
-//                pairEnergyKernel.setArg(index++, (deviceIsCpu ? CudaContext::TileSize : nb.getForceThreadBlockSize())*buffer.getSize(), NULL);
+    // Record the per-particle parameters.
-//            }
-//        }
+    vector<vector<float> > paramVector(numParticles); // one row of float values per particle
-//        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
+    vector<double> parameters; // scratch vector, refilled by getParticleParameters() on each iteration
-//            if (pairEnergyUsesValue[i]) {
+    for (int i = 0; i < numParticles; i++) {
-//                const CudaNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
+        force.getParticleParameters(i, parameters);
-//                pairEnergyKernel.setArg<cu::Memory>(index++, buffer.getMemory());
+        paramVector[i].resize(parameters.size());
-//                pairEnergyKernel.setArg(index++, (deviceIsCpu ? CudaContext::TileSize : nb.getForceThreadBlockSize())*buffer.getSize(), NULL);
+        for (int j = 0; j < (int) parameters.size(); j++)
-//            }
+            paramVector[i][j] = (float) parameters[j]; // the force reports doubles; they are narrowed to float here
-//        }
+    }
-//        if (useLong) {
+    params->setParameterValues(paramVector); // hand the converted rows to the parameter set
-//            pairEnergyKernel.setArg<cu::Memory>(index++, longEnergyDerivs->getDevicePointer());
-//            for (int i = 0; i < numComputedValues; ++i)
+    // Mark that the current reordering may be invalid.
-//                pairEnergyKernel.setArg(index++, nb.getForceThreadBlockSize()*sizeof(cl_float), NULL);
-//        }
+    cu.invalidateMolecules();
-//        else {
+}
-//            for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
-//                const CudaNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
-//                pairEnergyKernel.setArg<cu::Memory>(index++, buffer.getMemory());
-//                pairEnergyKernel.setArg(index++, (deviceIsCpu ? CudaContext::TileSize : nb.getForceThreadBlockSize())*buffer.getSize(), NULL);
-//            }
-//        }
-//        if (tabulatedFunctionParams != NULL) {
-//            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
-//                pairEnergyKernel.setArg<cu::Buffer>(index++, tabulatedFunctions[i]->getDevicePointer());
-//            pairEnergyKernel.setArg<cu::Buffer>(index++, tabulatedFunctionParams->getDevicePointer());
-//        }
-//        index = 0;
-//        perParticleEnergyKernel.setArg<cl_int>(index++, cu.getPaddedNumAtoms());
-//        perParticleEnergyKernel.setArg<cl_int>(index++, nb.getNumForceBuffers());
-//        perParticleEnergyKernel.setArg<cu::Buffer>(index++, cu.getForce().getDevicePointer());
-//        perParticleEnergyKernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
-//        perParticleEnergyKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
-//        if (globals != NULL)
-//            perParticleEnergyKernel.setArg<cu::Buffer>(index++, globals->getDevicePointer());
-//        for (int i = 0; i < (int) params->getBuffers().size(); i++)
-//            perParticleEnergyKernel.setArg<cu::Memory>(index++, params->getBuffers()[i].getMemory());
-//        for (int i = 0; i < (int) computedValues->getBuffers().size(); i++)
-//            perParticleEnergyKernel.setArg<cu::Memory>(index++, computedValues->getBuffers()[i].getMemory());
-//        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++)
-//            perParticleEnergyKernel.setArg<cu::Memory>(index++, energyDerivs->getBuffers()[i].getMemory());
-//        if (useLong)
-//            perParticleEnergyKernel.setArg<cu::Memory>(index++, longEnergyDerivs->getDevicePointer());
-//        if (tabulatedFunctionParams != NULL) {
-//            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
-//                perParticleEnergyKernel.setArg<cu::Buffer>(index++, tabulatedFunctions[i]->getDevicePointer());
-//            perParticleEnergyKernel.setArg<cu::Buffer>(index++, tabulatedFunctionParams->getDevicePointer());
-//        }
-//        if (needParameterGradient) {
-//            index = 0;
-//            gradientChainRuleKernel.setArg<cu::Buffer>(index++, cu.getForce().getDevicePointer());
-//            gradientChainRuleKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
-//            if (globals != NULL)
-//                gradientChainRuleKernel.setArg<cu::Buffer>(index++, globals->getDevicePointer());
-//            for (int i = 0; i < (int) params->getBuffers().size(); i++)
-//                gradientChainRuleKernel.setArg<cu::Memory>(index++, params->getBuffers()[i].getMemory());
-//            for (int i = 0; i < (int) computedValues->getBuffers().size(); i++)
-//                gradientChainRuleKernel.setArg<cu::Memory>(index++, computedValues->getBuffers()[i].getMemory());
-//            for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++)
-//                gradientChainRuleKernel.setArg<cu::Memory>(index++, energyDerivs->getBuffers()[i].getMemory());
-//        }
-//    }
-//    if (globals != NULL) {
-//        bool changed = false;
-//        for (int i = 0; i < (int) globalParamNames.size(); i++) {
-//            cl_float value = (cl_float) context.getParameter(globalParamNames[i]);
-//            if (value != globalParamValues[i])
-//                changed = true;
-//            globalParamValues[i] = value;
-//        }
-//        if (changed)
-//            globals->upload(globalParamValues);
-//    }
-//    if (nb.getUseCutoff()) {
-//        pairValueKernel.setArg<mm_float4>(10, cu.getPeriodicBoxSize());
-//        pairValueKernel.setArg<mm_float4>(11, cu.getInvPeriodicBoxSize());
-//        pairEnergyKernel.setArg<mm_float4>(11, cu.getPeriodicBoxSize());
-//        pairEnergyKernel.setArg<mm_float4>(12, cu.getInvPeriodicBoxSize());
-//        if (maxTiles < nb.getInteractingTiles().getSize()) {
-//            maxTiles = nb.getInteractingTiles().getSize();
-//            pairValueKernel.setArg<cu::Buffer>(8, nb.getInteractingTiles().getDevicePointer());
-//            pairValueKernel.setArg<cl_uint>(12, maxTiles);
-//            pairEnergyKernel.setArg<cu::Buffer>(9, nb.getInteractingTiles().getDevicePointer());
-//            pairEnergyKernel.setArg<cl_uint>(13, maxTiles);
-//            if (cu.getSIMDWidth() == 32 || deviceIsCpu) {
-//                pairValueKernel.setArg<cu::Buffer>(13, nb.getInteractionFlags().getDevicePointer());
-//                pairEnergyKernel.setArg<cu::Buffer>(14, nb.getInteractionFlags().getDevicePointer());
-//            }
-//        }
-//    }
-//    cu.executeKernel(pairValueKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
-//    cu.executeKernel(perParticleValueKernel, cu.getPaddedNumAtoms());
-//    cu.executeKernel(pairEnergyKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
-//    cu.executeKernel(perParticleEnergyKernel, cu.getPaddedNumAtoms());
-//    if (needParameterGradient)
-//        cu.executeKernel(gradientChainRuleKernel, cu.getPaddedNumAtoms());
-//    return 0.0;
-//}
-//
-//void CudaCalcCustomGBForceKernel::copyParametersToContext(ContextImpl& context, const CustomGBForce& force) {
-//    cu.setAsCurrent();
-//    int numParticles = force.getNumParticles();
-//    if (numParticles != cu.getNumAtoms())
-//        throw OpenMMException("updateParametersInContext: The number of particles has changed");
-//    
-//    // Record the per-particle parameters.
-//    
-//    vector<vector<cl_float> > paramVector(numParticles);
-//    vector<double> parameters;
-//    for (int i = 0; i < numParticles; i++) {
-//        force.getParticleParameters(i, parameters);
-//        paramVector[i].resize(parameters.size());
-//        for (int j = 0; j < (int) parameters.size(); j++)
-//            paramVector[i][j] = (cl_float) parameters[j];
-//    }
-//    params->setParameterValues(paramVector);
-//    
-//    // Mark that the current reordering may be invalid.
-//    
-//    cu.invalidateMolecules();
-//}
 class CudaCustomExternalForceInfo : public CudaForceInfo {
 public:
@@ -3473,12 +3368,12 @@ void CudaCalcCustomHbondForceKernel::initialize(const System& system, const Cust
    if (force.getNumGlobalParameters() > 0)
        extraArgs << ", const float* __restrict__ globals";
    for (int i = 0; i < (int) donorParams->getBuffers().size(); i++) {
-        const CudaNonbondedUtilities::ParameterInfo& buffer = donorParams->getBuffers()[i];
+        CudaNonbondedUtilities::ParameterInfo& buffer = donorParams->getBuffers()[i];
        extraArgs << ", const "+buffer.getType()+"* __restrict__ donor"+buffer.getName();
        addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" donorParams"+cu.intToString(i+1)+" = donor"+buffer.getName()+"[index];\n");
    }
    for (int i = 0; i < (int) acceptorParams->getBuffers().size(); i++) {
-        const CudaNonbondedUtilities::ParameterInfo& buffer = acceptorParams->getBuffers()[i];
+        CudaNonbondedUtilities::ParameterInfo& buffer = acceptorParams->getBuffers()[i];
        extraArgs << ", const "+buffer.getType()+"* __restrict__ acceptor"+buffer.getName();
        addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" acceptorParams"+cu.intToString(i+1)+" = acceptor"+buffer.getName()+"[index];\n");
    }

--- a/platforms/cuda2/src/CudaKernels.h
+++ b/platforms/cuda2/src/CudaKernels.h
@@ -715,58 +715,58 @@ private:
    std::vector<void*> computeSumArgs, force1Args;
 };
-///**
+/**
-// * This kernel is invoked by CustomGBForce to calculate the forces acting on the system.
+ * This kernel is invoked by CustomGBForce to calculate the forces acting on the system.
-// */
+ */
-//class CudaCalcCustomGBForceKernel : public CalcCustomGBForceKernel {
+class CudaCalcCustomGBForceKernel : public CalcCustomGBForceKernel { // CUDA-platform subclass of CalcCustomGBForceKernel
-//public:
+public:
-//    CudaCalcCustomGBForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomGBForceKernel(name, platform),
+    CudaCalcCustomGBForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomGBForceKernel(name, platform),
-//            hasInitializedKernels(false), cu(cu), params(NULL), computedValues(NULL), energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL),
+            hasInitializedKernels(false), cu(cu), params(NULL), computedValues(NULL), energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL),
-//            valueBuffers(NULL), longValueBuffers(NULL), tabulatedFunctionParams(NULL), system(system) {
+            valueBuffers(NULL), tabulatedFunctionParams(NULL), system(system) { // every pointer member starts NULL; needParameterGradient, maxTiles and numComputedValues are not initialized by this constructor
-//    }
+    }
-//    ~CudaCalcCustomGBForceKernel();
+    ~CudaCalcCustomGBForceKernel();
-//    /**
+    /**
-//     * Initialize the kernel.
+     * Initialize the kernel.
-//     *
+     *
-//     * @param system     the System this kernel will be applied to
+     * @param system     the System this kernel will be applied to
-//     * @param force      the CustomGBForce this kernel will be used for
+     * @param force      the CustomGBForce this kernel will be used for
-//     */
+     */
-//    void initialize(const System& system, const CustomGBForce& force);
+    void initialize(const System& system, const CustomGBForce& force);
-//    /**
+    /**
-//     * Execute the kernel to calculate the forces and/or energy.
+     * Execute the kernel to calculate the forces and/or energy.
-//     *
+     *
-//     * @param context        the context in which to execute this kernel
+     * @param context        the context in which to execute this kernel
-//     * @param includeForces  true if forces should be calculated
+     * @param includeForces  true if forces should be calculated
-//     * @param includeEnergy  true if the energy should be calculated
+     * @param includeEnergy  true if the energy should be calculated
-//     * @return the potential energy due to the force
+     * @return the potential energy due to the force
-//     */
+     */
-//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-//    /**
+    /**
-//     * Copy changed parameters over to a context.
+     * Copy changed parameters over to a context.
-//     *
+     *
-//     * @param context    the context to copy parameters to
+     * @param context    the context to copy parameters to
-//     * @param force      the CustomGBForce to copy the parameters from
+     * @param force      the CustomGBForce to copy the parameters from
-//     */
+     */
-//    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
+    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
-//private:
+private:
-//    bool hasInitializedKernels, needParameterGradient;
+    bool hasInitializedKernels, needParameterGradient; // needParameterGradient decides whether execute() launches gradientChainRuleKernel
-//    int maxTiles, numComputedValues;
+    int maxTiles, numComputedValues; // execute() passes the address of maxTiles to the pair kernels and raises it when the interacting-tiles array is larger
-//    CudaContext& cu;
+    CudaContext& cu; // context used to launch the kernels and to reach the posq, force and energy buffers
-//    CudaParameterSet* params;
+    CudaParameterSet* params; // per-particle parameters; rewritten by copyParametersToContext()
-//    CudaParameterSet* computedValues;
+    CudaParameterSet* computedValues; // its buffers are passed to the energy and chain-rule kernels in execute()
-//    CudaParameterSet* energyDerivs;
+    CudaParameterSet* energyDerivs; // its buffers are passed to perParticleEnergyKernel and gradientChainRuleKernel
-//    CudaArray<cl_long>* longEnergyDerivs;
+    CudaArray* longEnergyDerivs; // passed to both pairEnergyKernel and perParticleEnergyKernel
-//    CudaArray<cl_float>* globals;
+    CudaArray* globals; // may be NULL; otherwise execute() re-uploads globalParamValues into it whenever a value changed
-//    CudaArray<cl_float>* valueBuffers;
+    CudaArray* valueBuffers; // presumably receives pairValueKernel output - TODO confirm against initialize()
-//    CudaArray<cl_long>* longValueBuffers;
+    CudaArray* tabulatedFunctionParams; // may be NULL; when set it is passed to the kernels right after the tabulatedFunctions arrays
-//    CudaArray<mm_float4>* tabulatedFunctionParams;
+    std::vector<std::string> globalParamNames; // names queried with context.getParameter() on every execute()
-//    std::vector<std::string> globalParamNames;
+    std::vector<float> globalParamValues; // last values seen, stored as float and compared to detect changes
-//    std::vector<cl_float> globalParamValues;
+    std::vector<CudaArray*> tabulatedFunctions; // one device array per tabulated function, passed to the kernels in order
-//    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
+    std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue; // pairEnergyUsesParam / pairEnergyUsesValue pick which params / computedValues buffers pairEnergyKernel receives; pairValueUsesParam presumably does the same for pairValueKernel - confirm
-//    std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue;
+    System& system; // reference captured from the constructor argument
-//    System& system;
+    CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel; // execute() launches them in this order, the last one only when needParameterGradient is set
-//    CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
+    std::vector<void*> pairValueArgs, perParticleValueArgs, pairEnergyArgs, perParticleEnergyArgs, gradientChainRuleArgs; // argument pointer lists, one per kernel, handed to cu.executeKernel()
-//};
+};
 /**
 * This kernel is invoked by CustomExternalForce to calculate the forces acting on the system and the energy of the system.

--- a/platforms/cuda2/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda2/src/CudaNonbondedUtilities.cpp
@@ -52,7 +52,7 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
    numForceThreadBlocks = 2*multiprocessors;
-    forceThreadBlockSize = 256;
+    forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);
 }
 CudaNonbondedUtilities::~CudaNonbondedUtilities() {
@@ -441,8 +441,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
    if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
-    string file;
+    CUmodule program = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::nonbonded, replacements), defines);
-    CUmodule program = context.createModule(context.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::nonbonded, replacements), defines);
    CUfunction kernel = context.getKernel(program, "computeNonbonded");
    // Set arguments to the Kernel.

--- a/platforms/cuda2/src/kernels/customGBChainRule.cu
+++ b/platforms/cuda2/src/kernels/customGBChainRule.cu
+#ifdef USE_CUTOFF
+if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED) { // Only distinct atoms with indices below NUM_ATOMS and within the cutoff contribute.
+#else
+if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { // Only distinct atoms with indices below NUM_ATOMS contribute.
+#endif
+#ifdef USE_SYMMETRIC
+    real tempForce = 0; // Scalar accumulator; it is scaled by invR when added to dEdR below.
+#else
+    real3 tempForce1 = make_real3(0); // Vector accumulator that is added to dEdR1 below.
+    real3 tempForce2 = make_real3(0); // Vector accumulator that is added to dEdR2 below.
+#endif
+    COMPUTE_FORCE
+#ifdef USE_SYMMETRIC
+    dEdR += tempForce*invR; // atom1, atom2, r2, invR and dEdR are not declared here; presumably they come from the kernel this snippet is inserted into -- TODO confirm.
+#else
+    dEdR1 += tempForce1;
+    dEdR2 += tempForce2;
+#endif
+}
--- a/platforms/cuda2/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda2/src/kernels/customGBEnergyN2.cu
+// Derivatives are accumulated in 64 bit fixed point: each value is scaled by 0xFFFFFFFF and added atomically.
+#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0xFFFFFFFF)));
+#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0xFFFFFFFF)));
+#define TILE_SIZE 32
+// Shared memory record holding the data of one atom from the second block (y) of a tile.
+typedef struct {
+    real4 posq;
+    real3 force;
+    ATOM_PARAMETER_DATA
+#ifdef NEED_PADDING
+    float padding;
+#endif
+} AtomData;
+/**
+ * Compute a force based on pair interactions.
+ *
+ * The pair interactions are divided into tiles of TILE_SIZE x TILE_SIZE atoms.  Each group of
+ * TILE_SIZE consecutive threads processes a contiguous range [pos, end) of tiles, one tile per
+ * loop iteration.  Forces are added to forceBuffers with atomicAdd as 64 bit fixed point numbers
+ * (value times 0xFFFFFFFF), stored as three consecutive arrays of PADDED_NUM_ATOMS elements.
+ * Each thread adds the energy it accumulated to its own element of energyBuffer, which must
+ * therefore contain at least blockDim.x*gridDim.x elements.
+ *
+ * The shared arrays are dimensioned with THREAD_BLOCK_SIZE and WARPS_PER_GROUP, so the kernel
+ * must not be launched with more threads per block than those constants allow.
+ */
+extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
+        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const unsigned int* __restrict__ exclusionIndices,
+        const unsigned int* __restrict__ exclusionRowIndices,
+#ifdef USE_CUTOFF
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
+#else
+        unsigned int numTiles
+#endif
+        PARAMETER_ARGUMENTS) {
+    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    // Split the tiles evenly between the warps.  The products are evaluated with 64 bit integers,
+    // because warp*numTiles (and NUM_BLOCKS*(NUM_BLOCKS+1)) can exceed 32 bits for large systems.
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    // If the tile list overflowed (numTiles > maxTiles), fall back to processing every tile.
+    unsigned int pos = (unsigned int) (warp*(numTiles > maxTiles ? (unsigned long long) NUM_BLOCKS*(NUM_BLOCKS+1)/2 : (unsigned long long) numTiles)/totalWarps);
+    unsigned int end = (unsigned int) ((warp+1)*(numTiles > maxTiles ? (unsigned long long) NUM_BLOCKS*(NUM_BLOCKS+1)/2 : (unsigned long long) numTiles)/totalWarps);
+#else
+    unsigned int pos = (unsigned int) (warp*(unsigned long long) numTiles/totalWarps);
+    unsigned int end = (unsigned int) ((warp+1)*(unsigned long long) numTiles/totalWarps);
+#endif
+    real energy = 0;
+    unsigned int lasty = 0xFFFFFFFF;
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
+    __shared__ int exclusionIndex[WARPS_PER_GROUP];
+    do {
+        // Extract the coordinates of this tile
+        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+        const unsigned int tbx = threadIdx.x - tgx;
+        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
+        unsigned int x, y;
+        real3 force = make_real3(0);
+        DECLARE_ATOM1_DERIVATIVES
+        if (pos < end) {
+#ifdef USE_CUTOFF
+            if (numTiles <= maxTiles) {
+                ushort2 tileIndices = tiles[pos];
+                x = tileIndices.x;
+                y = tileIndices.y;
+            }
+            else
+#endif
+            {
+                // Convert the linear tile index into (x, y) block indices with x >= y.
+                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                    y += (x < y ? -1 : 1);
+                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                }
+            }
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+            // Locate the exclusion data for this tile.
+#ifdef USE_EXCLUSIONS
+            if (tgx < 2)
+                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
+            if (tgx == 0)
+                exclusionIndex[localGroupIndex] = -1;
+            // NOTE(review): these shared values are read by other threads of the warp without an
+            // explicit barrier, which relies on the threads of a warp executing in lockstep.
+            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
+                if (exclusionIndices[i] == y)
+                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
+            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
+#else
+            bool hasExclusions = false;
+#endif
+            if (pos >= end)
+                ; // This warp is done.
+            else if (x == y) {
+                // This tile is on the diagonal.
+                const unsigned int localAtomIndex = threadIdx.x;
+                localData[localAtomIndex].posq = posq1;
+                LOAD_LOCAL_PARAMETERS_FROM_1
+#ifdef USE_EXCLUSIONS
+                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
+#endif
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(excl & 0x1);
+#endif
+                    int atom2 = tbx+j;
+                    real4 posq2 = localData[atom2].posq;
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = RECIP(invR);
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+j;
+                    real dEdR = 0;
+                    real tempEnergy = 0;
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+                        COMPUTE_INTERACTION
+                        dEdR /= -r;
+                    }
+                    // Every pair of a diagonal tile is visited twice (once from each atom), so count half of the energy each time.
+                    energy += 0.5f*tempEnergy;
+                    delta *= dEdR;
+                    force -= delta;
+#ifdef USE_CUTOFF
+                    }
+#endif
+#ifdef USE_EXCLUSIONS
+                    excl >>= 1;
+#endif
+                }
+            }
+            else {
+                // This is an off-diagonal tile.
+                const unsigned int localAtomIndex = threadIdx.x;
+                // The data of block y only has to be reloaded if the previous tile used a different y.
+                if (lasty != y) {
+                    unsigned int j = y*TILE_SIZE + tgx;
+                    localData[localAtomIndex].posq = posq[j];
+                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                }
+                localData[localAtomIndex].force = make_real3(0);
+                CLEAR_LOCAL_DERIVATIVES
+#ifdef USE_CUTOFF
+                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+                if (!hasExclusions && flags == 0) {
+                    // No interactions in this tile.
+                }
+                else
+#endif
+                {
+                    // Compute the full set of interactions in this tile.
+#ifdef USE_EXCLUSIONS
+                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
+                    // Rotate the flags so that bit 0 belongs to the first atom this thread visits (tj = tgx).
+                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+#endif
+                    // Each thread starts at a different atom of block y, so no two threads of the
+                    // warp update the same shared memory entry during one iteration.
+                    unsigned int tj = tgx;
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = !(excl & 0x1);
+#endif
+                        int atom2 = tbx+tj;
+                        real4 posq2 = localData[atom2].posq;
+                        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                        if (r2 < CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = y*TILE_SIZE+tj;
+                        real dEdR = 0;
+                        real tempEnergy = 0;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_INTERACTION
+                            dEdR /= -r;
+                        }
+                        energy += tempEnergy;
+                        delta *= dEdR;
+                        force -= delta;
+                        atom2 = tbx+tj;
+                        localData[atom2].force += delta;
+                        RECORD_DERIVATIVE_2
+#ifdef USE_CUTOFF
+                        }
+#endif
+#ifdef USE_EXCLUSIONS
+                        excl >>= 1;
+#endif
+                        tj = (tj + 1) & (TILE_SIZE - 1);
+                    }
+                }
+            }
+        }
+        lasty = y;
+        // Write results.
+        if (pos < end) {
+            const unsigned int offset = x*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF)));
+            STORE_DERIVATIVES_1
+        }
+        if (pos < end && x != y) {
+            // For off-diagonal tiles, also write the forces accumulated in shared memory for the atoms of block y.
+            const unsigned int offset = y*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0xFFFFFFFF)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0xFFFFFFFF)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0xFFFFFFFF)));
+            STORE_DERIVATIVES_2
+        }
+        pos++;
+    } while (pos < end);
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
--- a/platforms/cuda2/src/kernels/customGBEnergyPerParticle.cu
+++ b/platforms/cuda2/src/kernels/customGBEnergyPerParticle.cu
+/**
+ * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
+ *
+ * Each thread processes the atoms index, index+blockDim.x*gridDim.x, ... below NUM_ATOMS, so every
+ * atom is handled whatever the launch size.  The energy accumulated by a thread is added to that
+ * thread's own element of energyBuffer, which must therefore contain at least blockDim.x*gridDim.x
+ * elements.  The additional kernel arguments and the per-atom computations are not written in this
+ * file; they are supplied where the upper case placeholders appear.
+ */
+extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq
+        PARAMETER_ARGUMENTS) {
+    real energy = 0; // Running total for this thread; written out once after the loop.
+    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        // Load the derivatives
+        LOAD_DERIVATIVES
+        // Now calculate the per-particle energy terms.
+        real4 pos = posq[index];
+        real3 force = make_real3(0, 0, 0); // Not stored by any code visible here; presumably used by the inserted code -- TODO confirm.
+        COMPUTE_ENERGY
+    }
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
--- a/platforms/cuda2/src/kernels/customGBGradientChainRule.cu
+++ b/platforms/cuda2/src/kernels/customGBGradientChainRule.cu
+/**
+ * Compute chain rule terms for computed values that depend explicitly on particle coordinates.
+ *
+ * forceBuffers holds the x, y and z force components as three consecutive arrays of
+ * PADDED_NUM_ATOMS elements in 64 bit fixed point (value times 0xFFFFFFFF).  For every atom the
+ * force is converted to floating point, modified by the code inserted at the placeholder in the
+ * loop body, and written back in fixed point.  Each atom index is visited by exactly one thread,
+ * so plain stores are used for the write back.
+ */
+extern "C" __global__ void computeGradientChainRuleTerms(long long* __restrict__ forceBuffers, const real4* __restrict__ posq
+        PARAMETER_ARGUMENTS) {
+    const real scale = RECIP((real) 0xFFFFFFFF); // Factor that converts a fixed point force back to a real.
+    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        real4 pos = posq[index];
+        real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[index+PADDED_NUM_ATOMS], scale*forceBuffers[index+PADDED_NUM_ATOMS*2]);
+        COMPUTE_FORCES
+        forceBuffers[index] = (long long) (force.x*0xFFFFFFFF); // Overwrites (does not add to) the stored value.
+        forceBuffers[index+PADDED_NUM_ATOMS] = (long long) (force.y*0xFFFFFFFF);
+        forceBuffers[index+PADDED_NUM_ATOMS*2] = (long long) (force.z*0xFFFFFFFF);
+    }
+}
--- a/platforms/cuda2/src/kernels/customGBValueN2.cu
+++ b/platforms/cuda2/src/kernels/customGBValueN2.cu
+#define TILE_SIZE 32
+// Shared memory record holding the data of one atom from the second block (y) of a tile.
+// "value" accumulates the contributions to that atom, "temp" is scratch space for the warp reduction.
+typedef struct {
+    real4 posq;
+    real value, temp;
+    ATOM_PARAMETER_DATA
+#ifdef NEED_PADDING
+    float padding;
+#endif
+} AtomData;
+/**
+ * Compute a value based on pair interactions.
+ *
+ * The pair interactions are divided into tiles of TILE_SIZE x TILE_SIZE atoms.  Each group of
+ * TILE_SIZE consecutive threads processes a contiguous range [pos, end) of tiles, one tile per
+ * loop iteration.  The contributions to each atom are added to global_value with atomicAdd as
+ * 64 bit fixed point numbers (value times 0xFFFFFFFF).
+ *
+ * The shared arrays are dimensioned with THREAD_BLOCK_SIZE and WARPS_PER_GROUP, so the kernel
+ * must not be launched with more threads per block than those constants allow.
+ */
+extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
+        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, unsigned long long* __restrict__ global_value,
+#ifdef USE_CUTOFF
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
+#else
+        unsigned int numTiles
+#endif
+        PARAMETER_ARGUMENTS) {
+    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    // Split the tiles evenly between the warps.  The products are evaluated with 64 bit integers,
+    // because warp*numTiles (and NUM_BLOCKS*(NUM_BLOCKS+1)) can exceed 32 bits for large systems.
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    // If the tile list overflowed (numTiles > maxTiles), fall back to processing every tile.
+    unsigned int pos = (unsigned int) (warp*(numTiles > maxTiles ? (unsigned long long) NUM_BLOCKS*(NUM_BLOCKS+1)/2 : (unsigned long long) numTiles)/totalWarps);
+    unsigned int end = (unsigned int) ((warp+1)*(numTiles > maxTiles ? (unsigned long long) NUM_BLOCKS*(NUM_BLOCKS+1)/2 : (unsigned long long) numTiles)/totalWarps);
+#else
+    unsigned int pos = (unsigned int) (warp*(unsigned long long) numTiles/totalWarps);
+    unsigned int end = (unsigned int) ((warp+1)*(unsigned long long) numTiles/totalWarps);
+#endif
+    real energy = 0;
+    unsigned int lasty = 0xFFFFFFFF;
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
+    __shared__ int exclusionIndex[WARPS_PER_GROUP];
+    do {
+        // Extract the coordinates of this tile
+        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+        const unsigned int tbx = threadIdx.x - tgx;
+        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
+        unsigned int x, y;
+        real value = 0;
+        if (pos < end) {
+#ifdef USE_CUTOFF
+            if (numTiles <= maxTiles) {
+                ushort2 tileIndices = tiles[pos];
+                x = tileIndices.x;
+                y = tileIndices.y;
+            }
+            else
+#endif
+            {
+                // Convert the linear tile index into (x, y) block indices with x >= y.
+                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                    y += (x < y ? -1 : 1);
+                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                }
+            }
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+            // Locate the exclusion data for this tile.
+#ifdef USE_EXCLUSIONS
+            if (tgx < 2)
+                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
+            if (tgx == 0)
+                exclusionIndex[localGroupIndex] = -1;
+            // NOTE(review): these shared values are read by other threads of the warp without an
+            // explicit barrier, which relies on the threads of a warp executing in lockstep.
+            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
+                if (exclusionIndices[i] == y)
+                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
+            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
+#else
+            bool hasExclusions = false;
+#endif
+            if (pos >= end)
+                ; // This warp is done.
+            else if (x == y) {
+                // This tile is on the diagonal.
+                const unsigned int localAtomIndex = threadIdx.x;
+                localData[localAtomIndex].posq = posq1;
+                LOAD_LOCAL_PARAMETERS_FROM_1
+#ifdef USE_EXCLUSIONS
+                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
+#endif
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(excl & 0x1);
+#endif
+                    int atom2 = tbx+j;
+                    real4 posq2 = localData[atom2].posq;
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = RECIP(invR);
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+j;
+                    real tempValue1 = 0;
+                    real tempValue2 = 0;
+#ifdef USE_EXCLUSIONS
+                    if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+#else
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+#endif
+                        COMPUTE_VALUE
+                    }
+                    // On the diagonal every pair is visited from both atoms, so only the contribution to atom1 is kept.
+                    value += tempValue1;
+#ifdef USE_CUTOFF
+                    }
+#endif
+#ifdef USE_EXCLUSIONS
+                    excl >>= 1;
+#endif
+                }
+            }
+            else {
+                // This is an off-diagonal tile.
+                // The data of block y only has to be reloaded if the previous tile used a different y.
+                if (lasty != y) {
+                    unsigned int j = y*TILE_SIZE + tgx;
+                    localData[threadIdx.x].posq = posq[j];
+                    const unsigned int localAtomIndex = threadIdx.x;
+                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                }
+                localData[threadIdx.x].value = 0;
+#ifdef USE_CUTOFF
+                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+                if (!hasExclusions && flags != 0xFFFFFFFF) {
+                    if (flags == 0) {
+                        // No interactions in this tile.
+                    }
+                    else {
+                        // Compute only a subset of the interactions in this tile.
+                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                            if ((flags&(1<<j)) != 0) {
+                                int atom2 = tbx+j;
+                                real4 posq2 = localData[atom2].posq;
+                                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                                real tempValue1 = 0;
+                                real tempValue2 = 0;
+                                if (r2 < CUTOFF_SQUARED) {
+                                    real invR = RSQRT(r2);
+                                    real r = RECIP(invR);
+                                    LOAD_ATOM2_PARAMETERS
+                                    atom2 = y*TILE_SIZE+j;
+                                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                                        COMPUTE_VALUE
+                                    }
+                                    value += tempValue1;
+                                }
+                                localData[threadIdx.x].temp = tempValue2;
+                                // Sum the contributions to the value of atom2 over all threads of the warp.
+                                // NOTE(review): there is no explicit synchronization between writing temp and
+                                // reading the entries of other threads; this relies on the threads of a warp
+                                // executing in lockstep.
+                                if (tgx % 4 == 0)
+                                    localData[threadIdx.x].temp += localData[threadIdx.x+1].temp+localData[threadIdx.x+2].temp+localData[threadIdx.x+3].temp;
+                                if (tgx == 0)
+                                    localData[tbx+j].value += localData[threadIdx.x].temp+localData[threadIdx.x+4].temp+localData[threadIdx.x+8].temp+localData[threadIdx.x+12].temp+localData[threadIdx.x+16].temp+localData[threadIdx.x+20].temp+localData[threadIdx.x+24].temp+localData[threadIdx.x+28].temp;
+                            }
+                        }
+                    }
+                }
+                else
+#endif
+                {
+                    // Compute the full set of interactions in this tile.
+#ifdef USE_EXCLUSIONS
+                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
+                    // Rotate the flags so that bit 0 belongs to the first atom this thread visits (tj = tgx).
+                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+#endif
+                    // Each thread starts at a different atom of block y, so no two threads of the
+                    // warp update the same shared memory entry during one iteration.
+                    unsigned int tj = tgx;
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = !(excl & 0x1);
+#endif
+                        int atom2 = tbx+tj;
+                        real4 posq2 = localData[atom2].posq;
+                        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                        if (r2 < CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = y*TILE_SIZE+tj;
+                        real tempValue1 = 0;
+                        real tempValue2 = 0;
+#ifdef USE_EXCLUSIONS
+                        if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+#else
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+#endif
+                            COMPUTE_VALUE
+                        }
+                        value += tempValue1;
+                        localData[tbx+tj].value += tempValue2;
+#ifdef USE_CUTOFF
+                        }
+#endif
+#ifdef USE_EXCLUSIONS
+                        excl >>= 1;
+#endif
+                        tj = (tj + 1) & (TILE_SIZE - 1);
+                    }
+                }
+            }
+        }
+        // Write results.
+        if (pos < end) {
+            const unsigned int offset = x*TILE_SIZE + tgx;
+            atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0xFFFFFFFF)));
+        }
+        if (pos < end && x != y) {
+            // For off-diagonal tiles, also write the values accumulated in shared memory for the atoms of block y.
+            const unsigned int offset = y*TILE_SIZE + tgx;
+            atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0xFFFFFFFF)));
+        }
+        lasty = y;
+        pos++;
+    } while (pos < end);
+}
--- a/platforms/cuda2/src/kernels/customGBValuePerParticle.cu
+++ b/platforms/cuda2/src/kernels/customGBValuePerParticle.cu
+/**
+ * Reduce a pairwise computed value, and compute per-particle values.
+ *
+ * valueBuffers holds one 64 bit fixed point number (value times 0xFFFFFFFF) per atom; it is
+ * converted to floating point here.  Each thread processes the atoms index,
+ * index+blockDim.x*gridDim.x, ... below NUM_ATOMS.  The additional kernel arguments and the
+ * per-atom computations are not written in this file; they are supplied where the upper case
+ * placeholders appear.
+ */
+extern "C" __global__ void computePerParticleValues(real4* posq, long long* valueBuffers
+        PARAMETER_ARGUMENTS) {
+    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        // Load the pairwise value
+        real sum = valueBuffers[index]/(real) 0xFFFFFFFF; // Convert from fixed point back to a real.
+        // Now calculate other values
+        real4 pos = posq[index]; // sum and pos are not used by any code visible here; presumably the inserted code reads them -- TODO confirm.
+        COMPUTE_VALUES
+    }
+}
--- a/platforms/cuda2/tests/TestCudaCustomGBForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomGBForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the CUDA implementation of CustomGBForce.
+ */
+#include "openmm/internal/AssertionUtilities.h"
+#include "sfmt/SFMT.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/CustomGBForce.h"
+#include "openmm/GBSAOBCForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include <iostream>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+const double TOL = 1e-5;
+/**
+ * Check that a CustomGBForce set up to reproduce the OBC model gives the same energy and forces
+ * as GBSAOBCForce, both for the initial parameters and after the particle parameters of half of
+ * the molecules have been modified with updateParametersInContext().
+ */
+void testOBC(GBSAOBCForce::NonbondedMethod obcMethod, CustomGBForce::NonbondedMethod customMethod) {
+    const int numMolecules = 70;
+    const int numParticles = numMolecules*2;
+    const double boxSize = 10.0;
+    const double cutoff = 2.0;
+    CudaPlatform platform;
+    // Build two systems with the same particles and periodic box: the reference one will hold a
+    // GBSAOBCForce, the other a CustomGBForce describing the same interaction.
+    System standardSystem;
+    System customSystem;
+    for (int particle = 0; particle < numParticles; particle++)
+        standardSystem.addParticle(1.0);
+    for (int particle = 0; particle < numParticles; particle++)
+        customSystem.addParticle(1.0);
+    const Vec3 boxA(boxSize, 0.0, 0.0);
+    const Vec3 boxB(0.0, boxSize, 0.0);
+    const Vec3 boxC(0.0, 0.0, boxSize);
+    standardSystem.setDefaultPeriodicBoxVectors(boxA, boxB, boxC);
+    customSystem.setDefaultPeriodicBoxVectors(boxA, boxB, boxC);
+    GBSAOBCForce* obc = new GBSAOBCForce();
+    CustomGBForce* custom = new CustomGBForce();
+    obc->setCutoffDistance(cutoff);
+    custom->setCutoffDistance(cutoff);
+    // Define the parameters, computed values and energy terms of the custom force.
+    custom->addPerParticleParameter("q");
+    custom->addPerParticleParameter("radius");
+    custom->addPerParticleParameter("scale");
+    custom->addGlobalParameter("solventDielectric", obc->getSolventDielectric());
+    custom->addGlobalParameter("soluteDielectric", obc->getSoluteDielectric());
+    custom->addComputedValue("I", "step(r+sr2-or1)*0.5*(1/L-1/U+0.25*(1/U^2-1/L^2)*(r-sr2*sr2/r)+0.5*log(L/U)/r+C);"
+                                  "U=r+sr2;"
+                                  "C=2*(1/or1-1/L)*step(sr2-r-or1);"
+                                  "L=max(or1, D);"
+                                  "D=abs(r-sr2);"
+                                  "sr2 = scale2*or2;"
+                                  "or1 = radius1-0.009; or2 = radius2-0.009", CustomGBForce::ParticlePairNoExclusions);
+    custom->addComputedValue("B", "1/(1/or-tanh(1*psi-0.8*psi^2+4.85*psi^3)/radius);"
+                                  "psi=I*or; or=radius-0.009", CustomGBForce::SingleParticle);
+    custom->addEnergyTerm("28.3919551*(radius+0.14)^2*(radius/B)^6-0.5*138.935456*(1/soluteDielectric-1/solventDielectric)*q^2/B", CustomGBForce::SingleParticle);
+    custom->addEnergyTerm("-138.935456*(1/soluteDielectric-1/solventDielectric)*q1*q2/f;"
+                          "f=sqrt(r^2+B1*B2*exp(-r^2/(4*B1*B2)))", CustomGBForce::ParticlePairNoExclusions);
+    // Add the particles: every molecule consists of a positive and a negative particle 1.0 apart.
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<double> params(3);
+    for (int molecule = 0; molecule < numMolecules; molecule++) {
+        // The first half of the molecules uses a scale factor of 0.5, the second half 0.8.
+        const double scale = (molecule < numMolecules/2 ? 0.5 : 0.8);
+        params[2] = scale;
+        obc->addParticle(1.0, 0.2, scale);
+        params[0] = 1.0;
+        params[1] = 0.2;
+        custom->addParticle(params);
+        obc->addParticle(-1.0, 0.1, scale);
+        params[0] = -1.0;
+        params[1] = 0.1;
+        custom->addParticle(params);
+        const int first = 2*molecule;
+        const int second = first+1;
+        positions[first] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
+        positions[second] = Vec3(positions[first][0]+1.0, positions[first][1], positions[first][2]);
+        velocities[first] = Vec3(genrand_real2(sfmt), genrand_real2(sfmt), genrand_real2(sfmt));
+        velocities[second] = Vec3(genrand_real2(sfmt), genrand_real2(sfmt), genrand_real2(sfmt));
+    }
+    obc->setNonbondedMethod(obcMethod);
+    custom->setNonbondedMethod(customMethod);
+    standardSystem.addForce(obc);
+    customSystem.addForce(custom);
+    // Evaluate both systems for the same state and compare the results.
+    VerletIntegrator integrator1(0.01);
+    VerletIntegrator integrator2(0.01);
+    Context context1(standardSystem, integrator1, platform);
+    context1.setPositions(positions);
+    context1.setVelocities(velocities);
+    State state1 = context1.getState(State::Forces | State::Energy);
+    Context context2(customSystem, integrator2, platform);
+    context2.setPositions(positions);
+    context2.setVelocities(velocities);
+    State state2 = context2.getState(State::Forces | State::Energy);
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-4);
+    for (int i = 0; i < numParticles; i++) {
+        ASSERT_EQUAL_VEC(state1.getForces()[i], state2.getForces()[i], 1e-4);
+    }
+    // Try changing the particle parameters and make sure it's still correct.
+    vector<double> positiveParams(3);
+    positiveParams[0] = 1.1;
+    positiveParams[1] = 0.3;
+    positiveParams[2] = 0.6;
+    vector<double> negativeParams(3);
+    negativeParams[0] = -1.1;
+    negativeParams[1] = 0.2;
+    negativeParams[2] = 0.4;
+    for (int molecule = 0; molecule < numMolecules/2; molecule++) {
+        obc->setParticleParameters(2*molecule, 1.1, 0.3, 0.6);
+        custom->setParticleParameters(2*molecule, positiveParams);
+        obc->setParticleParameters(2*molecule+1, -1.1, 0.2, 0.4);
+        custom->setParticleParameters(2*molecule+1, negativeParams);
+    }
+    obc->updateParametersInContext(context1);
+    custom->updateParametersInContext(context2);
+    state1 = context1.getState(State::Forces | State::Energy);
+    state2 = context2.getState(State::Forces | State::Energy);
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-4);
+    for (int i = 0; i < numParticles; i++) {
+        ASSERT_EQUAL_VEC(state1.getForces()[i], state2.getForces()[i], 1e-4);
+    }
+}
+/**
+ * Build a system of two-particle molecules acted on by a CustomGBForce describing an implicit
+ * membrane, then check that the forces it reports are consistent with a finite-difference
+ * estimate of the energy gradient.
+ */
+void testMembrane() {
+    const int numMolecules = 70;
+    const int numParticles = numMolecules*2;
+    const double boxSize = 10.0;
+    CudaPlatform platform;
+
+    // Create a system with an implicit membrane.
+
+    System system;
+    for (int particle = 0; particle < numParticles; particle++)
+        system.addParticle(1.0);
+    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0.0, 0.0), Vec3(0.0, boxSize, 0.0), Vec3(0.0, 0.0, boxSize));
+    CustomGBForce* membrane = new CustomGBForce();
+    membrane->setCutoffDistance(2.0);
+    membrane->addPerParticleParameter("q");
+    membrane->addPerParticleParameter("radius");
+    membrane->addPerParticleParameter("scale");
+    membrane->addGlobalParameter("thickness", 3);
+    membrane->addGlobalParameter("solventDielectric", 78.3);
+    membrane->addGlobalParameter("soluteDielectric", 1);
+
+    // Imol is accumulated over particle pairs, Imem depends on a single particle's z coordinate,
+    // and B combines the larger of the two for each particle.
+
+    membrane->addComputedValue("Imol", "step(r+sr2-or1)*0.5*(1/L-1/U+0.25*(1/U^2-1/L^2)*(r-sr2*sr2/r)+0.5*log(L/U)/r+C);"
+                               "U=r+sr2;"
+                               "C=2*(1/or1-1/L)*step(sr2-r-or1);"
+                               "L=max(or1, D);"
+                               "D=abs(r-sr2);"
+                               "sr2 = scale2*or2;"
+                               "or1 = radius1-0.009; or2 = radius2-0.009", CustomGBForce::ParticlePairNoExclusions);
+    membrane->addComputedValue("Imem", "(1/radius+2*log(2)/thickness)/(1+exp(7.2*(abs(z)+radius-0.5*thickness)))", CustomGBForce::SingleParticle);
+    membrane->addComputedValue("B", "1/(1/or-tanh(1*psi-0.8*psi^2+4.85*psi^3)/radius);"
+                               "psi=max(Imol,Imem)*or; or=radius-0.009", CustomGBForce::SingleParticle);
+    membrane->addEnergyTerm("28.3919551*(radius+0.14)^2*(radius/B)^6-0.5*138.935456*(1/soluteDielectric-1/solventDielectric)*q^2/B", CustomGBForce::SingleParticle);
+    membrane->addEnergyTerm("-138.935456*(1/soluteDielectric-1/solventDielectric)*q1*q2/f;"
+                            "f=sqrt(r^2+B1*B2*exp(-r^2/(4*B1*B2)))", CustomGBForce::ParticlePairNoExclusions);
+
+    // Add the particles.  Every molecule is a +1/-1 charge pair separated by 1.0 along x; the
+    // first half of the molecules use a scale of 0.5 and the remainder use 0.8.
+
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<double> params(3);
+    for (int molecule = 0; molecule < numMolecules; molecule++) {
+        const double scale = (molecule < numMolecules/2 ? 0.5 : 0.8);
+        params[0] = 1.0;
+        params[1] = 0.2;
+        params[2] = scale;
+        membrane->addParticle(params);
+        params[0] = -1.0;
+        params[1] = 0.1;
+        membrane->addParticle(params);
+        const int first = 2*molecule;
+        const int second = first+1;
+        positions[first] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
+        positions[second] = Vec3(positions[first][0]+1.0, positions[first][1], positions[first][2]);
+        velocities[first] = Vec3(genrand_real2(sfmt), genrand_real2(sfmt), genrand_real2(sfmt));
+        velocities[second] = Vec3(genrand_real2(sfmt), genrand_real2(sfmt), genrand_real2(sfmt));
+    }
+    system.addForce(membrane);
+
+    // Evaluate the forces and energy at the initial configuration.
+
+    VerletIntegrator integrator(0.01);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+
+    // Take a small step in the direction of the energy gradient and see whether the potential energy changes by the expected amount.
+
+    double norm = 0.0;
+    for (size_t index = 0; index < forces.size(); ++index)
+        norm += forces[index].dot(forces[index]);
+    norm = std::sqrt(norm);
+    const double stepSize = 1e-3;
+    const double step = stepSize/norm;
+    for (size_t index = 0; index < positions.size(); ++index) {
+        const Vec3 pos = positions[index];
+        const Vec3 frc = forces[index];
+        positions[index] = Vec3(pos[0]-frc[0]*step, pos[1]-frc[1]*step, pos[2]-frc[2]*step);
+    }
+    context.setPositions(positions);
+    State state2 = context.getState(State::Energy);
+    ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state.getPotentialEnergy())/stepSize, 1e-2);
+}
+/**
+ * Check a CustomGBForce whose energy term calls a tabulated function.  The table holds
+ * sin(0.25*i) for i = 0..20 and is registered as "fn" with the arguments 1.0 and 6.0; the
+ * expected values below treat fn(r) as sin(r-1) for r in [1, 6] and as 0 outside that range.
+ */
+void testTabulatedFunction() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomGBForce* gb = new CustomGBForce();
+    gb->addComputedValue("a", "0", CustomGBForce::ParticlePair);
+    gb->addEnergyTerm("fn(r)+1", CustomGBForce::ParticlePair);
+    gb->addParticle(vector<double>());
+    gb->addParticle(vector<double>());
+    vector<double> table(21);
+    for (int entry = 0; entry < 21; entry++)
+        table[entry] = std::sin(0.25*entry);
+    gb->addFunction("fn", table, 1.0, 6.0);
+    system.addForce(gb);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+
+    // Sweep the separation across and beyond the tabulated range, comparing forces and energy
+    // with loose tolerances.
+
+    for (int sample = 1; sample < 30; sample++) {
+        const double x = (7.0/30.0)*sample;
+        positions[1] = Vec3(x, 0, 0);
+        context.setPositions(positions);
+        State state = context.getState(State::Forces | State::Energy);
+        const vector<Vec3>& forces = state.getForces();
+        const bool inRange = (x >= 1.0 && x <= 6.0);
+        const double expectedForce = (inRange ? -std::cos(x-1.0) : 0.0);
+        const double expectedEnergy = (inRange ? std::sin(x-1.0) : 0.0)+1.0;
+        ASSERT_EQUAL_VEC(Vec3(-expectedForce, 0, 0), forces[0], 0.1);
+        ASSERT_EQUAL_VEC(Vec3(expectedForce, 0, 0), forces[1], 0.1);
+        ASSERT_EQUAL_TOL(expectedEnergy, state.getPotentialEnergy(), 0.02);
+    }
+
+    // At separations of the form 0.25*i+1.0 the energy is compared with a much tighter tolerance.
+
+    for (int sample = 1; sample < 20; sample++) {
+        const double x = 0.25*sample+1.0;
+        positions[1] = Vec3(x, 0, 0);
+        context.setPositions(positions);
+        State state = context.getState(State::Energy);
+        const bool inRange = (x >= 1.0 && x <= 6.0);
+        const double expectedEnergy = (inRange ? std::sin(x-1.0) : 0.0)+1.0;
+        ASSERT_EQUAL_TOL(expectedEnergy, state.getPotentialEnergy(), 1e-4);
+    }
+}
+/**
+ * Check forces and energy when the energy term depends on a chain of computed values, each
+ * defined in terms of the previous ones.
+ *
+ * With a = 2*r, b = a+1 and c = 2*b+a = 3*a+2, the per-particle energy is
+ *     0.1*a + 1*b + 10*c = 0.2*r + (2*r+1) + (60*r+20) = 62.2*r + 21.
+ * The assertions expect twice that amount for the two particles, and a force of magnitude 124.4.
+ */
+void testMultipleChainRules() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomGBForce* chain = new CustomGBForce();
+    chain->addComputedValue("a", "2*r", CustomGBForce::ParticlePair);
+    chain->addComputedValue("b", "a+1", CustomGBForce::SingleParticle);
+    chain->addComputedValue("c", "2*b+a", CustomGBForce::SingleParticle);
+    chain->addEnergyTerm("0.1*a+1*b+10*c", CustomGBForce::SingleParticle);
+    chain->addParticle(vector<double>());
+    chain->addParticle(vector<double>());
+    system.addForce(chain);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+
+    // Place the second particle at integer distances 1 through 4 along x.
+
+    for (int distance = 1; distance < 5; distance++) {
+        positions[1] = Vec3(distance, 0, 0);
+        context.setPositions(positions);
+        State state = context.getState(State::Forces | State::Energy);
+        const vector<Vec3>& forces = state.getForces();
+        ASSERT_EQUAL_VEC(Vec3(124.4, 0, 0), forces[0], 1e-4);
+        ASSERT_EQUAL_VEC(Vec3(-124.4, 0, 0), forces[1], 1e-4);
+        ASSERT_EQUAL_TOL(2*(62.2*distance+21), state.getPotentialEnergy(), 0.02);
+    }
+}
+/**
+ * Check a CustomGBForce whose computed values and energy terms depend explicitly on the particle
+ * coordinates (x, y, z) as well as on the pair distance r.  For several random configurations the
+ * forces and energy are compared against an analytic expression, and the forces are additionally
+ * checked against a finite-difference estimate of the energy gradient.
+ */
+void testPositionDependence() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomGBForce* force = new CustomGBForce();
+    force->addComputedValue("a", "r", CustomGBForce::ParticlePair);
+    force->addComputedValue("b", "a+x*y", CustomGBForce::SingleParticle);
+    force->addEnergyTerm("b*z", CustomGBForce::SingleParticle);
+    force->addEnergyTerm("b1+b2", CustomGBForce::ParticlePair); // = 2*r+x1*y1+x2*y2
+    force->addParticle(vector<double>());
+    force->addParticle(vector<double>());
+    system.addForce(force);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    for (int trial = 0; trial < 5; trial++) {
+        positions[0] = Vec3(genrand_real2(sfmt), genrand_real2(sfmt), genrand_real2(sfmt));
+        positions[1] = Vec3(genrand_real2(sfmt), genrand_real2(sfmt), genrand_real2(sfmt));
+        context.setPositions(positions);
+        State state = context.getState(State::Forces | State::Energy);
+        const vector<Vec3>& forces = state.getForces();
+
+        // Compute the expected energy and forces analytically.
+
+        Vec3 delta = positions[0]-positions[1];
+        double r = std::sqrt(delta.dot(delta));
+        double energy = 2*r+positions[0][0]*positions[0][1]+positions[1][0]*positions[1][1];
+        for (int j = 0; j < 2; j++)
+            energy += positions[j][2]*(r+positions[j][0]*positions[j][1]);
+        Vec3 force1(-(1+positions[0][2])*delta[0]/r-(1+positions[0][2])*positions[0][1]-(1+positions[1][2])*delta[0]/r,
+                    -(1+positions[0][2])*delta[1]/r-(1+positions[0][2])*positions[0][0]-(1+positions[1][2])*delta[1]/r,
+                    -(1+positions[0][2])*delta[2]/r-(r+positions[0][0]*positions[0][1])-(1+positions[1][2])*delta[2]/r);
+        Vec3 force2((1+positions[0][2])*delta[0]/r+(1+positions[1][2])*delta[0]/r-(1+positions[1][2])*positions[1][1],
+                    (1+positions[0][2])*delta[1]/r+(1+positions[1][2])*delta[1]/r-(1+positions[1][2])*positions[1][0],
+                    (1+positions[0][2])*delta[2]/r+(1+positions[1][2])*delta[2]/r-(r+positions[1][0]*positions[1][1]));
+        ASSERT_EQUAL_VEC(force1, forces[0], 1e-4);
+        ASSERT_EQUAL_VEC(force2, forces[1], 1e-4);
+        ASSERT_EQUAL_TOL(energy, state.getPotentialEnergy(), 0.02);
+
+        // Take a small step in the direction of the energy gradient and see whether the potential energy changes by the expected amount.
+
+        double norm = 0.0;
+        for (int k = 0; k < (int) forces.size(); ++k)
+            norm += forces[k].dot(forces[k]);
+        norm = std::sqrt(norm);
+        const double stepSize = 1e-3;
+        double step = stepSize/norm;
+        for (int k = 0; k < (int) positions.size(); ++k) {
+            Vec3 p = positions[k];
+            Vec3 f = forces[k];
+            positions[k] = Vec3(p[0]-f[0]*step, p[1]-f[1]*step, p[2]-f[2]*step);
+        }
+        context.setPositions(positions);
+        State state2 = context.getState(State::Energy);
+
+        // std::abs is used explicitly so that the floating-point overload is always selected;
+        // an unqualified abs may resolve to the integer version and truncate the tolerance.
+
+        ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state.getPotentialEnergy())/stepSize, 1e-3*std::abs(state.getPotentialEnergy()));
+    }
+}
+/**
+ * Check how an exclusion between the two particles affects computed values and energy terms.
+ * Four combinations are tried: the computed value "a" uses ParticlePair for cases 0 and 1 and
+ * ParticlePairNoExclusions for cases 2 and 3, while the pair energy term uses ParticlePair for
+ * the even cases and ParticlePairNoExclusions for the odd ones.  For each case the forces and
+ * energy at r = 1 are compared with hand-computed values, and, when the force is nonzero, also
+ * with a finite-difference estimate of the energy gradient.
+ */
+void testExclusions() {
+    CudaPlatform platform;
+    for (int testCase = 0; testCase < 4; testCase++) {
+        System system;
+        system.addParticle(1.0);
+        system.addParticle(1.0);
+        VerletIntegrator integrator(0.01);
+        CustomGBForce* force = new CustomGBForce();
+        force->addComputedValue("a", "r", testCase < 2 ? CustomGBForce::ParticlePair : CustomGBForce::ParticlePairNoExclusions);
+        force->addEnergyTerm("a", CustomGBForce::SingleParticle);
+        force->addEnergyTerm("(1+a1+a2)*r", testCase%2 == 0 ? CustomGBForce::ParticlePair : CustomGBForce::ParticlePairNoExclusions);
+        force->addParticle(vector<double>());
+        force->addParticle(vector<double>());
+        force->addExclusion(0, 1);
+        system.addForce(force);
+        Context context(system, integrator, platform);
+        vector<Vec3> positions(2);
+        positions[0] = Vec3(0, 0, 0);
+        positions[1] = Vec3(1, 0, 0);
+        context.setPositions(positions);
+        State state = context.getState(State::Forces | State::Energy);
+        const vector<Vec3>& forces = state.getForces();
+        double expectedForce, expectedEnergy;
+        switch (testCase)
+        {
+            case 0: // e = 0
+                expectedForce = 0;
+                expectedEnergy = 0;
+                break;
+            case 1: // e = r
+                expectedForce = 1;
+                expectedEnergy = 1;
+                break;
+            case 2: // e = 2r
+                expectedForce = 2;
+                expectedEnergy = 2;
+                break;
+            case 3: // e = 3r + 2r^2
+                expectedForce = 7;
+                expectedEnergy = 5;
+                break;
+            default:
+                ASSERT(false);
+        }
+        ASSERT_EQUAL_VEC(Vec3(expectedForce, 0, 0), forces[0], 1e-4);
+        ASSERT_EQUAL_VEC(Vec3(-expectedForce, 0, 0), forces[1], 1e-4);
+        ASSERT_EQUAL_TOL(expectedEnergy, state.getPotentialEnergy(), 1e-4);
+
+        // Take a small step in the direction of the energy gradient and see whether the potential energy changes by the expected amount.
+        // When every force is exactly zero there is no direction to step in: dividing by the
+        // zero norm would give an infinite step and 0*inf would put NaN coordinates into the
+        // context, so the finite-difference check is skipped in that situation.
+
+        double norm = 0.0;
+        for (int k = 0; k < (int) forces.size(); ++k)
+            norm += forces[k].dot(forces[k]);
+        norm = std::sqrt(norm);
+        if (norm > 0.0) {
+            const double stepSize = 1e-3;
+            double step = stepSize/norm;
+            for (int k = 0; k < (int) positions.size(); ++k) {
+                Vec3 p = positions[k];
+                Vec3 frc = forces[k];
+                positions[k] = Vec3(p[0]-frc[0]*step, p[1]-frc[1]*step, p[2]-frc[2]*step);
+            }
+            context.setPositions(positions);
+            State state2 = context.getState(State::Energy);
+
+            // std::abs guarantees that the floating-point overload is selected for the tolerance.
+
+            ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state.getPotentialEnergy())/stepSize, 1e-3*std::abs(state.getPotentialEnergy()));
+        }
+    }
+}
+/**
+ * Run every CustomGBForce test in sequence.  If all of them complete without throwing, print
+ * "Done" and return 0; if any of them throws, print the exception's message and return 1.
+ */
+int main() {
+    try {
+        // Compare CustomGBForce against GBSAOBCForce for each matching pair of nonbonded methods.
+
+        testOBC(GBSAOBCForce::NoCutoff, CustomGBForce::NoCutoff);
+        testOBC(GBSAOBCForce::CutoffNonPeriodic, CustomGBForce::CutoffNonPeriodic);
+        testOBC(GBSAOBCForce::CutoffPeriodic, CustomGBForce::CutoffPeriodic);
+
+        // Tests that exercise individual CustomGBForce features.
+
+        testMembrane();
+        testTabulatedFunction();
+        testMultipleChainRules();
+        testPositionDependence();
+        testExclusions();
+    }
+    catch(const exception& failure) {
+        // Report the failure and signal it through the exit status.
+
+        cout << "exception: " << failure.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}