Created CUDA implementation of CustomBondForce

bcbd7dc1 · Peter Eastman · 0de2ac88 · bcbd7dc1 · bcbd7dc1 · bcbd7dc1
Commit bcbd7dc1 authored Nov 12, 2009 by Peter Eastman
13 changed files
--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -39,6 +39,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaUpdateStateDataKernel(name, platform, data);
    if (name == CalcHarmonicBondForceKernel::Name())
        return new CudaCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
+    if (name == CalcCustomBondForceKernel::Name())
+        return new CudaCalcCustomBondForceKernel(name, platform, data, context.getSystem());
    if (name == CalcHarmonicAngleForceKernel::Name())
        return new CudaCalcHarmonicAngleForceKernel(name, platform, data, context.getSystem());
    if (name == CalcPeriodicTorsionForceKernel::Name())

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -202,6 +202,53 @@ double CudaCalcHarmonicBondForceKernel::executeEnergy(ContextImpl& context) {
    return 0.0;
 }
+CudaCalcCustomBondForceKernel::~CudaCalcCustomBondForceKernel() {
+}
+void CudaCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) {
+    numBonds = force.getNumBonds();
+    vector<int> particle1(numBonds);
+    vector<int> particle2(numBonds);
+    vector<vector<double> > params(numBonds);
+    for (int i = 0; i < numBonds; i++)
+        force.getBondParameters(i, particle1[i], particle2[i], params[i]);
+    vector<string> paramNames;
+    for (int i = 0; i < force.getNumPerBondParameters(); i++)
+        paramNames.push_back(force.getPerBondParameterName(i));
+    globalParamNames.resize(force.getNumGlobalParameters());
+    globalParamValues.resize(force.getNumGlobalParameters());
+    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
+        globalParamNames[i] = force.getGlobalParameterName(i);
+        globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
+    }
+    gpuSetCustomBondParameters(data.gpu, particle1, particle2, params, force.getEnergyFunction(), paramNames, globalParamNames);
+    if (globalParamValues.size() > 0)
+        SetCustomBondGlobalParams(&globalParamValues[0]);
+}
+void CudaCalcCustomBondForceKernel::executeForces(ContextImpl& context) {
+    updateGlobalParams(context);
+    kCalculateCustomBondForces(data.gpu);
+}
+double CudaCalcCustomBondForceKernel::executeEnergy(ContextImpl& context) {
+    updateGlobalParams(context);
+    kCalculateCustomBondForces(data.gpu);
+    return 0.0;
+}
+void CudaCalcCustomBondForceKernel::updateGlobalParams(ContextImpl& context) {
+    bool changed = false;
+    for (int i = 0; i < (int) globalParamNames.size(); i++) {
+        float value = (float) context.getParameter(globalParamNames[i]);
+        if (value != globalParamValues[i])
+            changed = true;
+        globalParamValues[i] = value;
+    }
+    if (changed)
+        SetCustomBondGlobalParams(&globalParamValues[0]);
+}
 CudaCalcHarmonicAngleForceKernel::~CudaCalcHarmonicAngleForceKernel() {
 }
@@ -672,6 +719,8 @@ void CudaIntegrateLangevinStepKernel::initialize(const System& system, const Lan
    _gpuContext* gpu = data.gpu;
    gpu->seed = (unsigned long) integrator.getRandomNumberSeed();
    gpuInitializeRandoms(gpu);
+    prevTemp = -1.0;
+    prevFriction = -1.0;
    prevStepSize = -1.0;
 }
@@ -714,6 +763,8 @@ void CudaIntegrateBrownianStepKernel::initialize(const System& system, const Bro
    _gpuContext* gpu = data.gpu;
    gpu->seed = (unsigned long) integrator.getRandomNumberSeed();
    gpuInitializeRandoms(gpu);
+    prevTemp = -1.0;
+    prevFriction = -1.0;
    prevStepSize = -1.0;
 }
@@ -788,6 +839,8 @@ void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, c
    _gpuContext* gpu = data.gpu;
    gpu->seed = (unsigned long) integrator.getRandomNumberSeed();
    gpuInitializeRandoms(gpu);
+    prevTemp = -1.0;
+    prevFriction = -1.0;
    prevErrorTol = -1.0;
 }
@@ -834,6 +887,8 @@ void CudaApplyAndersenThermostatKernel::initialize(const System& system, const A
    _gpuContext* gpu = data.gpu;
    gpu->seed = (unsigned long) thermostat.getRandomNumberSeed();
    gpuInitializeRandoms(gpu);
+    prevTemp = -1.0;
+    prevFrequency = -1.0;
    prevStepSize = -1.0;
 }

--- a/platforms/cuda/src/CudaKernels.h
+++ b/platforms/cuda/src/CudaKernels.h
@@ -184,6 +184,44 @@ private:
    System& system;
 };
+/**
+ * This kernel is invoked by CustomBondForce to calculate the forces acting on the system and the energy of the system.
+ */
+class CudaCalcCustomBondForceKernel : public CalcCustomBondForceKernel {
+public:
+    CudaCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcCustomBondForceKernel(name, platform),
+            data(data), system(system) {
+    }
+    ~CudaCalcCustomBondForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomBondForce this kernel will be used for
+     */
+    void initialize(const System& system, const CustomBondForce& force);
+    /**
+     * Execute the kernel to calculate the forces.
+     *
+     * @param context    the context in which to execute this kernel
+     */
+    void executeForces(ContextImpl& context);
+    /**
+     * Execute the kernel to calculate the energy.
+     *
+     * @param context    the context in which to execute this kernel
+     * @return the potential energy due to the CustomBondForce
+     */
+    double executeEnergy(ContextImpl& context);
+private:
+    void updateGlobalParams(ContextImpl& context);
+    int numBonds;
+    CudaPlatform::PlatformData& data;
+    std::vector<std::string> globalParamNames;
+    std::vector<float> globalParamValues;
+    System& system;
+};
 /**
 * This kernel is invoked by HarmonicAngleForce to calculate the forces acting on the system and the energy of the system.
 */
@@ -428,7 +466,6 @@ private:
 class CudaIntegrateVerletStepKernel : public IntegrateVerletStepKernel {
 public:
    CudaIntegrateVerletStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateVerletStepKernel(name, platform), data(data) {
-       prevStepSize = -1.0;
    }
    ~CudaIntegrateVerletStepKernel();
    /**
@@ -456,9 +493,6 @@ private:
 class CudaIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
 public:
    CudaIntegrateLangevinStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateLangevinStepKernel(name, platform), data(data) {
-       prevTemp     = -1.0;
-       prevFriction = -1.0;
-       prevStepSize = -1.0;
    }
    ~CudaIntegrateLangevinStepKernel();
    /**
@@ -486,9 +520,6 @@ private:
 class CudaIntegrateBrownianStepKernel : public IntegrateBrownianStepKernel {
 public:
    CudaIntegrateBrownianStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateBrownianStepKernel(name, platform), data(data) {
-       prevTemp     = -1.0;
-       prevFriction = -1.0;
-       prevStepSize = -1.0;
    }
    ~CudaIntegrateBrownianStepKernel();
    /**
@@ -516,7 +547,6 @@ private:
 class CudaIntegrateVariableVerletStepKernel : public IntegrateVariableVerletStepKernel {
 public:
    CudaIntegrateVariableVerletStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateVariableVerletStepKernel(name, platform), data(data) {
-       prevErrorTol = -1.0;
    }
    ~CudaIntegrateVariableVerletStepKernel();
    /**
@@ -545,9 +575,6 @@ private:
 class CudaIntegrateVariableLangevinStepKernel : public IntegrateVariableLangevinStepKernel {
 public:
    CudaIntegrateVariableLangevinStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateVariableLangevinStepKernel(name, platform), data(data) {
-       prevTemp     = -1.0;
-       prevFriction = -1.0;
-       prevErrorTol = -1.0;
    }
    ~CudaIntegrateVariableLangevinStepKernel();
    /**
@@ -576,9 +603,6 @@ private:
 class CudaApplyAndersenThermostatKernel : public ApplyAndersenThermostatKernel {
 public:
    CudaApplyAndersenThermostatKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : ApplyAndersenThermostatKernel(name, platform), data(data) {
-       prevTemp      = -1.0;
-       prevFrequency = -1.0;
-       prevStepSize  = -1.0;
    }
    ~CudaApplyAndersenThermostatKernel();
    /**

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -49,6 +49,7 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
    registerKernelFactory(UpdateStateDataKernel::Name(), factory);
    registerKernelFactory(CalcHarmonicBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomBondForceKernel::Name(), factory);
    registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
    registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
    registerKernelFactory(CalcRBTorsionForceKernel::Name(), factory);

--- a/platforms/cuda/src/kernels/cudaKernels.h
+++ b/platforms/cuda/src/kernels/cudaKernels.h
@@ -42,6 +42,7 @@ extern void kGenerateRandoms(gpuContext gpu);
 extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
 extern void kCalculateCDLJGBVIForces1(gpuContext gpu);
 extern void kCalculateCDLJForces(gpuContext gpu);
+extern void kCalculateCustomBondForces(gpuContext gpu);
 extern void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid);
 extern void kReduceObcGbsaBornForces(gpuContext gpu);
 extern void kCalculateObcGbsaForces2(gpuContext gpu);
@@ -73,6 +74,8 @@ extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
 extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
 extern void SetCalculateCDLJForcesSim(gpuContext gpu);
 extern void GetCalculateCDLJForcesSim(gpuContext gpu);
+extern void SetCalculateCustomBondForcesSim(gpuContext gpu);
+extern void GetCalculateCustomBondForcesSim(gpuContext gpu);
 extern void SetCalculateCustomNonbondedForcesSim(gpuContext gpu);
 extern void GetCalculateCustomNonbondedForcesSim(gpuContext gpu);
 extern void SetCalculateLocalForcesSim(gpuContext gpu);
@@ -105,6 +108,9 @@ extern void SetBrownianUpdateSim(gpuContext gpu);
 extern void GetBrownianUpdateSim(gpuContext gpu);
 extern void SetRandomSim(gpuContext gpu);
 extern void GetRandomSim(gpuContext gpu);
+extern void SetCustomBondForceExpression(const Expression<128>& expression);
+extern void SetCustomBondEnergyExpression(const Expression<128>& expression);
+extern void SetCustomBondGlobalParams(float* paramValues);
 extern void SetCustomNonbondedForceExpression(const Expression<128>& expression);
 extern void SetCustomNonbondedEnergyExpression(const Expression<128>& expression);
 extern void SetCustomNonbondedCombiningRules(const Expression<64>* expressions);

--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -356,6 +356,10 @@ struct cudaGmxSimulation {
    float4*         pCustomExceptionParams;         // Parameters for custom nonbonded exceptions
    unsigned int    customExceptions;               // Number of custom nonbonded exceptions
    unsigned int    customParameters;               // Number of parameters for custom nonbonded interactions
+    int4*           pCustomBondID;                  // Atom indices for custom bonds
+    float4*         pCustomBondParams;              // Parameters for custom bonds
+    unsigned int    customBonds;                    // Number of custom bonds
+    unsigned int    customBondParameters;           // Number of parameters for custom bonds
    float4*         pTabulatedFunctionCoefficients[MAX_TABULATED_FUNCTIONS]; // The spline coefficients for each tabulated function
    float4*         pTabulatedFunctionParams;       // The min, max, and spacing for each tabulated function
    float2*         pEwaldCosSinSum;                // Pointer to the cos/sin sums (ewald)

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -644,6 +644,53 @@ void gpuSetTabulatedFunction(gpuContext gpu, int index, const string& name, cons
    coeff->Upload();
 }
+extern "C"
+void gpuSetCustomBondParameters(gpuContext gpu, const vector<int>& bondAtom1, const vector<int>& bondAtom2, const vector<vector<double> >& bondParams,
+            const string& energyExp, const vector<string>& paramNames, const vector<string>& globalParamNames)
+{
+    if (paramNames.size() > 4)
+        throw OpenMMException("CudaPlatform only supports four per-bond parameters for custom bond forces");
+    if (globalParamNames.size() > 8)
+        throw OpenMMException("CudaPlatform only supports eight global parameters for custom bond forces");
+    if (gpu->psCustomBondID != NULL)
+        throw OpenMMException("CudaPlatform only supports a single CustomBondForce per System");
+    gpu->sim.customBonds = bondAtom1.size();
+    gpu->sim.customBondParameters = paramNames.size();
+    gpu->psCustomBondID = new CUDAStream<int4>(gpu->sim.customBonds, 1, "CustomBondId");
+    gpu->sim.pCustomBondID = gpu->psCustomBondID->_pDevData;
+    gpu->psCustomBondParams = new CUDAStream<float4>(gpu->sim.customBonds, 1, "CustomBondParams");
+    gpu->sim.pCustomBondParams = gpu->psCustomBondParams->_pDevData;
+    vector<int> forceBufferCounter(gpu->natoms, 0);
+    for (int i = 0; i < (int) bondAtom1.size(); i++) {
+        (*gpu->psCustomBondID)[i].x = bondAtom1[i];
+        (*gpu->psCustomBondID)[i].y = bondAtom2[i];
+        (*gpu->psCustomBondID)[i].z = forceBufferCounter[bondAtom1[i]]++;
+        (*gpu->psCustomBondID)[i].w = forceBufferCounter[bondAtom2[i]]++;
+        if (bondParams[i].size() > 0)
+            (*gpu->psCustomBondParams)[i].x = bondParams[i][0];
+        if (bondParams[i].size() > 1)
+            (*gpu->psCustomBondParams)[i].y = bondParams[i][1];
+        if (bondParams[i].size() > 2)
+            (*gpu->psCustomBondParams)[i].z = bondParams[i][2];
+        if (bondParams[i].size() > 3)
+            (*gpu->psCustomBondParams)[i].w = bondParams[i][3];
+    }
+    gpu->psCustomBondID->Upload();
+    gpu->psCustomBondParams->Upload();
+    for (int i = 0; i < (int) forceBufferCounter.size(); i++)
+        if (forceBufferCounter[i] > gpu->pOutputBufferCounter[i])
+            gpu->pOutputBufferCounter[i] = forceBufferCounter[i];
+    // Create the Expressions.
+    vector<string> variables;
+    variables.push_back("r");
+    for (int i = 0; i < (int) paramNames.size(); i++)
+        variables.push_back(paramNames[i]);
+    SetCustomBondEnergyExpression(createExpression<128>(gpu, energyExp, Lepton::Parser::parse(energyExp).optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
+    SetCustomBondForceExpression(createExpression<128>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("r").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
+}
 extern "C"
 void gpuSetCustomNonbondedParameters(gpuContext gpu, const vector<vector<double> >& parameters, const vector<vector<int> >& exclusions,
            const vector<int>& exceptionAtom1, const vector<int>& exceptionAtom2, const vector<vector<double> >& exceptionParams,
@@ -740,7 +787,6 @@ void gpuSetCustomNonbondedParameters(gpuContext gpu, const vector<vector<double>
    variables.push_back("r");
    for (int i = 0; i < (int) paramNames.size(); i++)
        variables.push_back(paramNames[i]);
-    gpu->sim.customExpressionStackSize = 0;
    SetCustomNonbondedEnergyExpression(createExpression<128>(gpu, energyExp, Lepton::Parser::parse(energyExp, functions).optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
    SetCustomNonbondedForceExpression(createExpression<128>(gpu, energyExp, Lepton::Parser::parse(energyExp, functions).differentiate("r").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
    Expression<64> paramExpressions[4];
@@ -1765,6 +1811,8 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
    gpu->psCustomParams             = NULL;
    gpu->psCustomExceptionID        = NULL;
    gpu->psCustomExceptionParams    = NULL;
+    gpu->psCustomBondID             = NULL;
+    gpu->psCustomBondParams         = NULL;
    gpu->psEwaldCosSinSum           = NULL;
    gpu->psTabulatedErfc            = NULL;
    gpu->psPmeGrid                  = NULL;
@@ -1801,6 +1849,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
    gpu->psTabulatedFunctionParams  = NULL;
    for (int i = 0; i < MAX_TABULATED_FUNCTIONS; i++)
        gpu->tabulatedFunctions[i].coefficients = NULL;
+    gpu->sim.customExpressionStackSize = 0;
    // Initialize output buffer before reading parameters
    gpu->pOutputBufferCounter       = new unsigned int[gpu->sim.paddedNumberOfAtoms];
@@ -1931,6 +1980,10 @@ void gpuShutDown(gpuContext gpu)
        delete gpu->psCustomExceptionID;
        delete gpu->psCustomExceptionParams;
    }
+    if (gpu->psCustomBondParams != NULL) {
+        delete gpu->psCustomBondID;
+        delete gpu->psCustomBondParams;
+    }
    if (gpu->psEwaldCosSinSum != NULL)
        delete gpu->psEwaldCosSinSum;
    if (gpu->psPmeGrid != NULL) {
@@ -2052,7 +2105,7 @@ int gpuBuildOutputBuffers(gpuContext gpu)
    gpu->sim.dihedral_offset    = gpu->sim.bond_angle_offset     + gpu->psDihedralParameter->_stride;
    gpu->sim.rb_dihedral_offset = gpu->sim.dihedral_offset       + gpu->psRbDihedralParameter1->_stride;
    gpu->sim.LJ14_offset        = gpu->sim.rb_dihedral_offset    + gpu->psLJ14Parameter->_stride;
-    gpu->sim.localForces_threads_per_block  = (gpu->sim.LJ14_offset / gpu->sim.blocks + 15) & 0xfffffff0;
+    gpu->sim.localForces_threads_per_block  = (std::max(gpu->sim.LJ14_offset, gpu->sim.customBonds) / gpu->sim.blocks + 15) & 0xfffffff0;
    if (gpu->sim.localForces_threads_per_block > gpu->sim.max_localForces_threads_per_block)
        gpu->sim.localForces_threads_per_block = gpu->sim.max_localForces_threads_per_block;
    if (gpu->sim.localForces_threads_per_block < 1)
@@ -2296,6 +2349,7 @@ int gpuSetConstants(gpuContext gpu)
    SetCalculateCDLJForcesSim(gpu);
    SetCalculateCDLJObcGbsaForces1Sim(gpu);
    SetCalculateCustomNonbondedForcesSim(gpu);
+    SetCalculateCustomBondForcesSim(gpu);
    SetCalculateLocalForcesSim(gpu);
    SetCalculateObcGbsaBornSumSim(gpu);
    SetCalculateGBVIBornSumSim(gpu);

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -104,6 +104,8 @@ struct _gpuContext {
    CUDAStream<float4>* psCustomParams;     // Atom parameters for custom nonbonded force
    CUDAStream<int4>* psCustomExceptionID;  // Atom indices for custom nonbonded exceptions
    CUDAStream<float4>* psCustomExceptionParams; // Parameters for custom nonbonded exceptions
+    CUDAStream<int4>* psCustomBondID;             // Atom indices for custom bonds
+    CUDAStream<float4>* psCustomBondParams;       // Parameters for custom bonds
    CUDAStream<float4>* psTabulatedFunctionParams; // The min, max, and spacing for each tabulated function
    CUDAStream<float2>* psEwaldCosSinSum;
    CUDAStream<float>* psTabulatedErfc;     // Tabulated values for erfc()
@@ -206,6 +208,10 @@ void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDi
 extern "C"
 void gpuSetTabulatedFunction(gpuContext gpu, int index, const std::string& name, const std::vector<double>& values, double min, double max, bool interpolating);
+extern "C"
+void gpuSetCustomBondParameters(gpuContext gpu, const std::vector<int>& bondAtom1, const std::vector<int>& bondAtom2, const std::vector<std::vector<double> >& bondParams,
+            const std::string& energyExp, const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
 extern "C"
 void gpuSetCustomNonbondedParameters(gpuContext gpu, const std::vector<std::vector<double> >& parameters, const std::vector<std::vector<int> >& exclusions,
            const std::vector<int>& exceptionAtom1, const std::vector<int>& exceptionAtom2, const std::vector<std::vector<double> >& exceptionParams,

--- a/platforms/cuda/src/kernels/kCalculateCustomBondForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCustomBondForces.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+#include "gputypes.h"
+#include "cudatypes.h"
+static __constant__ cudaGmxSimulation cSim;
+static __constant__ Expression<128> forceExp;
+static __constant__ Expression<128> energyExp;
+#include "kEvaluateExpression.h"
+void SetCalculateCustomBondForcesSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+void GetCalculateCustomBondForcesSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+void SetCustomBondForceExpression(const Expression<128>& expression)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(forceExp, &expression, sizeof(forceExp));
+    RTERROR(status, "SetCustomBondForceExpression: cudaMemcpyToSymbol failed");
+}
+void SetCustomBondEnergyExpression(const Expression<128>& expression)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(energyExp, &expression, sizeof(energyExp));
+    RTERROR(status, "SetCustomBondEnergyExpression: cudaMemcpyToSymbol failed");
+}
+void SetCustomBondGlobalParams(float* paramValues)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(globalParams, paramValues, sizeof(globalParams));
+    RTERROR(status, "SetCustomBondGlobalParams: cudaMemcpyToSymbol failed");
+}
+__global__ void kCalculateCustomBondForces_kernel()
+{
+    extern __shared__ float stack[];
+    float* variables = (float*) &stack[cSim.customExpressionStackSize*blockDim.x];
+    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
+    float totalEnergy = 0.0f;
+    while (pos < cSim.customBonds)
+    {
+        int4 atom       = cSim.pCustomBondID[pos];
+        float4 params   = cSim.pCustomBondParams[pos];
+        float4 a1       = cSim.pPosq[atom.x];
+        float4 a2       = cSim.pPosq[atom.y];
+        float dx        = a1.x - a2.x;
+        float dy        = a1.y - a2.y;
+        float dz        = a1.z - a2.z;
+        float r         = sqrt(dx*dx + dy*dy + dz*dz);
+        float invR      = 1.0f/r;
+        VARIABLE(0)     = r;
+        VARIABLE(1)     = params.x;
+        VARIABLE(2)     = params.y;
+        VARIABLE(3)     = params.z;
+        VARIABLE(4)     = params.w;
+        float dEdR      = -kEvaluateExpression_kernel(&forceExp, stack, variables)*invR;
+        float energy    = kEvaluateExpression_kernel(&energyExp, stack, variables);
+        totalEnergy          += energy;
+        dx                   *= dEdR;
+        dy                   *= dEdR;
+        dz                   *= dEdR;
+        unsigned int offsetA  = atom.x + atom.z * cSim.stride;
+        unsigned int offsetB  = atom.y + atom.w * cSim.stride;
+        float4 forceA         = cSim.pForce4[offsetA];
+        float4 forceB         = cSim.pForce4[offsetB];
+        forceA.x             += dx;
+        forceA.y             += dy;
+        forceA.z             += dz;
+        forceB.x             -= dx;
+        forceB.y             -= dy;
+        forceB.z             -= dz;
+        cSim.pForce4[offsetA] = forceA;
+        cSim.pForce4[offsetB] = forceB;
+        pos                  += blockDim.x * gridDim.x;
+    }
+    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
+}
+void kCalculateCustomBondForces(gpuContext gpu)
+{
+//    printf("kCalculateCustomBondForces\n");
+    kCalculateCustomBondForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block,
+            gpu->sim.customExpressionStackSize*sizeof(float)*gpu->sim.localForces_threads_per_block>>>();
+    LAUNCHERROR("kCalculateCustomBondForces");
+}
--- a/platforms/cuda/src/kernels/kCalculateCustomNonbondedForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCustomNonbondedForces.cu
@@ -53,12 +53,8 @@ static __constant__ cudaGmxSimulation cSim;
 static __constant__ Expression<128> forceExp;
 static __constant__ Expression<128> energyExp;
 static __constant__ Expression<64> combiningRules[4];
-static __constant__ float globalParams[8];
-texture<float4, 1, cudaReadModeElementType> texRef0;
+#include "kEvaluateExpression.h"
-texture<float4, 1, cudaReadModeElementType> texRef1;
-texture<float4, 1, cudaReadModeElementType> texRef2;
-texture<float4, 1, cudaReadModeElementType> texRef3;
 void SetCalculateCustomNonbondedForcesSim(gpuContext gpu)
 {
@@ -102,156 +98,6 @@ void SetCustomNonbondedGlobalParams(float* paramValues)
    RTERROR(status, "SetCustomNonbondedGlobalParams: cudaMemcpyToSymbol failed");
 }
-#define STACK(y) stack[(y)*blockDim.x+threadIdx.x]
-#define VARIABLE(y) variables[(y)*blockDim.x+threadIdx.x]
-template<int SIZE>
-__device__ float kEvaluateExpression_kernel(Expression<SIZE>* expression, float* stack, float* variables)
-{
-    int stackPointer = -1;
-    for (int i = 0; i < expression->length; i++)
-    {
-        int op = expression->op[i];
-        if (op < MULTIPLY) {
-            STACK(++stackPointer) = VARIABLE(op-VARIABLE0);
-        }
-        else if (op < NEGATE) {
-            if (op < MULTIPLY_CONSTANT) {
-                if (op == MULTIPLY) {
-                    float temp = STACK(stackPointer);
-                    STACK(--stackPointer) *= temp;
-                }
-                else if (op == DIVIDE) {
-                    float temp = STACK(stackPointer);
-                    STACK(stackPointer) = temp/STACK(--stackPointer);
-                }
-                else if (op == ADD) {
-                    float temp = STACK(stackPointer);
-                    STACK(--stackPointer) += temp;
-                }
-                else if (op == SUBTRACT) {
-                    float temp = STACK(stackPointer);
-                    STACK(stackPointer) = temp-STACK(--stackPointer);
-                }
-                else /*if (op == POWER)*/ {
-                    float temp = STACK(stackPointer);
-                    STACK(stackPointer) = pow(temp, STACK(--stackPointer));
-                }
-            }
-            else if (op < GLOBAL) {
-                if (op == MULTIPLY_CONSTANT) {
-                    STACK(stackPointer) *= expression->arg[i];
-                }
-                else if (op == POWER_CONSTANT) {
-                    STACK(stackPointer) = pow(STACK(stackPointer), expression->arg[i]);
-                }
-                else /*if (op == ADD_CONSTANT)*/ {
-                    STACK(stackPointer) += expression->arg[i];
-                }
-            }
-            else {
-                if (op == GLOBAL) {
-                    STACK(++stackPointer) = globalParams[(int) expression->arg[i]];
-                }
-                else if (op == CONSTANT) {
-                    STACK(++stackPointer) = expression->arg[i];
-                }
-                else /*if (op == CUSTOM || op == CUSTOM_DERIV)*/ {
-                    int function = (int) expression->arg[i];
-                    float x = STACK(stackPointer);
-                    float4 params = cSim.pTabulatedFunctionParams[function];
-                    if (x < params.x || x > params.y)
-                        STACK(stackPointer) = 0.0f;
-                    else
-                    {
-                        int index = floor((x-params.x)*params.z);
-                        float4 coeff;
-                        if (function == 0)
-                            coeff = tex1Dfetch(texRef0, index);
-                        else if (function == 1)
-                            coeff = tex1Dfetch(texRef1, index);
-                        else if (function == 2)
-                            coeff = tex1Dfetch(texRef2, index);
-                        else
-                            coeff = tex1Dfetch(texRef3, index);
-                        x = (x-params.x)*params.z-index;
-                        if (op == CUSTOM)
-                            STACK(stackPointer) = coeff.x+x*(coeff.y+x*(coeff.z+x*coeff.w));
-                        else
-                            STACK(stackPointer) = (coeff.y+x*(2.0f*coeff.z+x*3.0f*coeff.w))*params.z;
-                    }
-                }
-            }
-        }
-        else {
-            if (op < SIN) {
-                if (op == NEGATE) {
-                    STACK(stackPointer) *= -1.0f;
-                }
-                else if (op == RECIPROCAL) {
-                    STACK(stackPointer) = 1.0f/STACK(stackPointer);
-                }
-                else if (op == SQRT) {
-                    STACK(stackPointer) = sqrt(STACK(stackPointer));
-                }
-                else if (op == EXP) {
-                    STACK(stackPointer) = exp(STACK(stackPointer));
-                }
-                else if (op == LOG) {
-                    STACK(stackPointer) = log(STACK(stackPointer));
-                }
-                else if (op == SQUARE) {
-                    float temp = STACK(stackPointer);
-                    STACK(stackPointer) *= temp;
-                }
-                else /*if (op == CUBE)*/ {
-                    float temp = STACK(stackPointer);
-                    STACK(stackPointer) *= temp*temp;
-                }
-            }
-            else {
-                if (op == SIN) {
-                    STACK(stackPointer) = sin(STACK(stackPointer));
-                }
-                else if (op == COS) {
-                    STACK(stackPointer) = cos(STACK(stackPointer));
-                }
-                else if (op == SEC) {
-                    STACK(stackPointer) = 1.0f/cos(STACK(stackPointer));
-                }
-                else if (op == CSC) {
-                    STACK(stackPointer) = 1.0f/sin(STACK(stackPointer));
-                }
-                else if (op == TAN) {
-                    STACK(stackPointer) = tan(STACK(stackPointer));
-                }
-                else if (op == COT) {
-                    STACK(stackPointer) = 1.0f/tan(STACK(stackPointer));
-                }
-                else if (op == ASIN) {
-                    STACK(stackPointer) = asin(STACK(stackPointer));
-                }
-                else if (op == ACOS) {
-                    STACK(stackPointer) = acos(STACK(stackPointer));
-                }
-                else if (op == ATAN) {
-                    STACK(stackPointer) = atan(STACK(stackPointer));
-                }
-                else if (op == SINH) {
-                    STACK(stackPointer) = sinh(STACK(stackPointer));
-                }
-                else if (op == COSH) {
-                    STACK(stackPointer) = cosh(STACK(stackPointer));
-                }
-                else /*if (op == TANH)*/ {
-                    STACK(stackPointer) = tanh(STACK(stackPointer));
-                }
-            }
-        }
-    }
-    return STACK(stackPointer);
-}
 // Include versions of the kernels for N^2 calculations.
 #define METHOD_NAME(a, b) a##N2##b

--- a/platforms/cuda/src/kernels/kCalculateCustomNonbondedForces.h
+++ b/platforms/cuda/src/kernels/kCalculateCustomNonbondedForces.h
@@ -85,12 +85,12 @@ __global__ void METHOD_NAME(kCalculateCustomNonbonded, Forces_kernel)(unsigned i
                VARIABLE(1) = params.y;
                VARIABLE(2) = params.z;
                VARIABLE(3) = params.w;
+                VARIABLE(4) = psA[j].params.x;
+                VARIABLE(5) = psA[j].params.y;
+                VARIABLE(6) = psA[j].params.z;
+                VARIABLE(7) = psA[j].params.w;
                for (int k = 0; k < cSim.customParameters; k++)
                {
-                    VARIABLE(4) = psA[j].params.x;
-                    VARIABLE(5) = psA[j].params.y;
-                    VARIABLE(6) = psA[j].params.z;
-                    VARIABLE(7) = psA[j].params.w;
                    float value = kEvaluateExpression_kernel(&combiningRules[k], stack, variables);
                    switch (k)
                    {
@@ -202,12 +202,12 @@ __global__ void METHOD_NAME(kCalculateCustomNonbonded, Forces_kernel)(unsigned i
                        VARIABLE(1) = params.y;
                        VARIABLE(2) = params.z;
                        VARIABLE(3) = params.w;
+                        VARIABLE(4) = psA[tj].params.x;
+                        VARIABLE(5) = psA[tj].params.y;
+                        VARIABLE(6) = psA[tj].params.z;
+                        VARIABLE(7) = psA[tj].params.w;
                        for (int k = 0; k < cSim.customParameters; k++)
                        {
-                            VARIABLE(4) = psA[j].params.x;
-                            VARIABLE(5) = psA[j].params.y;
-                            VARIABLE(6) = psA[j].params.z;
-                            VARIABLE(7) = psA[j].params.w;
                            float value = kEvaluateExpression_kernel(&combiningRules[k], stack, variables);
                            switch (k)
                            {
@@ -281,12 +281,12 @@ __global__ void METHOD_NAME(kCalculateCustomNonbonded, Forces_kernel)(unsigned i
                            VARIABLE(1) = params.y;
                            VARIABLE(2) = params.z;
                            VARIABLE(3) = params.w;
+                            VARIABLE(4) = psA[j].params.x;
+                            VARIABLE(5) = psA[j].params.y;
+                            VARIABLE(6) = psA[j].params.z;
+                            VARIABLE(7) = psA[j].params.w;
                            for (int k = 0; k < cSim.customParameters; k++)
                            {
-                                VARIABLE(4) = psA[j].params.x;
-                                VARIABLE(5) = psA[j].params.y;
-                                VARIABLE(6) = psA[j].params.z;
-                                VARIABLE(7) = psA[j].params.w;
                                float value = kEvaluateExpression_kernel(&combiningRules[k], stack, variables);
                                switch (k)
                                {
@@ -396,12 +396,12 @@ __global__ void METHOD_NAME(kCalculateCustomNonbonded, Forces_kernel)(unsigned i
                    VARIABLE(1) = params.y;
                    VARIABLE(2) = params.z;
                    VARIABLE(3) = params.w;
+                    VARIABLE(4) = psA[tj].params.x;
+                    VARIABLE(5) = psA[tj].params.y;
+                    VARIABLE(6) = psA[tj].params.z;
+                    VARIABLE(7) = psA[tj].params.w;
                    for (int k = 0; k < cSim.customParameters; k++)
                    {
-                        VARIABLE(4) = psA[j].params.x;
-                        VARIABLE(5) = psA[j].params.y;
-                        VARIABLE(6) = psA[j].params.z;
-                        VARIABLE(7) = psA[j].params.w;
                        float value = kEvaluateExpression_kernel(&combiningRules[k], stack, variables);
                        switch (k)
                        {

--- a/platforms/cuda/src/kernels/kEvaluateExpression.h
+++ b/platforms/cuda/src/kernels/kEvaluateExpression.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+/**
+ * This file contains the routine for evaluating a custom expression.
+ */
+static __constant__ float globalParams[8];
+texture<float4, 1, cudaReadModeElementType> texRef0;
+texture<float4, 1, cudaReadModeElementType> texRef1;
+texture<float4, 1, cudaReadModeElementType> texRef2;
+texture<float4, 1, cudaReadModeElementType> texRef3;
+#define STACK(y) stack[(y)*blockDim.x+threadIdx.x]
+#define VARIABLE(y) variables[(y)*blockDim.x+threadIdx.x]
+template<int SIZE>
+__device__ float kEvaluateExpression_kernel(Expression<SIZE>* expression, float* stack, float* variables)
+{
+    int stackPointer = -1;
+    for (int i = 0; i < expression->length; i++)
+    {
+        int op = expression->op[i];
+        if (op < MULTIPLY) {
+            STACK(++stackPointer) = VARIABLE(op-VARIABLE0);
+        }
+        else if (op < NEGATE) {
+            if (op < MULTIPLY_CONSTANT) {
+                if (op == MULTIPLY) {
+                    float temp = STACK(stackPointer);
+                    STACK(--stackPointer) *= temp;
+                }
+                else if (op == DIVIDE) {
+                    float temp = STACK(stackPointer);
+                    STACK(stackPointer) = temp/STACK(--stackPointer);
+                }
+                else if (op == ADD) {
+                    float temp = STACK(stackPointer);
+                    STACK(--stackPointer) += temp;
+                }
+                else if (op == SUBTRACT) {
+                    float temp = STACK(stackPointer);
+                    STACK(stackPointer) = temp-STACK(--stackPointer);
+                }
+                else /*if (op == POWER)*/ {
+                    float temp = STACK(stackPointer);
+                    STACK(stackPointer) = pow(temp, STACK(--stackPointer));
+                }
+            }
+            else if (op < GLOBAL) {
+                if (op == MULTIPLY_CONSTANT) {
+                    STACK(stackPointer) *= expression->arg[i];
+                }
+                else if (op == POWER_CONSTANT) {
+                    STACK(stackPointer) = pow(STACK(stackPointer), expression->arg[i]);
+                }
+                else /*if (op == ADD_CONSTANT)*/ {
+                    STACK(stackPointer) += expression->arg[i];
+                }
+            }
+            else {
+                if (op == GLOBAL) {
+                    STACK(++stackPointer) = globalParams[(int) expression->arg[i]];
+                }
+                else if (op == CONSTANT) {
+                    STACK(++stackPointer) = expression->arg[i];
+                }
+                else /*if (op == CUSTOM || op == CUSTOM_DERIV)*/ {
+                    int function = (int) expression->arg[i];
+                    float x = STACK(stackPointer);
+                    float4 params = cSim.pTabulatedFunctionParams[function];
+                    if (x < params.x || x > params.y)
+                        STACK(stackPointer) = 0.0f;
+                    else
+                    {
+                        int index = floor((x-params.x)*params.z);
+                        float4 coeff;
+                        if (function == 0)
+                            coeff = tex1Dfetch(texRef0, index);
+                        else if (function == 1)
+                            coeff = tex1Dfetch(texRef1, index);
+                        else if (function == 2)
+                            coeff = tex1Dfetch(texRef2, index);
+                        else
+                            coeff = tex1Dfetch(texRef3, index);
+                        x = (x-params.x)*params.z-index;
+                        if (op == CUSTOM)
+                            STACK(stackPointer) = coeff.x+x*(coeff.y+x*(coeff.z+x*coeff.w));
+                        else
+                            STACK(stackPointer) = (coeff.y+x*(2.0f*coeff.z+x*3.0f*coeff.w))*params.z;
+                    }
+                }
+            }
+        }
+        else {
+            if (op < SIN) {
+                if (op == NEGATE) {
+                    STACK(stackPointer) *= -1.0f;
+                }
+                else if (op == RECIPROCAL) {
+                    STACK(stackPointer) = 1.0f/STACK(stackPointer);
+                }
+                else if (op == SQRT) {
+                    STACK(stackPointer) = sqrt(STACK(stackPointer));
+                }
+                else if (op == EXP) {
+                    STACK(stackPointer) = exp(STACK(stackPointer));
+                }
+                else if (op == LOG) {
+                    STACK(stackPointer) = log(STACK(stackPointer));
+                }
+                else if (op == SQUARE) {
+                    float temp = STACK(stackPointer);
+                    STACK(stackPointer) *= temp;
+                }
+                else /*if (op == CUBE)*/ {
+                    float temp = STACK(stackPointer);
+                    STACK(stackPointer) *= temp*temp;
+                }
+            }
+            else {
+                if (op == SIN) {
+                    STACK(stackPointer) = sin(STACK(stackPointer));
+                }
+                else if (op == COS) {
+                    STACK(stackPointer) = cos(STACK(stackPointer));
+                }
+                else if (op == SEC) {
+                    STACK(stackPointer) = 1.0f/cos(STACK(stackPointer));
+                }
+                else if (op == CSC) {
+                    STACK(stackPointer) = 1.0f/sin(STACK(stackPointer));
+                }
+                else if (op == TAN) {
+                    STACK(stackPointer) = tan(STACK(stackPointer));
+                }
+                else if (op == COT) {
+                    STACK(stackPointer) = 1.0f/tan(STACK(stackPointer));
+                }
+                else if (op == ASIN) {
+                    STACK(stackPointer) = asin(STACK(stackPointer));
+                }
+                else if (op == ACOS) {
+                    STACK(stackPointer) = acos(STACK(stackPointer));
+                }
+                else if (op == ATAN) {
+                    STACK(stackPointer) = atan(STACK(stackPointer));
+                }
+                else if (op == SINH) {
+                    STACK(stackPointer) = sinh(STACK(stackPointer));
+                }
+                else if (op == COSH) {
+                    STACK(stackPointer) = cosh(STACK(stackPointer));
+                }
+                else /*if (op == TANH)*/ {
+                    STACK(stackPointer) = tanh(STACK(stackPointer));
+                }
+            }
+        }
+    }
+    return STACK(stackPointer);
+}
--- a/platforms/cuda/tests/TestCudaCustomBondForce.cpp
+++ b/platforms/cuda/tests/TestCudaCustomBondForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the Cuda implementation of CustomBondForce.
+ */
+#include "../../../tests/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/CustomBondForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include <iostream>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+const double TOL = 1e-5;
+void testBonds() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomBondForce* forceField = new CustomBondForce("scale*k*(r-r0)^2");
+    forceField->addPerBondParameter("r0");
+    forceField->addPerBondParameter("k");
+    forceField->addGlobalParameter("scale", 0.5);
+    vector<double> parameters(2);
+    parameters[0] = 1.5;
+    parameters[1] = 0.8;
+    forceField->addBond(0, 1, parameters);
+    parameters[0] = 1.2;
+    parameters[1] = 0.7;
+    forceField->addBond(1, 2, parameters);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(3);
+    positions[0] = Vec3(0, 2, 0);
+    positions[1] = Vec3(0, 0, 0);
+    positions[2] = Vec3(1, 0, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(0, -0.8*0.5, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0.7*0.2, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-forces[0][0]-forces[2][0], -forces[0][1]-forces[2][1], -forces[0][2]-forces[2][2]), forces[1], TOL);
+    ASSERT_EQUAL_TOL(0.5*0.8*0.5*0.5 + 0.5*0.7*0.2*0.2, state.getPotentialEnergy(), TOL);
+}
+int main() {
+    try {
+        testBonds();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}