Continuing to implement new CUDA platform: CustomNonbondedForce, CustomHbondForce, CustomIntegrator

bd22eada · Peter Eastman · 8eb6850d · bd22eada · bd22eada · bd22eada
Commit bd22eada authored Jun 20, 2012 by Peter Eastman
16 changed files
--- a/platforms/cuda2/src/CudaArray.cpp
+++ b/platforms/cuda2/src/CudaArray.cpp
@@ -53,7 +53,7 @@ CudaArray::~CudaArray() {
    }
 }
-void CudaArray::upload(void* data, bool blocking) {
+void CudaArray::upload(const void* data, bool blocking) {
    CUresult result;
    if (blocking)
        result = cuMemcpyHtoD(pointer, data, size*elementSize);

--- a/platforms/cuda2/src/CudaArray.h
+++ b/platforms/cuda2/src/CudaArray.h
@@ -94,7 +94,7 @@ public:
     * Copy the values in a vector to the device memory.
     */
    template <class T>
-    void upload(std::vector<T>& data) {
+    void upload(const std::vector<T>& data) {
        if (sizeof(T) != elementSize || data.size() != size)
            throw OpenMMException("Error uploading array "+name+": The specified vector does not match the size of the array");
        upload(&data[0], true);
@@ -117,7 +117,7 @@ public:
     * @param blocking if true, this call will block until the transfer is complete.  If false,
     *                 the source array must be in page-locked memory.
     */
-    void upload(void* data, bool blocking = true);
+    void upload(const void* data, bool blocking = true);
    /**
     * Copy the values in the device memory to an array.
     * 

--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -945,6 +945,10 @@ void CudaContext::reorderAtoms(bool enforcePeriodic) {
        reorderListeners[i]->execute();
 }
+void CudaContext::addReorderListener(ReorderListener* listener) {
+    reorderListeners.push_back(listener);
+}
 struct CudaContext::WorkThread::ThreadData {
    ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting,  bool& finished,
            pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :

--- a/platforms/cuda2/src/CudaKernelFactory.cpp
+++ b/platforms/cuda2/src/CudaKernelFactory.cpp
@@ -94,16 +94,16 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcNonbondedForceKernel::Name())
        return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
-//    if (name == CalcCustomNonbondedForceKernel::Name())
+    if (name == CalcCustomNonbondedForceKernel::Name())
-//        return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
+        return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcGBSAOBCForceKernel::Name())
 //        return new CudaCalcGBSAOBCForceKernel(name, platform, cu);
 //    if (name == CalcCustomGBForceKernel::Name())
 //        return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomExternalForceKernel::Name())
        return new CudaCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
-//    if (name == CalcCustomHbondForceKernel::Name())
+    if (name == CalcCustomHbondForceKernel::Name())
-//        return new CudaCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
+        return new CudaCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomCompoundBondForceKernel::Name())
        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
    if (name == IntegrateVerletStepKernel::Name())
@@ -116,8 +116,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaIntegrateVariableVerletStepKernel(name, platform, cu);
    if (name == IntegrateVariableLangevinStepKernel::Name())
        return new CudaIntegrateVariableLangevinStepKernel(name, platform, cu);
-//    if (name == IntegrateCustomStepKernel::Name())
+    if (name == IntegrateCustomStepKernel::Name())
-//        return new CudaIntegrateCustomStepKernel(name, platform, cu);
+        return new CudaIntegrateCustomStepKernel(name, platform, cu);
    if (name == ApplyAndersenThermostatKernel::Name())
        return new CudaApplyAndersenThermostatKernel(name, platform, cu);
    if (name == ApplyMonteCarloBarostatKernel::Name())

--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
--- a/platforms/cuda2/src/CudaKernels.h
+++ b/platforms/cuda2/src/CudaKernels.h
@@ -623,50 +623,49 @@ private:
    static const int PmeOrder = 5;
 };
-///**
+/**
-// * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
+ * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
-// */
+ */
-//class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
+class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
-//public:
+public:
-//    CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomNonbondedForceKernel(name, platform),
+    CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomNonbondedForceKernel(name, platform),
-//            hasInitializedKernel(false), cu(cu), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
+            cu(cu), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
-//    }
+    }
-//    ~CudaCalcCustomNonbondedForceKernel();
+    ~CudaCalcCustomNonbondedForceKernel();
-//    /**
+    /**
-//     * Initialize the kernel.
+     * Initialize the kernel.
-//     *
+     *
-//     * @param system     the System this kernel will be applied to
+     * @param system     the System this kernel will be applied to
-//     * @param force      the CustomNonbondedForce this kernel will be used for
+     * @param force      the CustomNonbondedForce this kernel will be used for
-//     */
+     */
-//    void initialize(const System& system, const CustomNonbondedForce& force);
+    void initialize(const System& system, const CustomNonbondedForce& force);
-//    /**
+    /**
-//     * Execute the kernel to calculate the forces and/or energy.
+     * Execute the kernel to calculate the forces and/or energy.
-//     *
+     *
-//     * @param context        the context in which to execute this kernel
+     * @param context        the context in which to execute this kernel
-//     * @param includeForces  true if forces should be calculated
+     * @param includeForces  true if forces should be calculated
-//     * @param includeEnergy  true if the energy should be calculated
+     * @param includeEnergy  true if the energy should be calculated
-//     * @return the potential energy due to the force
+     * @return the potential energy due to the force
-//     */
+     */
-//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-//    /**
+    /**
-//     * Copy changed parameters over to a context.
+     * Copy changed parameters over to a context.
-//     *
+     *
-//     * @param context    the context to copy parameters to
+     * @param context    the context to copy parameters to
-//     * @param force      the CustomNonbondedForce to copy the parameters from
+     * @param force      the CustomNonbondedForce to copy the parameters from
-//     */
+     */
-//    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
+    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
-//private:
+private:
-//    bool hasInitializedKernel;
+    CudaContext& cu;
-//    CudaContext& cu;
+    CudaParameterSet* params;
-//    CudaParameterSet* params;
+    CudaArray* globals;
-//    CudaArray<cl_float>* globals;
+    CudaArray* tabulatedFunctionParams;
-//    CudaArray<mm_float4>* tabulatedFunctionParams;
+    std::vector<std::string> globalParamNames;
-//    std::vector<std::string> globalParamNames;
+    std::vector<float> globalParamValues;
-//    std::vector<cl_float> globalParamValues;
+    std::vector<CudaArray*> tabulatedFunctions;
-//    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
+    System& system;
-//    System& system;
+};
-//};
-//
 ///**
 // * This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
 // */
@@ -814,60 +813,58 @@ private:
    std::vector<float> globalParamValues;
 };
-///**
+/**
-// * This kernel is invoked by CustomHbondForce to calculate the forces acting on the system.
+ * This kernel is invoked by CustomHbondForce to calculate the forces acting on the system.
-// */
+ */
-//class CudaCalcCustomHbondForceKernel : public CalcCustomHbondForceKernel {
+class CudaCalcCustomHbondForceKernel : public CalcCustomHbondForceKernel {
-//public:
+public:
-//    CudaCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomHbondForceKernel(name, platform),
+    CudaCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomHbondForceKernel(name, platform),
-//            hasInitializedKernel(false), cu(cu), donorParams(NULL), acceptorParams(NULL), donors(NULL), acceptors(NULL),
+            hasInitializedKernel(false), cu(cu), donorParams(NULL), acceptorParams(NULL), donors(NULL), acceptors(NULL),
-//            donorBufferIndices(NULL), acceptorBufferIndices(NULL), globals(NULL), donorExclusions(NULL), acceptorExclusions(NULL),
+            globals(NULL), donorExclusions(NULL), acceptorExclusions(NULL), tabulatedFunctionParams(NULL), system(system) {
-//            tabulatedFunctionParams(NULL), system(system) {
+    }
-//    }
+    ~CudaCalcCustomHbondForceKernel();
-//    ~CudaCalcCustomHbondForceKernel();
+    /**
-//    /**
+     * Initialize the kernel.
-//     * Initialize the kernel.
+     *
-//     *
+     * @param system     the System this kernel will be applied to
-//     * @param system     the System this kernel will be applied to
+     * @param force      the CustomHbondForce this kernel will be used for
-//     * @param force      the CustomHbondForce this kernel will be used for
+     */
-//     */
+    void initialize(const System& system, const CustomHbondForce& force);
-//    void initialize(const System& system, const CustomHbondForce& force);
+    /**
-//    /**
+     * Execute the kernel to calculate the forces and/or energy.
-//     * Execute the kernel to calculate the forces and/or energy.
+     *
-//     *
+     * @param context        the context in which to execute this kernel
-//     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
-//     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
-//     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
-//     * @return the potential energy due to the force
+     */
-//     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
-//    /**
+     * Copy changed parameters over to a context.
-//     * Copy changed parameters over to a context.
+     *
-//     *
+     * @param context    the context to copy parameters to
-//     * @param context    the context to copy parameters to
+     * @param force      the CustomHbondForce to copy the parameters from
-//     * @param force      the CustomHbondForce to copy the parameters from
+     */
-//     */
+    void copyParametersToContext(ContextImpl& context, const CustomHbondForce& force);
-//    void copyParametersToContext(ContextImpl& context, const CustomHbondForce& force);
+private:
-//private:
+    int numDonors, numAcceptors;
-//    int numDonors, numAcceptors;
+    bool hasInitializedKernel;
-//    bool hasInitializedKernel;
+    CudaContext& cu;
-//    CudaContext& cu;
+    CudaParameterSet* donorParams;
-//    CudaParameterSet* donorParams;
+    CudaParameterSet* acceptorParams;
-//    CudaParameterSet* acceptorParams;
+    CudaArray* globals;
-//    CudaArray<cl_float>* globals;
+    CudaArray* donors;
-//    CudaArray<mm_int4>* donors;
+    CudaArray* acceptors;
-//    CudaArray<mm_int4>* acceptors;
+    CudaArray* donorExclusions;
-//    CudaArray<mm_int4>* donorBufferIndices;
+    CudaArray* acceptorExclusions;
-//    CudaArray<mm_int4>* acceptorBufferIndices;
+    CudaArray* tabulatedFunctionParams;
-//    CudaArray<mm_int4>* donorExclusions;
+    std::vector<std::string> globalParamNames;
-//    CudaArray<mm_int4>* acceptorExclusions;
+    std::vector<float> globalParamValues;
-//    CudaArray<mm_float4>* tabulatedFunctionParams;
+    std::vector<CudaArray*> tabulatedFunctions;
-//    std::vector<std::string> globalParamNames;
+    std::vector<void*> donorArgs, acceptorArgs;
-//    std::vector<cl_float> globalParamValues;
+    System& system;
-//    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
+    CUfunction donorKernel, acceptorKernel;
-//    System& system;
+};
-//    CUfunction donorKernel, acceptorKernel;
-//};
 /**
 * This kernel is invoked by CustomCompoundBondForce to calculate the forces acting on the system.
@@ -1062,94 +1059,98 @@ private:
    double prevTemp, prevFriction, prevErrorTol;
 };
-///**
+/**
-// * This kernel is invoked by CustomIntegrator to take one time step.
+ * This kernel is invoked by CustomIntegrator to take one time step.
-// */
+ */
-//class CudaIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
+class CudaIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
-//public:
+public:
-//    CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateCustomStepKernel(name, platform), cu(cu),
+    CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateCustomStepKernel(name, platform), cu(cu),
-//            hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), contextParameterValues(NULL), sumBuffer(NULL), energy(NULL),
+            hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), contextParameterValues(NULL), sumBuffer(NULL), energy(NULL),
-//            uniformRandoms(NULL), randomSeed(NULL), perDofValues(NULL) {
+            uniformRandoms(NULL), randomSeed(NULL), perDofValues(NULL) {
-//    }
+    }
-//    ~CudaIntegrateCustomStepKernel();
+    ~CudaIntegrateCustomStepKernel();
-//    /**
+    /**
-//     * Initialize the kernel.
+     * Initialize the kernel.
-//     * 
+     * 
-//     * @param system     the System this kernel will be applied to
+     * @param system     the System this kernel will be applied to
-//     * @param integrator the CustomIntegrator this kernel will be used for
+     * @param integrator the CustomIntegrator this kernel will be used for
-//     */
+     */
-//    void initialize(const System& system, const CustomIntegrator& integrator);
+    void initialize(const System& system, const CustomIntegrator& integrator);
-//    /**
+    /**
-//     * Execute the kernel.
+     * Execute the kernel.
-//     * 
+     * 
-//     * @param context    the context in which to execute this kernel
+     * @param context    the context in which to execute this kernel
-//     * @param integrator the CustomIntegrator this kernel is being used for
+     * @param integrator the CustomIntegrator this kernel is being used for
-//     * @param forcesAreValid if the context has been modified since the last time step, this will be
+     * @param forcesAreValid if the context has been modified since the last time step, this will be
-//     *                       false to show that cached forces are invalid and must be recalculated.
+     *                       false to show that cached forces are invalid and must be recalculated.
-//     *                       On exit, this should specify whether the cached forces are valid at the
+     *                       On exit, this should specify whether the cached forces are valid at the
-//     *                       end of the step.
+     *                       end of the step.
-//     */
+     */
-//    void execute(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
+    void execute(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
-//    /**
+    /**
-//     * Get the values of all global variables.
+     * Get the values of all global variables.
-//     *
+     *
-//     * @param context   the context in which to execute this kernel
+     * @param context   the context in which to execute this kernel
-//     * @param values    on exit, this contains the values
+     * @param values    on exit, this contains the values
-//     */
+     */
-//    void getGlobalVariables(ContextImpl& context, std::vector<double>& values) const;
+    void getGlobalVariables(ContextImpl& context, std::vector<double>& values) const;
-//    /**
+    /**
-//     * Set the values of all global variables.
+     * Set the values of all global variables.
-//     *
+     *
-//     * @param context   the context in which to execute this kernel
+     * @param context   the context in which to execute this kernel
-//     * @param values    a vector containing the values
+     * @param values    a vector containing the values
-//     */
+     */
-//    void setGlobalVariables(ContextImpl& context, const std::vector<double>& values);
+    void setGlobalVariables(ContextImpl& context, const std::vector<double>& values);
-//    /**
+    /**
-//     * Get the values of a per-DOF variable.
+     * Get the values of a per-DOF variable.
-//     *
+     *
-//     * @param context   the context in which to execute this kernel
+     * @param context   the context in which to execute this kernel
-//     * @param variable  the index of the variable to get
+     * @param variable  the index of the variable to get
-//     * @param values    on exit, this contains the values
+     * @param values    on exit, this contains the values
-//     */
+     */
-//    void getPerDofVariable(ContextImpl& context, int variable, std::vector<Vec3>& values) const;
+    void getPerDofVariable(ContextImpl& context, int variable, std::vector<Vec3>& values) const;
-//    /**
+    /**
-//     * Set the values of a per-DOF variable.
+     * Set the values of a per-DOF variable.
-//     *
+     *
-//     * @param context   the context in which to execute this kernel
+     * @param context   the context in which to execute this kernel
-//     * @param variable  the index of the variable to get
+     * @param variable  the index of the variable to set
-//     * @param values    a vector containing the values
+     * @param values    a vector containing the values
-//     */
+     */
-//    void setPerDofVariable(ContextImpl& context, int variable, const std::vector<Vec3>& values);
+    void setPerDofVariable(ContextImpl& context, int variable, const std::vector<Vec3>& values);
-//private:
+private:
-//    class ReorderListener;
+    class ReorderListener;
-//    std::string createGlobalComputation(const std::string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator, const std::string& energyName);
+    std::string createGlobalComputation(const std::string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator, const std::string& energyName);
-//    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
+    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
-//    void recordChangedParameters(ContextImpl& context);
+    void recordChangedParameters(ContextImpl& context);
-//    CudaContext& cu;
+    CudaContext& cu;
-//    double prevStepSize;
+    double prevStepSize;
-//    int numGlobalVariables;
+    int numGlobalVariables;
-//    bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters;
+    bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters;
-//    mutable bool localValuesAreCurrent;
+    mutable bool localValuesAreCurrent;
-//    CudaArray<cl_float>* globalValues;
+    CudaArray* globalValues;
-//    CudaArray<cl_float>* contextParameterValues;
+    CudaArray* contextParameterValues;
-//    CudaArray<cl_float>* sumBuffer;
+    CudaArray* sumBuffer;
-//    CudaArray<cl_float>* energy;
+    CudaArray* energy;
-//    CudaArray<mm_float4>* uniformRandoms;
+    CudaArray* uniformRandoms;
-//    CudaArray<mm_int4>* randomSeed;
+    CudaArray* randomSeed;
-//    CudaParameterSet* perDofValues;
+    CudaParameterSet* perDofValues;
-//    mutable std::vector<std::vector<cl_float> > localPerDofValues;
+    mutable std::vector<std::vector<float> > localPerDofValuesFloat;
-//    std::vector<std::vector<CUfunction> > kernels;
+    mutable std::vector<std::vector<double> > localPerDofValuesDouble;
-//    CUfunction sumEnergyKernel, randomKernel;
+    std::vector<float> contextValuesFloat;
-//    std::vector<CustomIntegrator::ComputationType> stepType;
+    std::vector<double> contextValuesDouble;
-//    std::vector<bool> needsForces;
+    std::vector<std::vector<CUfunction> > kernels;
-//    std::vector<bool> needsEnergy;
+    std::vector<std::vector<std::vector<void*> > > kernelArgs;
-//    std::vector<bool> invalidatesForces;
+    CUfunction sumEnergyKernel, randomKernel;
-//    std::vector<bool> merged;
+    std::vector<CustomIntegrator::ComputationType> stepType;
-//    std::vector<int> forceGroup;
+    std::vector<bool> needsForces;
-//    std::vector<int> requiredGaussian;
+    std::vector<bool> needsEnergy;
-//    std::vector<int> requiredUniform;
+    std::vector<bool> invalidatesForces;
-//    std::vector<std::string> parameterNames;
+    std::vector<bool> merged;
-//};
+    std::vector<int> forceGroup;
+    std::vector<int> requiredGaussian;
+    std::vector<int> requiredUniform;
+    std::vector<std::string> parameterNames;
+};
 /**
 * This kernel is invoked by AndersenThermostat at the start of each time step to adjust the particle velocities.

--- a/platforms/cuda2/src/CudaParameterSet.cpp
+++ b/platforms/cuda2/src/CudaParameterSet.cpp
@@ -39,11 +39,12 @@ using namespace std;
        throw OpenMMException(m.str());\
    }
-CudaParameterSet::CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter) :
+CudaParameterSet::CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter, bool useDoublePrecision) :
            context(context), numParameters(numParameters), numObjects(numObjects), name(name) {
    int params = numParameters;
    int bufferCount = 0;
-    int elementSize = 4;
+    elementSize = (useDoublePrecision ? sizeof(double) : sizeof(float));
+    string elementType = (useDoublePrecision ? "double" : "float");
    CUdeviceptr pointer;
    string errorMessage = "Error creating parameter set "+name;
    if (!bufferPerParameter) {
@@ -51,14 +52,14 @@ CudaParameterSet::CudaParameterSet(CudaContext& context, int numParameters, int
            CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize*4));
            std::stringstream name;
            name << "param" << (++bufferCount);
-            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), "float", 4, elementSize*4, pointer));
+            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), elementType, 4, elementSize*4, pointer));
            params -= 4;
        }
        if (params > 1) {
            CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize*2));
            std::stringstream name;
            name << "param" << (++bufferCount);
-            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), "float", 2, elementSize*2, pointer));
+            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), elementType, 2, elementSize*2, pointer));
            params -= 2;
        }
    }
@@ -66,50 +67,55 @@ CudaParameterSet::CudaParameterSet(CudaContext& context, int numParameters, int
            CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize));
        std::stringstream name;
        name << "param" << (++bufferCount);
-        buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), "float", 1, elementSize, pointer));
+        buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), elementType, 1, elementSize, pointer));
        params--;
    }
 }
 CudaParameterSet::~CudaParameterSet() {
-    string errorMessage = "Error freeing device memory";
+    if (context.getContextIsValid()) {
-    for (int i = 0; i < (int) buffers.size(); i++)
+        string errorMessage = "Error freeing device memory";
-        CHECK_RESULT(cuMemFree(buffers[i].getMemory()));
+        for (int i = 0; i < (int) buffers.size(); i++)
+            CHECK_RESULT(cuMemFree(buffers[i].getMemory()));
+    }
 }
-void CudaParameterSet::getParameterValues(vector<vector<float> >& values) {
+template <class T>
+void CudaParameterSet::getParameterValues(vector<vector<T> >& values) {
+    if (sizeof(T) != elementSize)
+        throw OpenMMException("Called getParameterValues() with vector of wrong type");
    values.resize(numObjects);
    for (int i = 0; i < numObjects; i++)
        values[i].resize(numParameters);
    int base = 0;
    string errorMessage = "Error downloading parameter set "+name;
    for (int i = 0; i < (int) buffers.size(); i++) {
-        if (buffers[i].getType() == "float4") {
+        if (buffers[i].getSize() == 4*elementSize) {
-            vector<float4> data(numObjects);
+            vector<T> data(4*numObjects);
            CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
            for (int j = 0; j < numObjects; j++) {
-                values[j][base] = data[j].x;
+                values[j][base] = data[4*j];
                if (base+1 < numParameters)
-                    values[j][base+1] = data[j].y;
+                    values[j][base+1] = data[4*j+1];
                if (base+2 < numParameters)
-                    values[j][base+2] = data[j].z;
+                    values[j][base+2] = data[4*j+2];
                if (base+3 < numParameters)
-                    values[j][base+3] = data[j].w;
+                    values[j][base+3] = data[4*j+3];
            }
            base += 4;
        }
-        else if (buffers[i].getType() == "float2") {
+        else if (buffers[i].getSize() == 2*elementSize) {
-            vector<float2> data(numObjects);
+            vector<T> data(2*numObjects);
            CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
            for (int j = 0; j < numObjects; j++) {
-                values[j][base] = data[j].x;
+                values[j][base] = data[2*j];
                if (base+1 < numParameters)
-                    values[j][base+1] = data[j].y;
+                    values[j][base+1] = data[2*j+1];
            }
            base += 2;
        }
-        else if (buffers[i].getType() == "float") {
+        else if (buffers[i].getSize() == elementSize) {
-            vector<float> data(numObjects);
+            vector<T> data(numObjects);
            CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
            for (int j = 0; j < numObjects; j++)
                values[j][base] = data[j];
@@ -120,36 +126,39 @@ void CudaParameterSet::getParameterValues(vector<vector<float> >& values) {
    }
 }
-void CudaParameterSet::setParameterValues(const vector<vector<float> >& values) {
+template <class T>
+void CudaParameterSet::setParameterValues(const vector<vector<T> >& values) {
+    if (sizeof(T) != elementSize)
+        throw OpenMMException("Called setParameterValues() with vector of wrong type");
    int base = 0;
    string errorMessage = "Error uploading parameter set "+name;
    for (int i = 0; i < (int) buffers.size(); i++) {
-        if (buffers[i].getType() == "float4") {
+        if (buffers[i].getSize() == 4*elementSize) {
-            vector<float4> data(numObjects);
+            vector<T> data(4*numObjects);
            for (int j = 0; j < numObjects; j++) {
-                data[j].x = values[j][base];
+                data[4*j] = values[j][base];
                if (base+1 < numParameters)
-                    data[j].y = values[j][base+1];
+                    data[4*j+1] = values[j][base+1];
                if (base+2 < numParameters)
-                    data[j].z = values[j][base+2];
+                    data[4*j+2] = values[j][base+2];
                if (base+3 < numParameters)
-                    data[j].w = values[j][base+3];
+                    data[4*j+3] = values[j][base+3];
            }
            CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
            base += 4;
        }
-        else if (buffers[i].getType() == "float2") {
+        else if (buffers[i].getSize() == 2*elementSize) {
-            vector<float2> data(numObjects);
+            vector<T> data(2*numObjects);
            for (int j = 0; j < numObjects; j++) {
-                data[j].x = values[j][base];
+                data[2*j] = values[j][base];
                if (base+1 < numParameters)
-                    data[j].y = values[j][base+1];
+                    data[2*j+1] = values[j][base+1];
            }
            CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
            base += 2;
        }
-        else if (buffers[i].getType() == "float") {
+        else if (buffers[i].getSize() == elementSize) {
-            vector<float> data(numObjects);
+            vector<T> data(numObjects);
            for (int j = 0; j < numObjects; j++)
                data[j] = values[j][base];
            CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
@@ -164,16 +173,26 @@ string CudaParameterSet::getParameterSuffix(int index, const std::string& extraS
    const string suffixes[] = {".x", ".y", ".z", ".w"};
    int buffer = -1;
    for (int i = 0; buffer == -1 && i < (int) buffers.size(); i++) {
-        if (index*sizeof(float) < buffers[i].getSize())
+        if (index*elementSize < buffers[i].getSize())
            buffer = i;
        else
-            index -= buffers[i].getSize()/sizeof(float);
+            index -= buffers[i].getSize()/elementSize;
    }
    if (buffer == -1)
        throw OpenMMException("Internal error: Illegal argument to CudaParameterSet::getParameterSuffix() ("+name+")");
    stringstream suffix;
    suffix << (buffer+1) << extraSuffix;
-    if (buffers[buffer].getType() != "float")
+    if (buffers[buffer].getSize() != elementSize)
        suffix << suffixes[index];
    return suffix.str();
 }
+/**
+ * Define template instantiations for float and double versions of getParameterValues() and setParameterValues().
+ */
+namespace OpenMM {
+template void CudaParameterSet::getParameterValues<float>(vector<vector<float> >& values);
+template void CudaParameterSet::setParameterValues<float>(const vector<vector<float> >& values);
+template void CudaParameterSet::getParameterValues<double>(vector<vector<double> >& values);
+template void CudaParameterSet::setParameterValues<double>(const vector<vector<double> >& values);
+}
\ No newline at end of file
--- a/platforms/cuda2/src/CudaParameterSet.h
+++ b/platforms/cuda2/src/CudaParameterSet.h
@@ -51,8 +51,9 @@ public:
     * @param name             the name of the parameter set
     * @param bufferPerParameter  if true, a separate buffer is created for each parameter.  If false,
     *                            multiple parameters may be combined into a single buffer.
+     * @param useDoublePrecision  whether values should be stored as single or double precision
     */
-    CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false);
+    CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false, bool useDoublePrecision=false);
    ~CudaParameterSet();
    /**
     * Get the number of parameters.
@@ -71,13 +72,15 @@ public:
     *
     * @param values on exit, values[i][j] contains the value of parameter j for object i
     */
-    void getParameterValues(std::vector<std::vector<float> >& values);
+    template <class T>
+    void getParameterValues(std::vector<std::vector<T> >& values);
    /**
     * Set the values of all parameters.
     *
     * @param values values[i][j] contains the value of parameter j for object i
     */
-    void setParameterValues(const std::vector<std::vector<float> >& values);
+    template <class T>
+    void setParameterValues(const std::vector<std::vector<T> >& values);
    /**
     * Get a set of CudaNonbondedUtilities::ParameterInfo objects which describe the Buffers
     * containing the data.
@@ -95,8 +98,7 @@ public:
    std::string getParameterSuffix(int index, const std::string& extraSuffix = "") const;
 private:
    CudaContext& context;
-    int numParameters;
+    int numParameters, numObjects, elementSize;
-    int numObjects;
    std::string name;
    std::vector<CudaNonbondedUtilities::ParameterInfo> buffers;
 };

--- a/platforms/cuda2/src/kernels/customHbondForce.cu
+++ b/platforms/cuda2/src/kernels/customHbondForce.cu
+/**
+ * Drop the w component of a real4 and return its x, y and z components as a real3.
+ */
+inline __device__ real3 trim(real4 v) {
+    real3 xyz = make_real3(v.x, v.y, v.z);
+    return xyz;
+}
+/**
+ * Identity overload of trim(): the argument is already a real3 and is returned unchanged.
+ * This does nothing, and just exists to simplify the code generation (presumably so that
+ * generated code can call trim() without knowing whether it holds a real3 or a real4).
+ */
+inline __device__ real3 trim(real3 v) {
+    return v;
+}
+/**
+ * Return vec1-vec2 in the x, y and z components, with the squared length of that
+ * difference stored in the w component.
+ */
+inline __device__ real4 delta(real4 vec1, real4 vec2) {
+    real dx = vec1.x-vec2.x;
+    real dy = vec1.y-vec2.y;
+    real dz = vec1.z-vec2.z;
+    return make_real4(dx, dy, dz, dx*dx + dy*dy + dz*dz);
+}
+/**
+ * Return vec1-vec2 in the x, y and z components, with the squared length of the result
+ * stored in the w component.  When USE_PERIODIC is defined, each component is first
+ * shifted by a whole number of box lengths using periodicBoxSize and invPeriodicBoxSize;
+ * otherwise the two box arguments are ignored.
+ */
+inline __device__ real4 deltaPeriodic(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+    real dx = vec1.x-vec2.x;
+    real dy = vec1.y-vec2.y;
+    real dz = vec1.z-vec2.z;
+#ifdef USE_PERIODIC
+    // Subtract the nearest whole number of box lengths along each axis.
+    dx -= floor(dx*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+    dy -= floor(dy*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+    dz -= floor(dz*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+    return make_real4(dx, dy, dz, dx*dx + dy*dy + dz*dz);
+}
+/**
+ * Return the angle between vec1 and vec2.  The w component of each argument must already
+ * hold that vector's squared magnitude (as produced by delta() or deltaPeriodic()).
+ */
+inline __device__ real computeAngle(real4 vec1, real4 vec2) {
+    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
+    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
+    bool nearSingularity = (cosine > 0.99f || cosine < -0.99f);
+    if (!nearSingularity)
+        return acos(cosine);
+    // acos() loses accuracy this close to +/-1, so recover the angle from the
+    // magnitude of the cross product with asin() instead.
+    real3 crossProduct = cross(vec1, vec2);
+    real scale = vec1.w*vec2.w;
+    real angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
+    if (cosine < 0.0f) {
+        // asin() only yields the acute solution; reflect it for obtuse angles.
+        angle = M_PI-angle;
+    }
+    return angle;
+}
+/**
+ * Return the cross product of vec1 and vec2 in the x, y and z components, with the
+ * squared magnitude of that product stored in the w component.
+ */
+inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
+    real3 product = cross(vec1, vec2);
+    real lengthSquared = product.x*product.x + product.y*product.y + product.z*product.z;
+    return make_real4(product.x, product.y, product.z, lengthSquared);
+}
+/**
+ * Compute forces on donors.
+ *
+ * Each thread is assigned one donor per pass of the outer loop and visits every acceptor,
+ * which the block stages through shared memory one tile of blockDim.x acceptors at a time.
+ * The per-pair arithmetic is supplied by the COMPUTE_DONOR_FORCE macro, which is expected
+ * to accumulate into f1, f2, f3 and energy using the local names declared here.
+ *
+ * Launch requirements visible from the code:
+ *  - dynamic shared memory for at least 3*blockDim.x real4 values (posBuffer);
+ *  - energyBuffer must hold one element per launched thread.
+ *
+ * @param force               fixed point force accumulators: x at [atom], y at [atom+PADDED_NUM_ATOMS],
+ *                            z at [atom+2*PADDED_NUM_ATOMS]; values are scaled by 0xFFFFFFFF
+ * @param energyBuffer        per-thread energy accumulators; this thread's total is added to its slot
+ * @param posq                atom positions
+ * @param exclusions          for each donor, up to four acceptor indices to skip (read only if USE_EXCLUSIONS)
+ * @param donorAtoms          atom indices making up each donor; -1 marks an unused slot
+ * @param acceptorAtoms       atom indices making up each acceptor; -1 marks an unused slot
+ * @param periodicBoxSize     periodic box dimensions, passed to deltaPeriodic()
+ * @param invPeriodicBoxSize  reciprocal box dimensions, passed to deltaPeriodic()
+ */
+extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize
+        PARAMETER_ARGUMENTS) {
+    extern __shared__ real4 posBuffer[];
+    real energy = 0;
+    for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x*gridDim.x) {
+        // The force accumulators belong to the donor handled in this pass only.  They must start
+        // from zero on every pass; otherwise the force of a previous donor would be added to the
+        // atoms of this one when there are more donors than launched threads.
+        real3 f1 = make_real3(0);
+        real3 f2 = make_real3(0);
+        real3 f3 = make_real3(0);
+        // Load information about the donor this thread will compute forces on.
+        int donorIndex = donorStart+blockIdx.x*blockDim.x+threadIdx.x;
+        int4 atoms, exclusionIndices;
+        real4 d1, d2, d3;
+        if (donorIndex < NUM_DONORS) {
+            atoms = donorAtoms[donorIndex];
+            d1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
+            d2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
+            d3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
+#ifdef USE_EXCLUSIONS
+            exclusionIndices = exclusions[donorIndex];
+#endif
+        }
+        else
+            atoms = make_int4(-1, -1, -1, -1);
+        for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x) {
+            // Wait until every thread has finished reading the previous tile before overwriting it.
+            // Both loop bounds are uniform across the block, so all threads reach this barrier.
+            __syncthreads();
+            // Load the next block of acceptors into local memory.
+            int blockSize = min((int) blockDim.x, NUM_ACCEPTORS-acceptorStart);
+            if (threadIdx.x < blockSize) {
+                int4 atoms2 = acceptorAtoms[acceptorStart+threadIdx.x];
+                posBuffer[3*threadIdx.x] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
+                posBuffer[3*threadIdx.x+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
+                posBuffer[3*threadIdx.x+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
+            }
+            // Make the freshly loaded tile visible to the whole block.
+            __syncthreads();
+            if (donorIndex < NUM_DONORS) {
+                for (int index = 0; index < blockSize; index++) {
+#ifdef USE_EXCLUSIONS
+                    int acceptorIndex = acceptorStart+index;
+                    if (acceptorIndex == exclusionIndices.x || acceptorIndex == exclusionIndices.y || acceptorIndex == exclusionIndices.z || acceptorIndex == exclusionIndices.w)
+                        continue;
+#endif
+                    // Compute the interaction between a donor and an acceptor.
+                    real4 a1 = posBuffer[3*index];
+                    real4 a2 = posBuffer[3*index+1];
+                    real4 a3 = posBuffer[3*index+2];
+                    real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
+#ifdef USE_CUTOFF
+                    if (deltaD1A1.w < CUTOFF_SQUARED) {
+#endif
+                        COMPUTE_DONOR_FORCE
+#ifdef USE_CUTOFF
+                    }
+#endif
+                }
+            }
+        }
+        // Write results.  Forces are converted to 64 bit fixed point (scaled by 0xFFFFFFFF) and
+        // added atomically, since several donors and acceptors may share an atom.
+        if (donorIndex < NUM_DONORS) {
+            if (atoms.x > -1) {
+                atomicAdd(&force[atoms.x], static_cast<unsigned long long>((long long) (f1.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+            if (atoms.y > -1) {
+                atomicAdd(&force[atoms.y], static_cast<unsigned long long>((long long) (f2.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+            if (atoms.z > -1) {
+                atomicAdd(&force[atoms.z], static_cast<unsigned long long>((long long) (f3.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.z+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.z+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+        }
+    }
+    // The energy is summed over every donor this thread handled and written once.
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
+/**
+ * Compute forces on acceptors.
+ *
+ * Each thread is assigned one acceptor per pass of the outer loop and visits every donor,
+ * which the block stages through shared memory one tile of blockDim.x donors at a time.
+ * The per-pair arithmetic is supplied by the COMPUTE_ACCEPTOR_FORCE macro, which is expected
+ * to accumulate into f1, f2 and f3 using the local names declared here.  This kernel does not
+ * write to energyBuffer.
+ *
+ * Launch requirement visible from the code: dynamic shared memory for at least
+ * 3*blockDim.x real4 values (posBuffer).
+ *
+ * @param force               fixed point force accumulators: x at [atom], y at [atom+PADDED_NUM_ATOMS],
+ *                            z at [atom+2*PADDED_NUM_ATOMS]; values are scaled by 0xFFFFFFFF
+ * @param energyBuffer        not accessed directly by this kernel
+ * @param posq                atom positions
+ * @param exclusions          for each acceptor, up to four donor indices to skip (read only if USE_EXCLUSIONS)
+ * @param donorAtoms          atom indices making up each donor; -1 marks an unused slot
+ * @param acceptorAtoms       atom indices making up each acceptor; -1 marks an unused slot
+ * @param periodicBoxSize     periodic box dimensions, passed to deltaPeriodic()
+ * @param invPeriodicBoxSize  reciprocal box dimensions, passed to deltaPeriodic()
+ */
+extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize
+        PARAMETER_ARGUMENTS) {
+    extern __shared__ real4 posBuffer[];
+    for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x*gridDim.x) {
+        // The force accumulators belong to the acceptor handled in this pass only.  They must start
+        // from zero on every pass; otherwise the force of a previous acceptor would be added to the
+        // atoms of this one when there are more acceptors than launched threads.
+        real3 f1 = make_real3(0);
+        real3 f2 = make_real3(0);
+        real3 f3 = make_real3(0);
+        // Load information about the acceptor this thread will compute forces on.
+        int acceptorIndex = acceptorStart+blockIdx.x*blockDim.x+threadIdx.x;
+        int4 atoms, exclusionIndices;
+        real4 a1, a2, a3;
+        if (acceptorIndex < NUM_ACCEPTORS) {
+            atoms = acceptorAtoms[acceptorIndex];
+            a1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
+            a2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
+            a3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
+#ifdef USE_EXCLUSIONS
+            exclusionIndices = exclusions[acceptorIndex];
+#endif
+        }
+        else
+            atoms = make_int4(-1, -1, -1, -1);
+        for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x) {
+            // Wait until every thread has finished reading the previous tile before overwriting it.
+            // Both loop bounds are uniform across the block, so all threads reach this barrier.
+            __syncthreads();
+            // Load the next block of donors into local memory.
+            int blockSize = min((int) blockDim.x, NUM_DONORS-donorStart);
+            if (threadIdx.x < blockSize) {
+                int4 atoms2 = donorAtoms[donorStart+threadIdx.x];
+                posBuffer[3*threadIdx.x] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
+                posBuffer[3*threadIdx.x+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
+                posBuffer[3*threadIdx.x+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
+            }
+            // Make the freshly loaded tile visible to the whole block.
+            __syncthreads();
+            if (acceptorIndex < NUM_ACCEPTORS) {
+                for (int index = 0; index < blockSize; index++) {
+#ifdef USE_EXCLUSIONS
+                    int donorIndex = donorStart+index;
+                    if (donorIndex == exclusionIndices.x || donorIndex == exclusionIndices.y || donorIndex == exclusionIndices.z || donorIndex == exclusionIndices.w)
+                        continue;
+#endif
+                    // Compute the interaction between a donor and an acceptor.
+                    real4 d1 = posBuffer[3*index];
+                    real4 d2 = posBuffer[3*index+1];
+                    real4 d3 = posBuffer[3*index+2];
+                    real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
+#ifdef USE_CUTOFF
+                    if (deltaD1A1.w < CUTOFF_SQUARED) {
+#endif
+                        COMPUTE_ACCEPTOR_FORCE
+#ifdef USE_CUTOFF
+                    }
+#endif
+                }
+            }
+        }
+        // Write results.  Forces are converted to 64 bit fixed point (scaled by 0xFFFFFFFF) and
+        // added atomically, since several donors and acceptors may share an atom.
+        if (acceptorIndex < NUM_ACCEPTORS) {
+            if (atoms.x > -1) {
+                atomicAdd(&force[atoms.x], static_cast<unsigned long long>((long long) (f1.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+            if (atoms.y > -1) {
+                atomicAdd(&force[atoms.y], static_cast<unsigned long long>((long long) (f2.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+            if (atoms.z > -1) {
+                atomicAdd(&force[atoms.z], static_cast<unsigned long long>((long long) (f3.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.z+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.z+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+        }
+    }
+}
--- a/platforms/cuda2/src/kernels/customIntegrator.cu
+++ b/platforms/cuda2/src/kernels/customIntegrator.cu
+/**
+ * Add up the first SUM_BUFFER_SIZE entries of sumBuffer and store the total in
+ * result[SUM_OUTPUT_INDEX].  Only threadIdx.x is used for indexing and the scratch array has
+ * WORK_GROUP_SIZE entries, so the kernel expects a single block of WORK_GROUP_SIZE threads.
+ */
+extern "C" __global__ void computeSum(const real* __restrict__ sumBuffer, real* result) {
+    __shared__ real tempBuffer[WORK_GROUP_SIZE];
+    const unsigned int tid = threadIdx.x;
+
+    // Phase 1: every thread adds up its own strided slice of the input.
+    real partial = 0;
+    unsigned int index = tid;
+    while (index < SUM_BUFFER_SIZE) {
+        partial += sumBuffer[index];
+        index += blockDim.x;
+    }
+    tempBuffer[tid] = partial;
+
+    // Phase 2: pairwise tree reduction in shared memory, doubling the distance each round.
+    for (int offset = 1; offset < WORK_GROUP_SIZE; offset *= 2) {
+        __syncthreads();
+        const bool isReceiver = (tid%(offset*2) == 0);
+        if (isReceiver && tid+offset < WORK_GROUP_SIZE)
+            tempBuffer[tid] += tempBuffer[tid+offset];
+    }
+
+    // Thread 0 performed the last addition itself, so it can publish the total directly.
+    if (tid == 0)
+        result[SUM_OUTPUT_INDEX] = tempBuffer[0];
+}
+/**
+ * Add the x, y and z components of posDelta to posq for every atom, then reset posDelta
+ * to zero.  The w component of posq is left untouched.
+ */
+extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posDelta) {
+    const unsigned int stride = blockDim.x*gridDim.x;
+    for (unsigned int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += stride) {
+        const real4 shift = posDelta[atom];
+        real4 updated = posq[atom];
+        updated.x += shift.x;
+        updated.y += shift.y;
+        updated.z += shift.z;
+        posq[atom] = updated;
+        posDelta[atom] = make_real4(0, 0, 0, 0);
+    }
+}
+/**
+ * Fill random[] with uniform random values: three per element (x, y, z), with w set to 0.
+ *
+ * Every thread loads its own generator state from seed[] at its global thread index, advances
+ * it three times for each element it writes, and stores the final state back to seed[].
+ * seed[] is accessed unconditionally, so it must hold one uint4 per launched thread.
+ * Elements with index below NUM_ATOMS are written.
+ *
+ * Each value is (state.x + state.y + state.w), clamped to at least 1, divided by
+ * (float) 0xffffffff, so a generated value is never zero.
+ */
+extern "C" __global__ void generateRandomNumbers(float4* __restrict__ random, uint4* __restrict__ seed) {
+    uint4 state = seed[blockIdx.x*blockDim.x+threadIdx.x];
+    unsigned int carry = 0; // carry of the z/w recurrence; starts at zero on every launch
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        // Generate three uniform random numbers.
+        state.x = state.x * 69069 + 1; // linear update of the x word
+        state.y ^= state.y << 13; // shift-and-xor update of the y word
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2); // scaled-down sum, used only to derive the next carry
+        unsigned int m = state.w + state.w + state.z + carry; // next w word
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x1 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1; // second value: same update sequence as above
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x2 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1; // third value: same update sequence as above
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x3 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        // Record the values.
+        random[index] = make_float4(x1, x2, x3, 0.0f);
+    }
+    seed[blockIdx.x*blockDim.x+threadIdx.x] = state; // persist the advanced state (carry is not saved)
+}
--- a/platforms/cuda2/src/kernels/customIntegratorGlobal.cu
+++ b/platforms/cuda2/src/kernels/customIntegratorGlobal.cu
+/**
+ * Kernel shell for a custom integrator computation.  The entire body is the COMPUTE_STEP
+ * macro, which is not defined in this file (NOTE(review): presumably generated by the
+ * host-side integrator code before compilation -- confirm).  The parameters are what that
+ * substituted code has available: dt, globals, params, one uniform and one gaussian random
+ * value, and the energy buffer.
+ */
+extern "C" __global__ void computeGlobal(real2* __restrict__ dt, real* __restrict__ globals, real* __restrict__ params,
+        float uniform, float gaussian, const real* __restrict__ energy) {
+    COMPUTE_STEP
+}
--- a/platforms/cuda2/src/kernels/customIntegratorPerDof.cu
+++ b/platforms/cuda2/src/kernels/customIntegratorPerDof.cu
+/**
+ * Copy the four components of a real4 into a double4.
+ */
+inline __device__ double4 convertToDouble4(real4 a) {
+    double4 widened = make_double4(a.x, a.y, a.z, a.w);
+    return widened;
+}
+/**
+ * Copy the four components of a double4 into a real4, converting each to the real type.
+ */
+inline __device__ real4 convertFromDouble4(double4 a) {
+    real4 converted = make_real4(a.x, a.y, a.z, a.w);
+    return converted;
+}
+/**
+ * Kernel shell for a per-atom custom integrator computation.  For every atom it loads the
+ * position, velocity and force in double precision and then runs the COMPUTE_STEP macro,
+ * which is not defined in this file (NOTE(review): presumably generated by the host-side
+ * integrator code and expected to refer to the local names declared below -- confirm).
+ *
+ * Atoms whose velm.w is zero are skipped.  When LOAD_POS_AS_DELTA is defined, the position
+ * handed to the step is posq+posDelta instead of posq.
+ */
+extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posDelta, real4* __restrict__ velm,
+        const long long* __restrict__ force, const real2* __restrict__ dt, const real* __restrict__ globals,
+        const real* __restrict__ params, real* __restrict__ sum, const float4* __restrict__ gaussianValues,
+        unsigned int randomIndex, const float4* __restrict__ uniformValues, const real* __restrict__ energy
+        PARAMETER_ARGUMENTS) {
+    real stepSize = dt[0].y; // the step size is read from the y component of dt[0]
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    randomIndex += index; // each thread reads gaussian values starting at its own offset
+    const double forceScale = 1.0/0xFFFFFFFF; // converts the fixed point force values back to floating point
+    while (index < NUM_ATOMS) {
+#ifdef LOAD_POS_AS_DELTA
+        double4 position = convertToDouble4(posq[index]+posDelta[index]);
+#else
+        double4 position = convertToDouble4(posq[index]);
+#endif
+        double4 velocity = convertToDouble4(velm[index]);
+        double4 f = make_double4(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2], 0.0); // x, y, z stored PADDED_NUM_ATOMS apart
+        double mass = 1.0/velocity.w; // velm.w is treated as the inverse mass; the value is not meaningful when it is zero
+        if (velocity.w != 0.0) {
+            float4 gaussian = gaussianValues[randomIndex];
+            float4 uniform = uniformValues[index];
+            COMPUTE_STEP
+        }
+        randomIndex += blockDim.x*gridDim.x; // advance both indices by the total number of launched threads
+        index += blockDim.x*gridDim.x;
+    }
+}
--- a/platforms/cuda2/src/kernels/customNonbonded.cu
+++ b/platforms/cuda2/src/kernels/customNonbonded.cu
+// Code fragment, not a complete function: isExcluded, r2, invR and dEdR are not declared here
+// and are expected to be provided by the code this fragment is inserted into
+// (NOTE(review): confirm against the host code that assembles the kernel).
+// For a pair that is not excluded (and, with USE_CUTOFF, closer than the cutoff), the
+// COMPUTE_FORCE macro is run and tempForce*invR is added to dEdR.
+#ifdef USE_CUTOFF
+if (!isExcluded && r2 < CUTOFF_SQUARED) {
+#else
+if (!isExcluded) {
+#endif
+    real tempForce = 0; // presumably filled in by COMPUTE_FORCE -- verify against the generated code
+    COMPUTE_FORCE
+    dEdR += tempForce*invR;
+}
--- a/platforms/cuda2/tests/TestCudaCustomHbondForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomHbondForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the CUDA implementation of CustomHbondForce.
+ */
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/CustomHbondForce.h"
+#include "openmm/HarmonicAngleForce.h"
+#include "openmm/HarmonicBondForce.h"
+#include "openmm/PeriodicTorsionForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+const double TOL = 1e-5;
+/**
+ * Check CustomHbondForce against an equivalent system built from HarmonicBondForce,
+ * HarmonicAngleForce and PeriodicTorsionForce.  Forces and energy of the two systems are
+ * compared for ten sets of random positions, and once more after the parameters of both
+ * systems have been changed and pushed to their contexts.
+ */
+void testHbond() {
+    CudaPlatform platform;
+    // Create a system using a CustomHbondForce.
+    System customSystem;
+    for (int particle = 0; particle < 5; particle++)
+        customSystem.addParticle(1.0);
+    CustomHbondForce* custom = new CustomHbondForce("0.5*kr*(distance(d1,a1)-r0)^2 + 0.5*ktheta*(angle(a1,d1,d2)-theta0)^2 + 0.5*kpsi*(angle(d1,a1,a2)-psi0)^2 + kchi*(1+cos(n*dihedral(a3,a2,a1,d1)-chi0))");
+    custom->addPerDonorParameter("r0");
+    custom->addPerDonorParameter("theta0");
+    custom->addPerDonorParameter("psi0");
+    custom->addPerAcceptorParameter("chi0");
+    custom->addPerAcceptorParameter("n");
+    custom->addGlobalParameter("kr", 0.4);
+    custom->addGlobalParameter("ktheta", 0.5);
+    custom->addGlobalParameter("kpsi", 0.6);
+    custom->addGlobalParameter("kchi", 0.7);
+    vector<double> parameters(3);
+    parameters[0] = 1.5;
+    parameters[1] = 1.7;
+    parameters[2] = 1.9;
+    custom->addDonor(1, 0, -1, parameters);
+    parameters.resize(2);
+    parameters[0] = 2.1;
+    parameters[1] = 2;
+    custom->addAcceptor(2, 3, 4, parameters);
+    custom->setCutoffDistance(10.0);
+    customSystem.addForce(custom);
+    // Create an identical system using HarmonicBondForce, HarmonicAngleForce, and PeriodicTorsionForce.
+    System standardSystem;
+    for (int particle = 0; particle < 5; particle++)
+        standardSystem.addParticle(1.0);
+    HarmonicBondForce* bond = new HarmonicBondForce();
+    bond->addBond(1, 2, 1.5, 0.4);
+    standardSystem.addForce(bond);
+    HarmonicAngleForce* angle = new HarmonicAngleForce();
+    angle->addAngle(0, 1, 2, 1.7, 0.5);
+    angle->addAngle(1, 2, 3, 1.9, 0.6);
+    standardSystem.addForce(angle);
+    PeriodicTorsionForce* torsion = new PeriodicTorsionForce();
+    torsion->addTorsion(1, 2, 3, 4, 2, 2.1, 0.7);
+    standardSystem.addForce(torsion);
+    // Set the atoms in various positions, and verify that both systems give identical forces and energy.
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<Vec3> positions(5);
+    VerletIntegrator integrator1(0.01);
+    VerletIntegrator integrator2(0.01);
+    Context c1(customSystem, integrator1, platform);
+    Context c2(standardSystem, integrator2, platform);
+    for (int trial = 0; trial < 10; trial++) {
+        for (int j = 0; j < (int) positions.size(); j++) {
+            // Draw the coordinates in separate statements: the order in which function arguments
+            // are evaluated is unspecified, so drawing them inside the Vec3 constructor call
+            // could give different positions with different compilers.
+            double x = 2.0*genrand_real2(sfmt);
+            double y = 2.0*genrand_real2(sfmt);
+            double z = 2.0*genrand_real2(sfmt);
+            positions[j] = Vec3(x, y, z);
+        }
+        c1.setPositions(positions);
+        c2.setPositions(positions);
+        State s1 = c1.getState(State::Forces | State::Energy);
+        State s2 = c2.getState(State::Forces | State::Energy);
+        // A distinct index name is used so that it does not shadow the trial counter.
+        for (int particle = 0; particle < customSystem.getNumParticles(); particle++)
+            ASSERT_EQUAL_VEC(s2.getForces()[particle], s1.getForces()[particle], TOL);
+        ASSERT_EQUAL_TOL(s2.getPotentialEnergy(), s1.getPotentialEnergy(), TOL);
+    }
+    // Try changing the parameters and make sure it's still correct.
+    parameters.resize(3);
+    parameters[0] = 1.4;
+    parameters[1] = 1.7;
+    parameters[2] = 1.9;
+    custom->setDonorParameters(0, 1, 0, -1, parameters);
+    parameters.resize(2);
+    parameters[0] = 2.2;
+    parameters[1] = 2;
+    custom->setAcceptorParameters(0, 2, 3, 4, parameters);
+    bond->setBondParameters(0, 1, 2, 1.4, 0.4);
+    torsion->setTorsionParameters(0, 1, 2, 3, 4, 2, 2.2, 0.7);
+    custom->updateParametersInContext(c1);
+    bond->updateParametersInContext(c2);
+    torsion->updateParametersInContext(c2);
+    State s1 = c1.getState(State::Forces | State::Energy);
+    State s2 = c2.getState(State::Forces | State::Energy);
+    for (int particle = 0; particle < customSystem.getNumParticles(); particle++)
+        ASSERT_EQUAL_VEC(s2.getForces()[particle], s1.getForces()[particle], TOL);
+    ASSERT_EQUAL_TOL(s2.getPotentialEnergy(), s1.getPotentialEnergy(), TOL);
+}
+/**
+ * Two donors and one acceptor, with the pair (donor 1, acceptor 0) excluded: only the
+ * interaction of donor 0 with the acceptor should contribute to the forces and energy.
+ */
+void testExclusions() {
+    CudaPlatform platform;
+    System system;
+    for (int particle = 0; particle < 3; particle++)
+        system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    const vector<double> noParameters;
+    CustomHbondForce* custom = new CustomHbondForce("(distance(d1,a1)-1)^2");
+    custom->addDonor(0, 1, -1, noParameters);
+    custom->addDonor(1, 0, -1, noParameters);
+    custom->addAcceptor(2, 0, -1, noParameters);
+    custom->addExclusion(1, 0);
+    system.addForce(custom);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions;
+    positions.push_back(Vec3(0, 0, 0));
+    positions.push_back(Vec3(0, 2, 0));
+    positions.push_back(Vec3(2, 0, 0));
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(2, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-2, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(1.0, state.getPotentialEnergy(), TOL);
+}
+/**
+ * Two donors and one acceptor with a non-periodic cutoff of 2.5: the donor placed at
+ * (0, 3, 0) is farther than the cutoff from the acceptor, so only the other donor should
+ * contribute to the forces and energy.
+ */
+void testCutoff() {
+    CudaPlatform platform;
+    System system;
+    for (int particle = 0; particle < 3; particle++)
+        system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    const vector<double> noParameters;
+    CustomHbondForce* custom = new CustomHbondForce("(distance(d1,a1)-1)^2");
+    custom->addDonor(0, 1, -1, noParameters);
+    custom->addDonor(1, 0, -1, noParameters);
+    custom->addAcceptor(2, 0, -1, noParameters);
+    custom->setNonbondedMethod(CustomHbondForce::CutoffNonPeriodic);
+    custom->setCutoffDistance(2.5);
+    system.addForce(custom);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions;
+    positions.push_back(Vec3(0, 0, 0));
+    positions.push_back(Vec3(0, 3, 0));
+    positions.push_back(Vec3(2, 0, 0));
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(2, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-2, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(1.0, state.getPotentialEnergy(), TOL);
+}
+/**
+ * Use a tabulated function "foo" (values 0 and 1 over the range 0 to 10) as the energy of
+ * the donor-acceptor distance, and check the resulting forces and energy.
+ */
+void testCustomFunctions() {
+    CudaPlatform platform;
+    System system;
+    for (int particle = 0; particle < 3; particle++)
+        system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    const vector<double> noParameters;
+    CustomHbondForce* custom = new CustomHbondForce("foo(distance(d1,a1))");
+    custom->addDonor(1, 0, -1, noParameters);
+    custom->addDonor(2, 0, -1, noParameters);
+    custom->addAcceptor(0, 1, -1, noParameters);
+    vector<double> function;
+    function.push_back(0);
+    function.push_back(1);
+    custom->addFunction("foo", function, 0, 10);
+    system.addForce(custom);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions;
+    positions.push_back(Vec3(0, 0, 0));
+    positions.push_back(Vec3(0, 2, 0));
+    positions.push_back(Vec3(2, 0, 0));
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(0.1, 0.1, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, -0.1, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-0.1, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(0.1*2+0.1*2, state.getPotentialEnergy(), TOL);
+}
+/**
+ * Run every test in this file.  Prints "Done" and returns 0 when all of them pass; prints
+ * the exception message and returns 1 as soon as one of them throws.
+ */
+int main() {
+    bool passed = true;
+    try {
+        testHbond();
+        testExclusions();
+        testCutoff();
+        testCustomFunctions();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        passed = false;
+    }
+    if (!passed)
+        return 1;
+    cout << "Done" << endl;
+    return 0;
+}
--- a/platforms/cuda2/tests/TestCudaCustomIntegrator.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomIntegrator.cpp
--- a/platforms/cuda2/tests/TestCudaCustomNonbondedForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomNonbondedForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests all the different force terms in the CUDA implementation of CustomNonbondedForce.
+ */
+#include "openmm/internal/AssertionUtilities.h"
+#include "sfmt/SFMT.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/CustomNonbondedForce.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include <iostream>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+// Default tolerance handed to the ASSERT_EQUAL_VEC / ASSERT_EQUAL_TOL checks in the tests below.
+const double TOL = 1e-5;
+/**
+ * Evaluate the parameter-free expression -0.1*r^3 for two particles placed a distance
+ * of 2 apart on the x axis, and compare forces and energy against the analytic values.
+ */
+void testSimpleExpression() {
+    CudaPlatform platform;
+    System system;
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* nonbonded = new CustomNonbondedForce("-0.1*r^3");
+    const vector<double> noParameters;
+    for (int i = 0; i < 2; i++) {
+        system.addParticle(1.0);
+        nonbonded->addParticle(noParameters);
+    }
+    system.addForce(nonbonded);
+    Context context(system, integrator, platform);
+    vector<Vec3> coords;
+    coords.push_back(Vec3(0, 0, 0));
+    coords.push_back(Vec3(2, 0, 0));
+    context.setPositions(coords);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    // Magnitude of dE/dr for E = -0.1*r^3 at r = 2, and the energy at the same separation.
+    const double expectedForce = 0.1*3*(2*2);
+    const double expectedEnergy = -0.1*(2*2*2);
+    ASSERT_EQUAL_VEC(Vec3(-expectedForce, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(expectedForce, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_TOL(expectedEnergy, state.getPotentialEnergy(), TOL);
+}
+/**
+ * Exercise per-particle and global parameters with the expression
+ * scale*a*(r*b)^3, where a=a1*a2 and b=c+b1+b2, for two particles a distance of 2 apart.
+ * The result is checked three times: after overriding the global parameters through the
+ * Context, after changing them again, and after updating the per-particle parameters
+ * with updateParametersInContext().
+ */
+void testParameters() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* nonbonded = new CustomNonbondedForce("scale*a*(r*b)^3; a=a1*a2; b=c+b1+b2");
+    nonbonded->addPerParticleParameter("a");
+    nonbonded->addPerParticleParameter("b");
+    nonbonded->addGlobalParameter("scale", 3.0);
+    nonbonded->addGlobalParameter("c", -1.0);
+    vector<double> values(2);
+    values[0] = 1.5;
+    values[1] = 2.0;
+    nonbonded->addParticle(values);
+    values[0] = 2.0;
+    values[1] = 3.0;
+    nonbonded->addParticle(values);
+    system.addForce(nonbonded);
+    Context context(system, integrator, platform);
+    vector<Vec3> coords(2);
+    coords[0] = Vec3(0, 0, 0);
+    coords[1] = Vec3(2, 0, 0);
+    context.setPositions(coords);
+    // Stage 1: replace the default global values. Here a = 1.5*2.0 = 3, b = 0+2+3 = 5, r*b = 10.
+    context.setParameter("scale", 1.0);
+    context.setParameter("c", 0.0);
+    {
+        State state = context.getState(State::Forces | State::Energy);
+        const vector<Vec3>& forces = state.getForces();
+        const double expectedForce = -3.0*3*5.0*(10*10);
+        ASSERT_EQUAL_VEC(Vec3(-expectedForce, 0, 0), forces[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(expectedForce, 0, 0), forces[1], TOL);
+        ASSERT_EQUAL_TOL(3.0*(10*10*10), state.getPotentialEnergy(), TOL);
+    }
+    // Stage 2: change the global parameters again. Now scale = 1.5, b = 1+2+3 = 6, r*b = 12.
+    context.setParameter("scale", 1.5);
+    context.setParameter("c", 1.0);
+    {
+        State state = context.getState(State::Forces | State::Energy);
+        const vector<Vec3>& forces = state.getForces();
+        const double expectedForce = -1.5*3.0*3*6.0*(12*12);
+        ASSERT_EQUAL_VEC(Vec3(-expectedForce, 0, 0), forces[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(expectedForce, 0, 0), forces[1], TOL);
+        ASSERT_EQUAL_TOL(1.5*3.0*(12*12*12), state.getPotentialEnergy(), TOL);
+    }
+    // Stage 3: change the per-particle parameters and push them to the existing Context.
+    // Now a = 1.6*1.9, b = 1+2.1+2.8 = 5.9, r*b = 11.8.
+    values[0] = 1.6;
+    values[1] = 2.1;
+    nonbonded->setParticleParameters(0, values);
+    values[0] = 1.9;
+    values[1] = 2.8;
+    nonbonded->setParticleParameters(1, values);
+    nonbonded->updateParametersInContext(context);
+    {
+        State state = context.getState(State::Forces | State::Energy);
+        const vector<Vec3>& forces = state.getForces();
+        const double expectedForce = -1.5*1.6*1.9*3*5.9*(11.8*11.8);
+        ASSERT_EQUAL_VEC(Vec3(-expectedForce, 0, 0), forces[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(expectedForce, 0, 0), forces[1], TOL);
+        ASSERT_EQUAL_TOL(1.5*1.6*1.9*(11.8*11.8*11.8), state.getPotentialEnergy(), TOL);
+    }
+}
+/**
+ * Use five per-particle parameters (a through e) in a single expression,
+ * (a1*a2+b1*b2+c1*c2+d1*d2+e1*e2)*r, for two particles a distance of 2 apart.
+ */
+void testManyParameters() {
+    const int numParams = 5;
+    const char* names[numParams] = {"a", "b", "c", "d", "e"};
+    const double firstValues[numParams] = {1.0, 2.0, 3.0, 4.0, 5.0};
+    const double secondValues[numParams] = {1.1, 1.2, 1.3, 1.4, 1.5};
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* nonbonded = new CustomNonbondedForce("(a1*a2+b1*b2+c1*c2+d1*d2+e1*e2)*r");
+    for (int i = 0; i < numParams; i++)
+        nonbonded->addPerParticleParameter(names[i]);
+    nonbonded->addParticle(vector<double>(firstValues, firstValues+numParams));
+    nonbonded->addParticle(vector<double>(secondValues, secondValues+numParams));
+    system.addForce(nonbonded);
+    Context context(system, integrator, platform);
+    vector<Vec3> coords(2);
+    coords[0] = Vec3(0, 0, 0);
+    coords[1] = Vec3(2, 0, 0);
+    context.setPositions(coords);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    // The energy is linear in r, so the force magnitude is the sum of the parameter products.
+    double expectedForce = 0.0;
+    for (int i = 0; i < numParams; i++)
+        expectedForce += firstValues[i]*secondValues[i];
+    ASSERT_EQUAL_VEC(Vec3(expectedForce, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-expectedForce, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_TOL(2*expectedForce, state.getPotentialEnergy(), TOL);
+}
+/**
+ * Place four particles at x = 0, 1, 2, 3 with per-particle parameter a = 1, 2, 3, 4 and
+ * exclude every pair except (0, 3), so that only that pair contributes to "a*r; a=a1+a2".
+ */
+void testExclusions() {
+    const int numParticles = 4;
+    const int excludedPairs[][2] = {{0, 1}, {1, 2}, {2, 3}, {0, 2}, {1, 3}};
+    const int numExclusions = sizeof(excludedPairs)/sizeof(excludedPairs[0]);
+    CudaPlatform platform;
+    System system;
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* nonbonded = new CustomNonbondedForce("a*r; a=a1+a2");
+    nonbonded->addPerParticleParameter("a");
+    vector<Vec3> coords(numParticles);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.0);
+        const vector<double> param(1, i+1.0);
+        nonbonded->addParticle(param);
+        coords[i] = Vec3(i, 0, 0);
+    }
+    for (int k = 0; k < numExclusions; k++)
+        nonbonded->addExclusion(excludedPairs[k][0], excludedPairs[k][1]);
+    system.addForce(nonbonded);
+    Context context(system, integrator, platform);
+    context.setPositions(coords);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    // Remaining pair (0, 3): a = 1+4 at separation 3.
+    const double pairStrength = 1+4;
+    ASSERT_EQUAL_VEC(Vec3(pairStrength, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-pairStrength, 0, 0), forces[3], TOL);
+    ASSERT_EQUAL_TOL(pairStrength*3.0, state.getPotentialEnergy(), TOL);
+}
+/**
+ * Check the non-periodic cutoff: three particles at y = 0, 2, 3 interact through "r"
+ * with a cutoff of 2.5, so the expected energy counts the pairs at separation 2 and 1
+ * but not the pair at separation 3.
+ */
+void testCutoff() {
+    const int numParticles = 3;
+    const double yCoords[numParticles] = {0, 2, 3};
+    CudaPlatform platform;
+    System system;
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* nonbonded = new CustomNonbondedForce("r");
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.0);
+        nonbonded->addParticle(vector<double>());
+    }
+    nonbonded->setNonbondedMethod(CustomNonbondedForce::CutoffNonPeriodic);
+    nonbonded->setCutoffDistance(2.5);
+    system.addForce(nonbonded);
+    Context context(system, integrator, platform);
+    vector<Vec3> coords;
+    for (int i = 0; i < numParticles; i++)
+        coords.push_back(Vec3(0, yCoords[i], 0));
+    context.setPositions(coords);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(0, 1, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, -1, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(2.0+1.0, state.getPotentialEnergy(), TOL);
+}
+/**
+ * Check the periodic cutoff: three particles at y = 0, 2.1, 3 in a 4x4x4 box interact
+ * through "r" with a cutoff of 2.0. The expected energy sums the separations 1.9, 1 and
+ * 0.9, i.e. the pairs involving particle 0 are measured across the periodic boundary.
+ */
+void testPeriodic() {
+    const int numParticles = 3;
+    const double yCoords[numParticles] = {0, 2.1, 3};
+    CudaPlatform platform;
+    System system;
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* nonbonded = new CustomNonbondedForce("r");
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.0);
+        nonbonded->addParticle(vector<double>());
+    }
+    nonbonded->setNonbondedMethod(CustomNonbondedForce::CutoffPeriodic);
+    nonbonded->setCutoffDistance(2.0);
+    system.setDefaultPeriodicBoxVectors(Vec3(4, 0, 0), Vec3(0, 4, 0), Vec3(0, 0, 4));
+    system.addForce(nonbonded);
+    Context context(system, integrator, platform);
+    vector<Vec3> coords;
+    for (int i = 0; i < numParticles; i++)
+        coords.push_back(Vec3(0, yCoords[i], 0));
+    context.setPositions(coords);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(0, -2, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 2, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(1.9+1+0.9, state.getPotentialEnergy(), TOL);
+}
+/**
+ * Test a tabulated function used inside the energy expression "fn(r)+1".
+ * The table holds 21 samples of sin(0.25*i) and is registered over the range [1, 6],
+ * so the reference value used below is sin(r-1) inside that range and 0 outside it.
+ */
+void testTabulatedFunction() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* forceField = new CustomNonbondedForce("fn(r)+1");
+    forceField->addParticle(vector<double>());
+    forceField->addParticle(vector<double>());
+    vector<double> table;
+    for (int i = 0; i < 21; i++)
+        table.push_back(std::sin(0.25*i));
+    forceField->addFunction("fn", table, 1.0, 6.0);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+    // Coarse sweep over separations from 7/30 to just below 7, covering points below,
+    // inside and above the tabulated range. These points are compared loosely against
+    // the analytic sin/cos reference.
+    const double sweepForceTol = 0.1;
+    const double sweepEnergyTol = 0.02;
+    for (int i = 1; i < 30; i++) {
+        double x = (7.0/30.0)*i;
+        positions[1] = Vec3(x, 0, 0);
+        context.setPositions(positions);
+        State state = context.getState(State::Forces | State::Energy);
+        const vector<Vec3>& forces = state.getForces();
+        const bool outsideTable = (x < 1.0 || x > 6.0);
+        double force = (outsideTable ? 0.0 : -std::cos(x-1.0));
+        double energy = (outsideTable ? 0.0 : std::sin(x-1.0))+1.0;
+        ASSERT_EQUAL_VEC(Vec3(-force, 0, 0), forces[0], sweepForceTol);
+        ASSERT_EQUAL_VEC(Vec3(force, 0, 0), forces[1], sweepForceTol);
+        ASSERT_EQUAL_TOL(energy, state.getPotentialEnergy(), sweepEnergyTol);
+    }
+    // Second pass at x = 1.25, 1.5, ..., 5.75. These separations are spaced 0.25 apart,
+    // the same spacing used to build the table (presumably they coincide with the
+    // interior sample points), so a much tighter tolerance is used. x never leaves
+    // [1, 6] here, so no out-of-range handling is needed.
+    const double sampleEnergyTol = 1e-4;
+    for (int i = 1; i < 20; i++) {
+        double x = 0.25*i+1.0;
+        positions[1] = Vec3(x, 0, 0);
+        context.setPositions(positions);
+        State state = context.getState(State::Energy);
+        double energy = std::sin(x-1.0)+1.0;
+        ASSERT_EQUAL_TOL(energy, state.getPotentialEnergy(), sampleEnergyTol);
+    }
+}
+/**
+ * Build the same Coulomb + Lennard-Jones interaction twice, once with NonbondedForce and
+ * once with an equivalent CustomNonbondedForce expression, on 300 two-particle molecules
+ * at random positions, and require both to produce the same energy and forces.
+ */
+void testCoulombLennardJones() {
+    const int numMolecules = 300;
+    const int numParticles = numMolecules*2;
+    const double boxSize = 20.0;
+    CudaPlatform platform;
+    // One system per implementation; both hold identical particles.
+    System standardSystem;
+    System customSystem;
+    for (int p = 0; p < numParticles; p++) {
+        standardSystem.addParticle(1.0);
+        customSystem.addParticle(1.0);
+    }
+    NonbondedForce* standardNonbonded = new NonbondedForce();
+    CustomNonbondedForce* customNonbonded = new CustomNonbondedForce("4*eps*((sigma/r)^12-(sigma/r)^6)+138.935456*q/r; q=q1*q2; sigma=0.5*(sigma1+sigma2); eps=sqrt(eps1*eps2)");
+    customNonbonded->addPerParticleParameter("q");
+    customNonbonded->addPerParticleParameter("sigma");
+    customNonbonded->addPerParticleParameter("eps");
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<double> params(3);
+    for (int mol = 0; mol < numMolecules; mol++) {
+        const int first = 2*mol;
+        const int second = first+1;
+        // The first half of the molecules uses epsilon 0.1, the second half 0.2.
+        const double eps = (mol < numMolecules/2 ? 0.1 : 0.2);
+        // Positive particle of the molecule.
+        standardNonbonded->addParticle(1.0, 0.2, eps);
+        params[0] = 1.0;
+        params[1] = 0.2;
+        params[2] = eps;
+        customNonbonded->addParticle(params);
+        // Negative particle of the molecule; epsilon is unchanged.
+        standardNonbonded->addParticle(-1.0, 0.1, eps);
+        params[0] = -1.0;
+        params[1] = 0.1;
+        customNonbonded->addParticle(params);
+        positions[first] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
+        positions[second] = Vec3(positions[first][0]+1.0, positions[first][1], positions[first][2]);
+        velocities[first] = Vec3(genrand_real2(sfmt), genrand_real2(sfmt), genrand_real2(sfmt));
+        velocities[second] = Vec3(genrand_real2(sfmt), genrand_real2(sfmt), genrand_real2(sfmt));
+        // Switch off the interaction between the two particles of the molecule in both forces.
+        standardNonbonded->addException(first, second, 0.0, 1.0, 0.0);
+        customNonbonded->addExclusion(first, second);
+    }
+    standardNonbonded->setNonbondedMethod(NonbondedForce::NoCutoff);
+    customNonbonded->setNonbondedMethod(CustomNonbondedForce::NoCutoff);
+    standardSystem.addForce(standardNonbonded);
+    customSystem.addForce(customNonbonded);
+    VerletIntegrator integrator1(0.01);
+    VerletIntegrator integrator2(0.01);
+    Context standardContext(standardSystem, integrator1, platform);
+    Context customContext(customSystem, integrator2, platform);
+    standardContext.setPositions(positions);
+    customContext.setPositions(positions);
+    standardContext.setVelocities(velocities);
+    customContext.setVelocities(velocities);
+    State standardState = standardContext.getState(State::Forces | State::Energy);
+    State customState = customContext.getState(State::Forces | State::Energy);
+    ASSERT_EQUAL_TOL(standardState.getPotentialEnergy(), customState.getPotentialEnergy(), 1e-4);
+    const vector<Vec3>& standardForces = standardState.getForces();
+    const vector<Vec3>& customForces = customState.getForces();
+    for (int p = 0; p < numParticles; p++) {
+        ASSERT_EQUAL_VEC(standardForces[p], customForces[p], 1e-4);
+    }
+}
+/**
+ * Compute a Lennard-Jones style CustomNonbondedForce for 200 random particles in two
+ * Contexts: one created with default properties, and one whose CudaDeviceIndex property
+ * lists the first Context's device index twice. Both must give the same energy and forces.
+ */
+void testParallelComputation() {
+    CudaPlatform platform;
+    System system;
+    const int numParticles = 200;
+    for (int p = 0; p < numParticles; p++)
+        system.addParticle(1.0);
+    CustomNonbondedForce* nonbonded = new CustomNonbondedForce("4*eps*((sigma/r)^12-(sigma/r)^6); sigma=0.5; eps=1");
+    const vector<double> noParameters;
+    for (int p = 0; p < numParticles; p++)
+        nonbonded->addParticle(noParameters);
+    system.addForce(nonbonded);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<Vec3> coords(numParticles);
+    for (int p = 0; p < numParticles; p++)
+        coords[p] = Vec3(5*genrand_real2(sfmt), 5*genrand_real2(sfmt), 5*genrand_real2(sfmt));
+    // Exclude every pair whose squared separation is below 0.1.
+    for (int i = 0; i < numParticles; ++i) {
+        for (int j = 0; j < i; ++j) {
+            const Vec3 separation = coords[i]-coords[j];
+            const double distanceSquared = separation.dot(separation);
+            if (distanceSquared < 0.1)
+                nonbonded->addExclusion(i, j);
+        }
+    }
+    // Reference Context with default platform properties.
+    VerletIntegrator integrator1(0.01);
+    Context referenceContext(system, integrator1, platform);
+    referenceContext.setPositions(coords);
+    State referenceState = referenceContext.getState(State::Forces | State::Energy);
+    // Second Context: the same device index, repeated in the device list.
+    VerletIntegrator integrator2(0.01);
+    const string device = platform.getPropertyValue(referenceContext, CudaPlatform::CudaDeviceIndex());
+    map<string, string> properties;
+    properties[CudaPlatform::CudaDeviceIndex()] = device+","+device;
+    Context parallelContext(system, integrator2, platform, properties);
+    parallelContext.setPositions(coords);
+    State parallelState = parallelContext.getState(State::Forces | State::Energy);
+    ASSERT_EQUAL_TOL(referenceState.getPotentialEnergy(), parallelState.getPotentialEnergy(), 1e-5);
+    const vector<Vec3>& referenceForces = referenceState.getForces();
+    const vector<Vec3>& parallelForces = parallelState.getForces();
+    for (int p = 0; p < numParticles; p++)
+        ASSERT_EQUAL_VEC(referenceForces[p], parallelForces[p], 1e-5);
+}
+/**
+ * Run the enabled tests in order. Prints "Done" and returns 0 when none of them throws;
+ * otherwise prints the exception message and returns 1.
+ */
+int main() {
+    bool passed = true;
+    try {
+        testSimpleExpression();
+        testParameters();
+        testManyParameters();
+        testExclusions();
+        testCutoff();
+        testPeriodic();
+        testTabulatedFunction();
+        testCoulombLennardJones();
+        // testParallelComputation() is left disabled here.
+//        testParallelComputation();
+    }
+    catch(const exception& failure) {
+        cout << "exception: " << failure.what() << endl;
+        passed = false;
+    }
+    if (!passed)
+        return 1;
+    cout << "Done" << endl;
+    return 0;
+}