Continuing to implement new CUDA platform: CustomNonbondedForce, CustomHbondForce, CustomIntegrator

bd22eada · Peter Eastman · 8eb6850d · bd22eada · bd22eada · bd22eada
Commit bd22eada authored Jun 20, 2012 by Peter Eastman
16 changed files
--- a/platforms/cuda2/src/CudaArray.cpp
+++ b/platforms/cuda2/src/CudaArray.cpp
@@ -53,7 +53,7 @@ CudaArray::~CudaArray() {
    }
 }

-void CudaArray::upload(void* data, bool blocking) {
+void CudaArray::upload(const void* data, bool blocking) {
    CUresult result;
    if (blocking)
        result = cuMemcpyHtoD(pointer, data, size*elementSize);

--- a/platforms/cuda2/src/CudaArray.h
+++ b/platforms/cuda2/src/CudaArray.h
@@ -94,7 +94,7 @@ public:
     * Copy the values in a vector to the device memory.
     */
    template <class T>
-    void upload(std::vector<T>& data) {
+    void upload(const std::vector<T>& data) {
        if (sizeof(T) != elementSize || data.size() != size)
            throw OpenMMException("Error uploading array "+name+": The specified vector does not match the size of the array");
        upload(&data[0], true);
@@ -117,7 +117,7 @@ public:
     * @param blocking if true, this call will block until the transfer is complete.  If false,
     *                 the source array  must be in page-locked memory.
     */
-    void upload(void* data, bool blocking = true);
+    void upload(const void* data, bool blocking = true);
    /**
     * Copy the values in the device memory to an array.
     * 

--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -945,6 +945,10 @@ void CudaContext::reorderAtoms(bool enforcePeriodic) {
        reorderListeners[i]->execute();
 }

+void CudaContext::addReorderListener(ReorderListener* listener) {
+    reorderListeners.push_back(listener);
+}
+
 struct CudaContext::WorkThread::ThreadData {
    ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting,  bool& finished,
            pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :

--- a/platforms/cuda2/src/CudaKernelFactory.cpp
+++ b/platforms/cuda2/src/CudaKernelFactory.cpp
@@ -94,16 +94,16 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcNonbondedForceKernel::Name())
        return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
-//    if (name == CalcCustomNonbondedForceKernel::Name())
-//        return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomNonbondedForceKernel::Name())
+        return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcGBSAOBCForceKernel::Name())
 //        return new CudaCalcGBSAOBCForceKernel(name, platform, cu);
 //    if (name == CalcCustomGBForceKernel::Name())
 //        return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomExternalForceKernel::Name())
        return new CudaCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
-//    if (name == CalcCustomHbondForceKernel::Name())
-//        return new CudaCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomHbondForceKernel::Name())
+        return new CudaCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomCompoundBondForceKernel::Name())
        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
    if (name == IntegrateVerletStepKernel::Name())
@@ -116,8 +116,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaIntegrateVariableVerletStepKernel(name, platform, cu);
    if (name == IntegrateVariableLangevinStepKernel::Name())
        return new CudaIntegrateVariableLangevinStepKernel(name, platform, cu);
-//    if (name == IntegrateCustomStepKernel::Name())
-//        return new CudaIntegrateCustomStepKernel(name, platform, cu);
+    if (name == IntegrateCustomStepKernel::Name())
+        return new CudaIntegrateCustomStepKernel(name, platform, cu);
    if (name == ApplyAndersenThermostatKernel::Name())
        return new CudaApplyAndersenThermostatKernel(name, platform, cu);
    if (name == ApplyMonteCarloBarostatKernel::Name())

--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
--- a/platforms/cuda2/src/CudaKernels.h
+++ b/platforms/cuda2/src/CudaKernels.h
@@ -623,50 +623,49 @@ private:
    static const int PmeOrder = 5;
 };

-///**
-// * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
-// */
-//class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
-//public:
-//    CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomNonbondedForceKernel(name, platform),
-//            hasInitializedKernel(false), cu(cu), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
-//    }
-//    ~CudaCalcCustomNonbondedForceKernel();
-//    /**
-//     * Initialize the kernel.
-//     *
-//     * @param system     the System this kernel will be applied to
-//     * @param force      the CustomNonbondedForce this kernel will be used for
-//     */
-//    void initialize(const System& system, const CustomNonbondedForce& force);
-//    /**
-//     * Execute the kernel to calculate the forces and/or energy.
-//     *
-//     * @param context        the context in which to execute this kernel
-//     * @param includeForces  true if forces should be calculated
-//     * @param includeEnergy  true if the energy should be calculated
-//     * @return the potential energy due to the force
-//     */
-//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-//    /**
-//     * Copy changed parameters over to a context.
-//     *
-//     * @param context    the context to copy parameters to
-//     * @param force      the CustomNonbondedForce to copy the parameters from
-//     */
-//    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
-//private:
-//    bool hasInitializedKernel;
-//    CudaContext& cu;
-//    CudaParameterSet* params;
-//    CudaArray<cl_float>* globals;
-//    CudaArray<mm_float4>* tabulatedFunctionParams;
-//    std::vector<std::string> globalParamNames;
-//    std::vector<cl_float> globalParamValues;
-//    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
-//    System& system;
-//};
-//
+/**
+ * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
+ */
+class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
+public:
+    CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomNonbondedForceKernel(name, platform),
+            cu(cu), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
+    }
+    ~CudaCalcCustomNonbondedForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomNonbondedForce this kernel will be used for
+     */
+    void initialize(const System& system, const CustomNonbondedForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the CustomNonbondedForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
+private:
+    CudaContext& cu;
+    CudaParameterSet* params;
+    CudaArray* globals;
+    CudaArray* tabulatedFunctionParams;
+    std::vector<std::string> globalParamNames;
+    std::vector<float> globalParamValues;
+    std::vector<CudaArray*> tabulatedFunctions;
+    System& system;
+};
+
 ///**
 // * This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
 // */
@@ -814,60 +813,58 @@ private:
    std::vector<float> globalParamValues;
 };

-///**
-// * This kernel is invoked by CustomHbondForce to calculate the forces acting on the system.
-// */
-//class CudaCalcCustomHbondForceKernel : public CalcCustomHbondForceKernel {
-//public:
-//    CudaCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomHbondForceKernel(name, platform),
-//            hasInitializedKernel(false), cu(cu), donorParams(NULL), acceptorParams(NULL), donors(NULL), acceptors(NULL),
-//            donorBufferIndices(NULL), acceptorBufferIndices(NULL), globals(NULL), donorExclusions(NULL), acceptorExclusions(NULL),
-//            tabulatedFunctionParams(NULL), system(system) {
-//    }
-//    ~CudaCalcCustomHbondForceKernel();
-//    /**
-//     * Initialize the kernel.
-//     *
-//     * @param system     the System this kernel will be applied to
-//     * @param force      the CustomHbondForce this kernel will be used for
-//     */
-//    void initialize(const System& system, const CustomHbondForce& force);
-//    /**
-//     * Execute the kernel to calculate the forces and/or energy.
-//     *
-//     * @param context        the context in which to execute this kernel
-//     * @param includeForces  true if forces should be calculated
-//     * @param includeEnergy  true if the energy should be calculated
-//     * @return the potential energy due to the force
-//     */
-//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-//    /**
-//     * Copy changed parameters over to a context.
-//     *
-//     * @param context    the context to copy parameters to
-//     * @param force      the CustomHbondForce to copy the parameters from
-//     */
-//    void copyParametersToContext(ContextImpl& context, const CustomHbondForce& force);
-//private:
-//    int numDonors, numAcceptors;
-//    bool hasInitializedKernel;
-//    CudaContext& cu;
-//    CudaParameterSet* donorParams;
-//    CudaParameterSet* acceptorParams;
-//    CudaArray<cl_float>* globals;
-//    CudaArray<mm_int4>* donors;
-//    CudaArray<mm_int4>* acceptors;
-//    CudaArray<mm_int4>* donorBufferIndices;
-//    CudaArray<mm_int4>* acceptorBufferIndices;
-//    CudaArray<mm_int4>* donorExclusions;
-//    CudaArray<mm_int4>* acceptorExclusions;
-//    CudaArray<mm_float4>* tabulatedFunctionParams;
-//    std::vector<std::string> globalParamNames;
-//    std::vector<cl_float> globalParamValues;
-//    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
-//    System& system;
-//    CUfunction donorKernel, acceptorKernel;
-//};
+/**
+ * This kernel is invoked by CustomHbondForce to calculate the forces acting on the system.
+ */
+class CudaCalcCustomHbondForceKernel : public CalcCustomHbondForceKernel {
+public:
+    CudaCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomHbondForceKernel(name, platform),
+            hasInitializedKernel(false), cu(cu), donorParams(NULL), acceptorParams(NULL), donors(NULL), acceptors(NULL),
+            globals(NULL), donorExclusions(NULL), acceptorExclusions(NULL), tabulatedFunctionParams(NULL), system(system) {
+    }
+    ~CudaCalcCustomHbondForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomHbondForce this kernel will be used for
+     */
+    void initialize(const System& system, const CustomHbondForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the CustomHbondForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const CustomHbondForce& force);
+private:
+    int numDonors, numAcceptors;
+    bool hasInitializedKernel;
+    CudaContext& cu;
+    CudaParameterSet* donorParams;
+    CudaParameterSet* acceptorParams;
+    CudaArray* globals;
+    CudaArray* donors;
+    CudaArray* acceptors;
+    CudaArray* donorExclusions;
+    CudaArray* acceptorExclusions;
+    CudaArray* tabulatedFunctionParams;
+    std::vector<std::string> globalParamNames;
+    std::vector<float> globalParamValues;
+    std::vector<CudaArray*> tabulatedFunctions;
+    std::vector<void*> donorArgs, acceptorArgs;
+    System& system;
+    CUfunction donorKernel, acceptorKernel;
+};

 /**
 * This kernel is invoked by CustomCompoundBondForce to calculate the forces acting on the system.
@@ -1062,94 +1059,98 @@ private:
    double prevTemp, prevFriction, prevErrorTol;
 };

-///**
-// * This kernel is invoked by CustomIntegrator to take one time step.
-// */
-//class CudaIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
-//public:
-//    CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateCustomStepKernel(name, platform), cu(cu),
-//            hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), contextParameterValues(NULL), sumBuffer(NULL), energy(NULL),
-//            uniformRandoms(NULL), randomSeed(NULL), perDofValues(NULL) {
-//    }
-//    ~CudaIntegrateCustomStepKernel();
-//    /**
-//     * Initialize the kernel.
-//     * 
-//     * @param system     the System this kernel will be applied to
-//     * @param integrator the CustomIntegrator this kernel will be used for
-//     */
-//    void initialize(const System& system, const CustomIntegrator& integrator);
-//    /**
-//     * Execute the kernel.
-//     * 
-//     * @param context    the context in which to execute this kernel
-//     * @param integrator the CustomIntegrator this kernel is being used for
-//     * @param forcesAreValid if the context has been modified since the last time step, this will be
-//     *                       false to show that cached forces are invalid and must be recalculated.
-//     *                       On exit, this should specify whether the cached forces are valid at the
-//     *                       end of the step.
-//     */
-//    void execute(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
-//    /**
-//     * Get the values of all global variables.
-//     *
-//     * @param context   the context in which to execute this kernel
-//     * @param values    on exit, this contains the values
-//     */
-//    void getGlobalVariables(ContextImpl& context, std::vector<double>& values) const;
-//    /**
-//     * Set the values of all global variables.
-//     *
-//     * @param context   the context in which to execute this kernel
-//     * @param values    a vector containing the values
-//     */
-//    void setGlobalVariables(ContextImpl& context, const std::vector<double>& values);
-//    /**
-//     * Get the values of a per-DOF variable.
-//     *
-//     * @param context   the context in which to execute this kernel
-//     * @param variable  the index of the variable to get
-//     * @param values    on exit, this contains the values
-//     */
-//    void getPerDofVariable(ContextImpl& context, int variable, std::vector<Vec3>& values) const;
-//    /**
-//     * Set the values of a per-DOF variable.
-//     *
-//     * @param context   the context in which to execute this kernel
-//     * @param variable  the index of the variable to get
-//     * @param values    a vector containing the values
-//     */
-//    void setPerDofVariable(ContextImpl& context, int variable, const std::vector<Vec3>& values);
-//private:
-//    class ReorderListener;
-//    std::string createGlobalComputation(const std::string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator, const std::string& energyName);
-//    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
-//    void recordChangedParameters(ContextImpl& context);
-//    CudaContext& cu;
-//    double prevStepSize;
-//    int numGlobalVariables;
-//    bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters;
-//    mutable bool localValuesAreCurrent;
-//    CudaArray<cl_float>* globalValues;
-//    CudaArray<cl_float>* contextParameterValues;
-//    CudaArray<cl_float>* sumBuffer;
-//    CudaArray<cl_float>* energy;
-//    CudaArray<mm_float4>* uniformRandoms;
-//    CudaArray<mm_int4>* randomSeed;
-//    CudaParameterSet* perDofValues;
-//    mutable std::vector<std::vector<cl_float> > localPerDofValues;
-//    std::vector<std::vector<CUfunction> > kernels;
-//    CUfunction sumEnergyKernel, randomKernel;
-//    std::vector<CustomIntegrator::ComputationType> stepType;
-//    std::vector<bool> needsForces;
-//    std::vector<bool> needsEnergy;
-//    std::vector<bool> invalidatesForces;
-//    std::vector<bool> merged;
-//    std::vector<int> forceGroup;
-//    std::vector<int> requiredGaussian;
-//    std::vector<int> requiredUniform;
-//    std::vector<std::string> parameterNames;
-//};
+/**
+ * This kernel is invoked by CustomIntegrator to take one time step.
+ */
+class CudaIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
+public:
+    CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateCustomStepKernel(name, platform), cu(cu),
+            hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), contextParameterValues(NULL), sumBuffer(NULL), energy(NULL),
+            uniformRandoms(NULL), randomSeed(NULL), perDofValues(NULL) {
+    }
+    ~CudaIntegrateCustomStepKernel();
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param integrator the CustomIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const CustomIntegrator& integrator);
+    /**
+     * Execute the kernel.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the CustomIntegrator this kernel is being used for
+     * @param forcesAreValid if the context has been modified since the last time step, this will be
+     *                       false to show that cached forces are invalid and must be recalculated.
+     *                       On exit, this should specify whether the cached forces are valid at the
+     *                       end of the step.
+     */
+    void execute(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
+    /**
+     * Get the values of all global variables.
+     *
+     * @param context   the context in which to execute this kernel
+     * @param values    on exit, this contains the values
+     */
+    void getGlobalVariables(ContextImpl& context, std::vector<double>& values) const;
+    /**
+     * Set the values of all global variables.
+     *
+     * @param context   the context in which to execute this kernel
+     * @param values    a vector containing the values
+     */
+    void setGlobalVariables(ContextImpl& context, const std::vector<double>& values);
+    /**
+     * Get the values of a per-DOF variable.
+     *
+     * @param context   the context in which to execute this kernel
+     * @param variable  the index of the variable to get
+     * @param values    on exit, this contains the values
+     */
+    void getPerDofVariable(ContextImpl& context, int variable, std::vector<Vec3>& values) const;
+    /**
+     * Set the values of a per-DOF variable.
+     *
+     * @param context   the context in which to execute this kernel
+     * @param variable  the index of the variable to set
+     * @param values    a vector containing the values
+     */
+    void setPerDofVariable(ContextImpl& context, int variable, const std::vector<Vec3>& values);
+private:
+    class ReorderListener;
+    std::string createGlobalComputation(const std::string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator, const std::string& energyName);
+    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
+    void recordChangedParameters(ContextImpl& context);
+    CudaContext& cu;
+    double prevStepSize;
+    int numGlobalVariables;
+    bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters;
+    mutable bool localValuesAreCurrent;
+    CudaArray* globalValues;
+    CudaArray* contextParameterValues;
+    CudaArray* sumBuffer;
+    CudaArray* energy;
+    CudaArray* uniformRandoms;
+    CudaArray* randomSeed;
+    CudaParameterSet* perDofValues;
+    mutable std::vector<std::vector<float> > localPerDofValuesFloat;
+    mutable std::vector<std::vector<double> > localPerDofValuesDouble;
+    std::vector<float> contextValuesFloat;
+    std::vector<double> contextValuesDouble;
+    std::vector<std::vector<CUfunction> > kernels;
+    std::vector<std::vector<std::vector<void*> > > kernelArgs;
+    CUfunction sumEnergyKernel, randomKernel;
+    std::vector<CustomIntegrator::ComputationType> stepType;
+    std::vector<bool> needsForces;
+    std::vector<bool> needsEnergy;
+    std::vector<bool> invalidatesForces;
+    std::vector<bool> merged;
+    std::vector<int> forceGroup;
+    std::vector<int> requiredGaussian;
+    std::vector<int> requiredUniform;
+    std::vector<std::string> parameterNames;
+};

 /**
 * This kernel is invoked by AndersenThermostat at the start of each time step to adjust the particle velocities.

--- a/platforms/cuda2/src/CudaParameterSet.cpp
+++ b/platforms/cuda2/src/CudaParameterSet.cpp
@@ -39,11 +39,12 @@ using namespace std;
        throw OpenMMException(m.str());\
    }

-CudaParameterSet::CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter) :
+CudaParameterSet::CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter, bool useDoublePrecision) :
            context(context), numParameters(numParameters), numObjects(numObjects), name(name) {
    int params = numParameters;
    int bufferCount = 0;
-    int elementSize = 4;
+    elementSize = (useDoublePrecision ? sizeof(double) : sizeof(float));
+    string elementType = (useDoublePrecision ? "double" : "float");
    CUdeviceptr pointer;
    string errorMessage = "Error creating parameter set "+name;
    if (!bufferPerParameter) {
@@ -51,14 +52,14 @@ CudaParameterSet::CudaParameterSet(CudaContext& context, int numParameters, int
            CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize*4));
            std::stringstream name;
            name << "param" << (++bufferCount);
-            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), "float", 4, elementSize*4, pointer));
+            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), elementType, 4, elementSize*4, pointer));
            params -= 4;
        }
        if (params > 1) {
            CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize*2));
            std::stringstream name;
            name << "param" << (++bufferCount);
-            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), "float", 2, elementSize*2, pointer));
+            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), elementType, 2, elementSize*2, pointer));
            params -= 2;
        }
    }
@@ -66,50 +67,55 @@ CudaParameterSet::CudaParameterSet(CudaContext& context, int numParameters, int
            CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize));
        std::stringstream name;
        name << "param" << (++bufferCount);
-        buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), "float", 1, elementSize, pointer));
+        buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), elementType, 1, elementSize, pointer));
        params--;
    }
 }

 CudaParameterSet::~CudaParameterSet() {
+    if (context.getContextIsValid()) {
        string errorMessage = "Error freeing device memory";
        for (int i = 0; i < (int) buffers.size(); i++)
            CHECK_RESULT(cuMemFree(buffers[i].getMemory()));
+    }
 }

-void CudaParameterSet::getParameterValues(vector<vector<float> >& values) {
+template <class T>
+void CudaParameterSet::getParameterValues(vector<vector<T> >& values) {
+    if (sizeof(T) != elementSize)
+        throw OpenMMException("Called getParameterValues() with vector of wrong type");
    values.resize(numObjects);
    for (int i = 0; i < numObjects; i++)
        values[i].resize(numParameters);
    int base = 0;
    string errorMessage = "Error downloading parameter set "+name;
    for (int i = 0; i < (int) buffers.size(); i++) {
-        if (buffers[i].getType() == "float4") {
-            vector<float4> data(numObjects);
+        if (buffers[i].getSize() == 4*elementSize) {
+            vector<T> data(4*numObjects);
            CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
            for (int j = 0; j < numObjects; j++) {
-                values[j][base] = data[j].x;
+                values[j][base] = data[4*j];
                if (base+1 < numParameters)
-                    values[j][base+1] = data[j].y;
+                    values[j][base+1] = data[4*j+1];
                if (base+2 < numParameters)
-                    values[j][base+2] = data[j].z;
+                    values[j][base+2] = data[4*j+2];
                if (base+3 < numParameters)
-                    values[j][base+3] = data[j].w;
+                    values[j][base+3] = data[4*j+3];
            }
            base += 4;
        }
-        else if (buffers[i].getType() == "float2") {
-            vector<float2> data(numObjects);
+        else if (buffers[i].getSize() == 2*elementSize) {
+            vector<T> data(2*numObjects);
            CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
            for (int j = 0; j < numObjects; j++) {
-                values[j][base] = data[j].x;
+                values[j][base] = data[2*j];
                if (base+1 < numParameters)
-                    values[j][base+1] = data[j].y;
+                    values[j][base+1] = data[2*j+1];
            }
            base += 2;
        }
-        else if (buffers[i].getType() == "float") {
-            vector<float> data(numObjects);
+        else if (buffers[i].getSize() == elementSize) {
+            vector<T> data(numObjects);
            CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
            for (int j = 0; j < numObjects; j++)
                values[j][base] = data[j];
@@ -120,36 +126,39 @@ void CudaParameterSet::getParameterValues(vector<vector<float> >& values) {
    }
 }

-void CudaParameterSet::setParameterValues(const vector<vector<float> >& values) {
+template <class T>
+void CudaParameterSet::setParameterValues(const vector<vector<T> >& values) {
+    if (sizeof(T) != elementSize)
+        throw OpenMMException("Called setParameterValues() with vector of wrong type");
    int base = 0;
    string errorMessage = "Error uploading parameter set "+name;
    for (int i = 0; i < (int) buffers.size(); i++) {
-        if (buffers[i].getType() == "float4") {
-            vector<float4> data(numObjects);
+        if (buffers[i].getSize() == 4*elementSize) {
+            vector<T> data(4*numObjects);
            for (int j = 0; j < numObjects; j++) {
-                data[j].x = values[j][base];
+                data[4*j] = values[j][base];
                if (base+1 < numParameters)
-                    data[j].y = values[j][base+1];
+                    data[4*j+1] = values[j][base+1];
                if (base+2 < numParameters)
-                    data[j].z = values[j][base+2];
+                    data[4*j+2] = values[j][base+2];
                if (base+3 < numParameters)
-                    data[j].w = values[j][base+3];
+                    data[4*j+3] = values[j][base+3];
            }
            CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
            base += 4;
        }
-        else if (buffers[i].getType() == "float2") {
-            vector<float2> data(numObjects);
+        else if (buffers[i].getSize() == 2*elementSize) {
+            vector<T> data(2*numObjects);
            for (int j = 0; j < numObjects; j++) {
-                data[j].x = values[j][base];
+                data[2*j] = values[j][base];
                if (base+1 < numParameters)
-                    data[j].y = values[j][base+1];
+                    data[2*j+1] = values[j][base+1];
            }
            CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
            base += 2;
        }
-        else if (buffers[i].getType() == "float") {
-            vector<float> data(numObjects);
+        else if (buffers[i].getSize() == elementSize) {
+            vector<T> data(numObjects);
            for (int j = 0; j < numObjects; j++)
                data[j] = values[j][base];
            CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
@@ -164,16 +173,26 @@ string CudaParameterSet::getParameterSuffix(int index, const std::string& extraS
    const string suffixes[] = {".x", ".y", ".z", ".w"};
    int buffer = -1;
    for (int i = 0; buffer == -1 && i < (int) buffers.size(); i++) {
-        if (index*sizeof(float) < buffers[i].getSize())
+        if (index*elementSize < buffers[i].getSize())
            buffer = i;
        else
-            index -= buffers[i].getSize()/sizeof(float);
+            index -= buffers[i].getSize()/elementSize;
    }
    if (buffer == -1)
        throw OpenMMException("Internal error: Illegal argument to CudaParameterSet::getParameterSuffix() ("+name+")");
    stringstream suffix;
    suffix << (buffer+1) << extraSuffix;
-    if (buffers[buffer].getType() != "float")
+    if (buffers[buffer].getSize() != elementSize)
        suffix << suffixes[index];
    return suffix.str();
 }
+
+/**
+ * Explicitly instantiate the float and double versions of getParameterValues() and setParameterValues(), since these templates are defined in this source file rather than in the header.
+ */
+namespace OpenMM {
+template void CudaParameterSet::getParameterValues<float>(vector<vector<float> >& values);
+template void CudaParameterSet::setParameterValues<float>(const vector<vector<float> >& values);
+template void CudaParameterSet::getParameterValues<double>(vector<vector<double> >& values);
+template void CudaParameterSet::setParameterValues<double>(const vector<vector<double> >& values);
+}
\ No newline at end of file
--- a/platforms/cuda2/src/CudaParameterSet.h
+++ b/platforms/cuda2/src/CudaParameterSet.h
@@ -51,8 +51,9 @@ public:
     * @param name             the name of the parameter set
     * @param bufferPerParameter  if true, a separate buffer is created for each parameter.  If false,
     *                            multiple parameters may be combined into a single buffer.
+     * @param useDoublePrecision  if true, values are stored in double precision.  If false, they are stored in single precision.
     */
-    CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false);
+    CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false, bool useDoublePrecision=false);
    ~CudaParameterSet();
    /**
     * Get the number of parameters.
@@ -71,13 +72,15 @@ public:
     *
     * @param values on exit, values[i][j] contains the value of parameter j for object i
     */
-    void getParameterValues(std::vector<std::vector<float> >& values);
+    template <class T>
+    void getParameterValues(std::vector<std::vector<T> >& values);
    /**
     * Set the values of all parameters.
     *
     * @param values values[i][j] contains the value of parameter j for object i
     */
-    void setParameterValues(const std::vector<std::vector<float> >& values);
+    template <class T>
+    void setParameterValues(const std::vector<std::vector<T> >& values);
    /**
     * Get a set of CudaNonbondedUtilities::ParameterInfo objects which describe the Buffers
     * containing the data.
@@ -95,8 +98,7 @@ public:
    std::string getParameterSuffix(int index, const std::string& extraSuffix = "") const;
 private:
    CudaContext& context;
-    int numParameters;
-    int numObjects;
+    int numParameters, numObjects, elementSize;
    std::string name;
    std::vector<CudaNonbondedUtilities::ParameterInfo> buffers;
 };

--- a/platforms/cuda2/src/kernels/customHbondForce.cu
+++ b/platforms/cuda2/src/kernels/customHbondForce.cu
+/**
+ * Build a real3 from the x, y, and z components of a real4, discarding w.
+ */
+inline __device__ real3 trim(real4 v) {
+    real3 xyz = make_real3(v.x, v.y, v.z);
+    return xyz;
+}
+
+/**
+ * Overload for a value that is already a real3.  It returns its argument unchanged, and exists only to
+ * simplify the code generation: generated code can call trim() on either vector type.
+ */
+inline __device__ real3 trim(real3 v) {
+    // Nothing to remove.
+    return v;
+}
+
+/**
+ * Return vec1-vec2 in the x, y, and z components, with the squared length of that difference in w.
+ */
+inline __device__ real4 delta(real4 vec1, real4 vec2) {
+    const real dx = vec1.x-vec2.x;
+    const real dy = vec1.y-vec2.y;
+    const real dz = vec1.z-vec2.z;
+    return make_real4(dx, dy, dz, dx*dx + dy*dy + dz*dz);
+}
+
+/**
+ * Return vec1-vec2 in the x, y, and z components, with the squared length of the result in w.
+ * When USE_PERIODIC is defined, each component is first shifted by
+ * floor(component*invPeriodicBoxSize+0.5) multiples of periodicBoxSize along that axis.
+ */
+inline __device__ real4 deltaPeriodic(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+    real dx = vec1.x-vec2.x;
+    real dy = vec1.y-vec2.y;
+    real dz = vec1.z-vec2.z;
+#ifdef USE_PERIODIC
+    dx -= floor(dx*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+    dy -= floor(dy*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+    dz -= floor(dz*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+    return make_real4(dx, dy, dz, dx*dx + dy*dy + dz*dz);
+}
+
+/**
+ * Return the angle between vec1 and vec2.  Both arguments must carry their squared magnitude in w.
+ */
+inline __device__ real computeAngle(real4 vec1, real4 vec2) {
+    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
+    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
+    const bool nearSingularity = (cosine > 0.99f || cosine < -0.99f);
+    if (!nearSingularity)
+        return acos(cosine);
+
+    // acos() is singular close to +/-1, so recover the angle from the length of the cross product
+    // with asin() instead.
+
+    real3 crossProduct = cross(vec1, vec2);
+    real sineSquared = dot(crossProduct, crossProduct)/(vec1.w*vec2.w);
+    real angle = asin(SQRT(sineSquared));
+    if (cosine < 0.0f)
+        angle = M_PI-angle;
+    return angle;
+}
+
+/**
+ * Return the cross product of vec1 and vec2 in the x, y, and z components, with its squared length in w.
+ */
+inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
+    real3 product = cross(vec1, vec2);
+    real lengthSquared = product.x*product.x + product.y*product.y + product.z*product.z;
+    return make_real4(product.x, product.y, product.z, lengthSquared);
+}
+
+/**
+ * Compute forces on donors.
+ *
+ * On each pass of the outer loop a thread handles one donor and visits every acceptor.  The block stages
+ * acceptors through posBuffer in tiles of up to blockDim.x, storing three atom positions per acceptor, so
+ * the dynamically sized shared memory must hold at least 3*blockDim.x real4 values.  The interaction itself
+ * is supplied by the COMPUTE_DONOR_FORCE macro, which is defined outside this file (presumably it
+ * accumulates into energy, f1, f2, and f3 -- confirm against the code generator).
+ *
+ * Forces are converted to fixed point and added to the force buffer with atomicAdd().  Each thread adds its
+ * energy to its own element of energyBuffer.
+ */
+extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize
+        PARAMETER_ARGUMENTS) {
+    extern __shared__ real4 posBuffer[];
+    real energy = 0;
+    for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x*gridDim.x) {
+        // The force accumulators belong to a single donor, so they must start from zero on every pass.
+        // Otherwise a second pass would add the previous donor's forces to this donor's atoms.
+
+        real3 f1 = make_real3(0);
+        real3 f2 = make_real3(0);
+        real3 f3 = make_real3(0);
+
+        // Load information about the donor this thread will compute forces on.
+
+        int donorIndex = donorStart+blockIdx.x*blockDim.x+threadIdx.x;
+        int4 atoms, exclusionIndices;
+        real4 d1, d2, d3;
+        if (donorIndex < NUM_DONORS) {
+            atoms = donorAtoms[donorIndex];
+            d1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
+            d2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
+            d3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
+#ifdef USE_EXCLUSIONS
+            exclusionIndices = exclusions[donorIndex];
+#endif
+        }
+        else
+            atoms = make_int4(-1, -1, -1, -1);
+        for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x) {
+            // Wait until every thread has finished reading the previous tile before overwriting it.
+            // The loop bounds are uniform across the block, so all threads reach this barrier.
+
+            __syncthreads();
+
+            // Load the next block of acceptors into local memory.
+
+            int blockSize = min((int) blockDim.x, NUM_ACCEPTORS-acceptorStart);
+            if (threadIdx.x < blockSize) {
+                int4 atoms2 = acceptorAtoms[acceptorStart+threadIdx.x];
+                posBuffer[3*threadIdx.x] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
+                posBuffer[3*threadIdx.x+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
+                posBuffer[3*threadIdx.x+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
+            }
+            __syncthreads();
+            if (donorIndex < NUM_DONORS) {
+                for (int index = 0; index < blockSize; index++) {
+#ifdef USE_EXCLUSIONS
+                    int acceptorIndex = acceptorStart+index;
+                    if (acceptorIndex == exclusionIndices.x || acceptorIndex == exclusionIndices.y || acceptorIndex == exclusionIndices.z || acceptorIndex == exclusionIndices.w)
+                        continue;
+#endif
+                    // Compute the interaction between a donor and an acceptor.
+
+                    real4 a1 = posBuffer[3*index];
+                    real4 a2 = posBuffer[3*index+1];
+                    real4 a3 = posBuffer[3*index+2];
+                    real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
+#ifdef USE_CUTOFF
+                    if (deltaD1A1.w < CUTOFF_SQUARED) {
+#endif
+                        COMPUTE_DONOR_FORCE
+#ifdef USE_CUTOFF
+                    }
+#endif
+                }
+            }
+        }
+
+        // Write results.  Each force component lives in its own section of the buffer, PADDED_NUM_ATOMS
+        // elements apart, and is stored as a fixed point value.
+
+        if (donorIndex < NUM_DONORS) {
+            if (atoms.x > -1) {
+                atomicAdd(&force[atoms.x], static_cast<unsigned long long>((long long) (f1.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+            if (atoms.y > -1) {
+                atomicAdd(&force[atoms.y], static_cast<unsigned long long>((long long) (f2.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+            if (atoms.z > -1) {
+                atomicAdd(&force[atoms.z], static_cast<unsigned long long>((long long) (f3.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.z+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.z+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+        }
+    }
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
+/**
+ * Compute forces on acceptors.
+ *
+ * On each pass of the outer loop a thread handles one acceptor and visits every donor.  The block stages
+ * donors through posBuffer in tiles of up to blockDim.x, storing three atom positions per donor, so the
+ * dynamically sized shared memory must hold at least 3*blockDim.x real4 values.  The interaction itself is
+ * supplied by the COMPUTE_ACCEPTOR_FORCE macro, which is defined outside this file (presumably it
+ * accumulates into f1, f2, and f3 -- confirm against the code generator).
+ *
+ * Forces are converted to fixed point and added to the force buffer with atomicAdd().  This kernel does not
+ * write to energyBuffer.
+ */
+extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize
+        PARAMETER_ARGUMENTS) {
+    extern __shared__ real4 posBuffer[];
+    for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x*gridDim.x) {
+        // The force accumulators belong to a single acceptor, so they must start from zero on every pass.
+        // Otherwise a second pass would add the previous acceptor's forces to this acceptor's atoms.
+
+        real3 f1 = make_real3(0);
+        real3 f2 = make_real3(0);
+        real3 f3 = make_real3(0);
+
+        // Load information about the acceptor this thread will compute forces on.
+
+        int acceptorIndex = acceptorStart+blockIdx.x*blockDim.x+threadIdx.x;
+        int4 atoms, exclusionIndices;
+        real4 a1, a2, a3;
+        if (acceptorIndex < NUM_ACCEPTORS) {
+            atoms = acceptorAtoms[acceptorIndex];
+            a1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
+            a2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
+            a3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
+#ifdef USE_EXCLUSIONS
+            exclusionIndices = exclusions[acceptorIndex];
+#endif
+        }
+        else
+            atoms = make_int4(-1, -1, -1, -1);
+        for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x) {
+            // Wait until every thread has finished reading the previous tile before overwriting it.
+            // The loop bounds are uniform across the block, so all threads reach this barrier.
+
+            __syncthreads();
+
+            // Load the next block of donors into local memory.
+
+            int blockSize = min((int) blockDim.x, NUM_DONORS-donorStart);
+            if (threadIdx.x < blockSize) {
+                int4 atoms2 = donorAtoms[donorStart+threadIdx.x];
+                posBuffer[3*threadIdx.x] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
+                posBuffer[3*threadIdx.x+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
+                posBuffer[3*threadIdx.x+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
+            }
+            __syncthreads();
+            if (acceptorIndex < NUM_ACCEPTORS) {
+                for (int index = 0; index < blockSize; index++) {
+#ifdef USE_EXCLUSIONS
+                    int donorIndex = donorStart+index;
+                    if (donorIndex == exclusionIndices.x || donorIndex == exclusionIndices.y || donorIndex == exclusionIndices.z || donorIndex == exclusionIndices.w)
+                        continue;
+#endif
+                    // Compute the interaction between a donor and an acceptor.
+
+                    real4 d1 = posBuffer[3*index];
+                    real4 d2 = posBuffer[3*index+1];
+                    real4 d3 = posBuffer[3*index+2];
+                    real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
+#ifdef USE_CUTOFF
+                    if (deltaD1A1.w < CUTOFF_SQUARED) {
+#endif
+                        COMPUTE_ACCEPTOR_FORCE
+#ifdef USE_CUTOFF
+                    }
+#endif
+                }
+            }
+        }
+
+        // Write results.  Each force component lives in its own section of the buffer, PADDED_NUM_ATOMS
+        // elements apart, and is stored as a fixed point value.
+
+        if (acceptorIndex < NUM_ACCEPTORS) {
+            if (atoms.x > -1) {
+                atomicAdd(&force[atoms.x], static_cast<unsigned long long>((long long) (f1.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+            if (atoms.y > -1) {
+                atomicAdd(&force[atoms.y], static_cast<unsigned long long>((long long) (f2.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+            if (atoms.z > -1) {
+                atomicAdd(&force[atoms.z], static_cast<unsigned long long>((long long) (f3.x*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.z+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.y*0xFFFFFFFF)));
+                atomicAdd(&force[atoms.z+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.z*0xFFFFFFFF)));
+                __threadfence_block();
+            }
+        }
+    }
+}
--- a/platforms/cuda2/src/kernels/customIntegrator.cu
+++ b/platforms/cuda2/src/kernels/customIntegrator.cu
+/**
+ * Add up the first SUM_BUFFER_SIZE elements of sumBuffer and store the total in result[SUM_OUTPUT_INDEX].
+ * The partial sums are combined in a shared array of WORK_GROUP_SIZE elements indexed by threadIdx.x, so
+ * the block must not contain more than WORK_GROUP_SIZE threads.
+ */
+extern "C" __global__ void computeSum(const real* __restrict__ sumBuffer, real* result) {
+    __shared__ real partialSums[WORK_GROUP_SIZE];
+    const unsigned int tid = threadIdx.x;
+
+    // Every thread totals its own strided subset of the input.
+
+    real threadTotal = 0;
+    for (unsigned int element = tid; element < SUM_BUFFER_SIZE; element += blockDim.x)
+        threadTotal += sumBuffer[element];
+    partialSums[tid] = threadTotal;
+
+    // Combine the per-thread totals pairwise, doubling the distance between partners on each round.
+
+    for (int offset = 1; offset < WORK_GROUP_SIZE; offset *= 2) {
+        __syncthreads();
+        const bool receives = (tid%(offset*2) == 0);
+        if (receives && tid+offset < WORK_GROUP_SIZE)
+            partialSums[tid] += partialSums[tid+offset];
+    }
+    if (tid == 0)
+        result[SUM_OUTPUT_INDEX] = partialSums[0];
+}
+
+/**
+ * Add the x, y, and z components of posDelta to posq for each of the first NUM_ATOMS elements, then reset
+ * that element of posDelta to zero.  The w component of posq is left unchanged.
+ */
+extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posDelta) {
+    const unsigned int firstAtom = blockIdx.x*blockDim.x+threadIdx.x;
+    const unsigned int totalThreads = blockDim.x*gridDim.x;
+    for (unsigned int atom = firstAtom; atom < NUM_ATOMS; atom += totalThreads) {
+        const real4 shift = posDelta[atom];
+        real4 updated = posq[atom];
+        updated.x += shift.x;
+        updated.y += shift.y;
+        updated.z += shift.z;
+        posq[atom] = updated;
+        posDelta[atom] = make_real4(0, 0, 0, 0);
+    }
+}
+
+/**
+ * Fill the first NUM_ATOMS elements of random with uniform random values.  Each element receives three
+ * values in x, y, and z (w is set to 0), each of which is at least 1/0xffffffff.
+ *
+ * Every thread reads its generator state from its own element of seed and writes the advanced state back
+ * when it is done, so seed must contain one element per launched thread.
+ */
+extern "C" __global__ void generateRandomNumbers(float4* __restrict__ random, uint4* __restrict__ seed) {
+    const unsigned int threadIndex = blockIdx.x*blockDim.x+threadIdx.x;
+    uint4 state = seed[threadIndex];
+    unsigned int carry = 0;
+    for (int index = threadIndex; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        // Generate three uniform random numbers, advancing the generator once for each of them.
+
+        float values[3];
+        #pragma unroll
+        for (int draw = 0; draw < 3; draw++) {
+            // Linear congruential step on x and xor-shift step on y.
+
+            state.x = state.x * 69069 + 1;
+            state.y ^= state.y << 13;
+            state.y ^= state.y >> 17;
+            state.y ^= state.y << 5;
+
+            // Recurrence on z and w that also produces the carry for the next step.
+
+            unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+            unsigned int m = state.w + state.w + state.z + carry;
+            state.z = state.w;
+            state.w = m;
+            carry = k >> 30;
+
+            // Combine the three parts, clamping to at least 1 so the result is never exactly 0.
+
+            values[draw] = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        }
+
+        // Record the values.
+
+        random[index] = make_float4(values[0], values[1], values[2], 0.0f);
+    }
+    seed[threadIndex] = state;
+}
--- a/platforms/cuda2/src/kernels/customIntegratorGlobal.cu
+++ b/platforms/cuda2/src/kernels/customIntegratorGlobal.cu
+/**
+ * Kernel shell for a CustomIntegrator computation.  The whole body is supplied by the COMPUTE_STEP macro,
+ * which is defined outside this file, so what is read and written depends entirely on that expansion.
+ * NOTE(review): the kernel contains no thread indexing of its own, so it presumably expects to be run by a
+ * single thread unless COMPUTE_STEP adds its own guard -- confirm against the host code.
+ */
+extern "C" __global__ void computeGlobal(real2* __restrict__ dt, real* __restrict__ globals, real* __restrict__ params,
+        float uniform, float gaussian, const real* __restrict__ energy) {
+    COMPUTE_STEP
+}
--- a/platforms/cuda2/src/kernels/customIntegratorPerDof.cu
+++ b/platforms/cuda2/src/kernels/customIntegratorPerDof.cu
+/**
+ * Convert a real4 to a double4, component by component.
+ */
+inline __device__ double4 convertToDouble4(real4 a) {
+    double4 converted = make_double4(a.x, a.y, a.z, a.w);
+    return converted;
+}
+
+/**
+ * Convert a double4 to a real4, component by component.
+ */
+inline __device__ real4 convertFromDouble4(double4 a) {
+    real4 converted = make_real4(a.x, a.y, a.z, a.w);
+    return converted;
+}
+
+/**
+ * Kernel shell for a per-atom CustomIntegrator computation.  For each of the first NUM_ATOMS atoms it loads
+ * the position (plus posDelta when LOAD_POS_AS_DELTA is defined), velocity, and force in double precision
+ * and then runs the COMPUTE_STEP macro, which is defined outside this file.  Atoms whose velm.w is zero are
+ * skipped.
+ *
+ * The local variable names are kept exactly as they are because COMPUTE_STEP may refer to any of them.
+ */
+extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posDelta, real4* __restrict__ velm,
+        const long long* __restrict__ force, const real2* __restrict__ dt, const real* __restrict__ globals,
+        const real* __restrict__ params, real* __restrict__ sum, const float4* __restrict__ gaussianValues,
+        unsigned int randomIndex, const float4* __restrict__ uniformValues, const real* __restrict__ energy
+        PARAMETER_ARGUMENTS) {
+    real stepSize = dt[0].y;
+    const double forceScale = 1.0/0xFFFFFFFF;
+    const unsigned int totalThreads = blockDim.x*gridDim.x;
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    randomIndex += index;
+    for (; index < NUM_ATOMS; index += totalThreads, randomIndex += totalThreads) {
+#ifdef LOAD_POS_AS_DELTA
+        double4 position = convertToDouble4(posq[index]+posDelta[index]);
+#else
+        double4 position = convertToDouble4(posq[index]);
+#endif
+        double4 velocity = convertToDouble4(velm[index]);
+
+        // The force buffer holds fixed point values, one section of PADDED_NUM_ATOMS elements per component.
+
+        double4 f = make_double4(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2], 0.0);
+
+        // velocity.w is treated as the inverse mass.  The reciprocal is only used when it is nonzero.
+
+        double mass = 1.0/velocity.w;
+        if (velocity.w != 0.0) {
+            float4 gaussian = gaussianValues[randomIndex];
+            float4 uniform = uniformValues[index];
+            COMPUTE_STEP
+        }
+    }
+}
--- a/platforms/cuda2/src/kernels/customNonbonded.cu
+++ b/platforms/cuda2/src/kernels/customNonbonded.cu
+// Source fragment for a single pair interaction.  It is not a complete function: isExcluded, r2, invR, and
+// dEdR must already be declared by the code this fragment is inserted into.
+#ifdef USE_CUTOFF
+// With a cutoff, the pair is also skipped when its squared distance is not below CUTOFF_SQUARED.
+if (!isExcluded && r2 < CUTOFF_SQUARED) {
+#else
+if (!isExcluded) {
+#endif
+    real tempForce = 0;
+    // COMPUTE_FORCE is defined outside this file; presumably it sets tempForce (and the energy) from the
+    // user's expression -- confirm against the code generator.
+    COMPUTE_FORCE
+    // Scale by 1/r before accumulating into dEdR.
+    dEdR += tempForce*invR;
+}
--- a/platforms/cuda2/tests/TestCudaCustomHbondForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomHbondForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CUDA implementation of CustomHbondForce.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/CustomHbondForce.h"
+#include "openmm/HarmonicAngleForce.h"
+#include "openmm/HarmonicBondForce.h"
+#include "openmm/PeriodicTorsionForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+const double TOL = 1e-5;
+
+/**
+ * Check a CustomHbondForce with bond, angle, and torsion terms against the same interactions built from
+ * HarmonicBondForce, HarmonicAngleForce, and PeriodicTorsionForce.  Forces and energies must agree for ten
+ * sets of random positions, and again after the parameters have been modified and pushed to the contexts.
+ */
+void testHbond() {
+    CudaPlatform platform;
+    const int numParticles = 5;
+
+    // Create a system using a CustomHbondForce.
+
+    System customSystem;
+    for (int particle = 0; particle < numParticles; particle++)
+        customSystem.addParticle(1.0);
+    CustomHbondForce* custom = new CustomHbondForce("0.5*kr*(distance(d1,a1)-r0)^2 + 0.5*ktheta*(angle(a1,d1,d2)-theta0)^2 + 0.5*kpsi*(angle(d1,a1,a2)-psi0)^2 + kchi*(1+cos(n*dihedral(a3,a2,a1,d1)-chi0))");
+    custom->addPerDonorParameter("r0");
+    custom->addPerDonorParameter("theta0");
+    custom->addPerDonorParameter("psi0");
+    custom->addPerAcceptorParameter("chi0");
+    custom->addPerAcceptorParameter("n");
+    custom->addGlobalParameter("kr", 0.4);
+    custom->addGlobalParameter("ktheta", 0.5);
+    custom->addGlobalParameter("kpsi", 0.6);
+    custom->addGlobalParameter("kchi", 0.7);
+    const double initialDonorValues[] = {1.5, 1.7, 1.9};
+    vector<double> donorParameters(initialDonorValues, initialDonorValues+3);
+    custom->addDonor(1, 0, -1, donorParameters);
+    const double initialAcceptorValues[] = {2.1, 2};
+    vector<double> acceptorParameters(initialAcceptorValues, initialAcceptorValues+2);
+    custom->addAcceptor(2, 3, 4, acceptorParameters);
+    custom->setCutoffDistance(10.0);
+    customSystem.addForce(custom);
+
+    // Create an identical system using HarmonicBondForce, HarmonicAngleForce, and PeriodicTorsionForce.
+
+    System standardSystem;
+    for (int particle = 0; particle < numParticles; particle++)
+        standardSystem.addParticle(1.0);
+    HarmonicBondForce* bond = new HarmonicBondForce();
+    bond->addBond(1, 2, 1.5, 0.4);
+    standardSystem.addForce(bond);
+    HarmonicAngleForce* angle = new HarmonicAngleForce();
+    angle->addAngle(0, 1, 2, 1.7, 0.5);
+    angle->addAngle(1, 2, 3, 1.9, 0.6);
+    standardSystem.addForce(angle);
+    PeriodicTorsionForce* torsion = new PeriodicTorsionForce();
+    torsion->addTorsion(1, 2, 3, 4, 2, 2.1, 0.7);
+    standardSystem.addForce(torsion);
+
+    // Set the atoms in various positions, and verify that both systems give identical forces and energy.
+
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    vector<Vec3> positions(numParticles);
+    VerletIntegrator integrator1(0.01);
+    VerletIntegrator integrator2(0.01);
+    Context c1(customSystem, integrator1, platform);
+    Context c2(standardSystem, integrator2, platform);
+    const int stateTypes = State::Forces | State::Energy;
+    for (int trial = 0; trial < 10; trial++) {
+        for (int atom = 0; atom < (int) positions.size(); atom++) {
+            // The three coordinates are drawn in x, y, z order.
+
+            double x = 2.0*genrand_real2(sfmt);
+            double y = 2.0*genrand_real2(sfmt);
+            double z = 2.0*genrand_real2(sfmt);
+            positions[atom] = Vec3(x, y, z);
+        }
+        c1.setPositions(positions);
+        c2.setPositions(positions);
+        State customState = c1.getState(stateTypes);
+        State standardState = c2.getState(stateTypes);
+        for (int atom = 0; atom < customSystem.getNumParticles(); atom++)
+            ASSERT_EQUAL_VEC(standardState.getForces()[atom], customState.getForces()[atom], TOL);
+        ASSERT_EQUAL_TOL(standardState.getPotentialEnergy(), customState.getPotentialEnergy(), TOL);
+    }
+
+    // Try changing the parameters and make sure it's still correct.
+
+    donorParameters[0] = 1.4;
+    custom->setDonorParameters(0, 1, 0, -1, donorParameters);
+    acceptorParameters[0] = 2.2;
+    custom->setAcceptorParameters(0, 2, 3, 4, acceptorParameters);
+    bond->setBondParameters(0, 1, 2, 1.4, 0.4);
+    torsion->setTorsionParameters(0, 1, 2, 3, 4, 2, 2.2, 0.7);
+    custom->updateParametersInContext(c1);
+    bond->updateParametersInContext(c2);
+    torsion->updateParametersInContext(c2);
+    State customState = c1.getState(stateTypes);
+    State standardState = c2.getState(stateTypes);
+    for (int atom = 0; atom < customSystem.getNumParticles(); atom++)
+        ASSERT_EQUAL_VEC(standardState.getForces()[atom], customState.getForces()[atom], TOL);
+    ASSERT_EQUAL_TOL(standardState.getPotentialEnergy(), customState.getPotentialEnergy(), TOL);
+}
+
+/**
+ * Two donors and one acceptor, with the second donor excluded from interacting with the acceptor.  Only the
+ * first donor (particle 0) and the acceptor (particle 2) should feel a force.
+ */
+void testExclusions() {
+    CudaPlatform platform;
+    System system;
+    for (int particle = 0; particle < 3; particle++)
+        system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomHbondForce* custom = new CustomHbondForce("(distance(d1,a1)-1)^2");
+    const vector<double> noParameters;
+    custom->addDonor(0, 1, -1, noParameters);
+    custom->addDonor(1, 0, -1, noParameters);
+    custom->addAcceptor(2, 0, -1, noParameters);
+    custom->addExclusion(1, 0);
+    system.addForce(custom);
+    Context context(system, integrator, platform);
+
+    // Particles 1 and 2 both sit at distance 2 from particle 0, along different axes.
+
+    vector<Vec3> positions(3);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(0, 2, 0);
+    positions[2] = Vec3(2, 0, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(2, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-2, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(1.0, state.getPotentialEnergy(), TOL);
+}
+
+/**
+ * Two donors and one acceptor with a non-periodic cutoff of 2.5.  The second donor (particle 1) lies beyond
+ * the cutoff from the acceptor (particle 2), so only particles 0 and 2 should feel a force.
+ */
+void testCutoff() {
+    CudaPlatform platform;
+    System system;
+    for (int particle = 0; particle < 3; particle++)
+        system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomHbondForce* custom = new CustomHbondForce("(distance(d1,a1)-1)^2");
+    const vector<double> noParameters;
+    custom->addDonor(0, 1, -1, noParameters);
+    custom->addDonor(1, 0, -1, noParameters);
+    custom->addAcceptor(2, 0, -1, noParameters);
+    custom->setNonbondedMethod(CustomHbondForce::CutoffNonPeriodic);
+    custom->setCutoffDistance(2.5);
+    system.addForce(custom);
+    Context context(system, integrator, platform);
+
+    // Particle 0 is 2 away from the acceptor; particle 1 is sqrt(13) away.
+
+    vector<Vec3> positions(3);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(0, 3, 0);
+    positions[2] = Vec3(2, 0, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(2, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-2, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(1.0, state.getPotentialEnergy(), TOL);
+}
+
+/**
+ * Use a tabulated function in the energy expression.  The table holds the values 0 and 1 over the range
+ * 0 to 10, and the expected results correspond to foo(r) = 0.1*r for the two donor-acceptor pairs, which
+ * are both at distance 2.
+ */
+void testCustomFunctions() {
+    CudaPlatform platform;
+    System system;
+    for (int particle = 0; particle < 3; particle++)
+        system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomHbondForce* custom = new CustomHbondForce("foo(distance(d1,a1))");
+    const vector<double> noParameters;
+    custom->addDonor(1, 0, -1, noParameters);
+    custom->addDonor(2, 0, -1, noParameters);
+    custom->addAcceptor(0, 1, -1, noParameters);
+    vector<double> table(2);
+    table[0] = 0;
+    table[1] = 1;
+    custom->addFunction("foo", table, 0, 10);
+    system.addForce(custom);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(3);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(0, 2, 0);
+    positions[2] = Vec3(2, 0, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(0.1, 0.1, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, -0.1, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-0.1, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(0.1*2+0.1*2, state.getPotentialEnergy(), TOL);
+}
+
+/**
+ * Run every test in order.  Prints "Done" and returns 0 if they all pass; otherwise prints the message of
+ * the first exception and returns 1.
+ */
+int main() {
+    void (*const tests[])() = {testHbond, testExclusions, testCutoff, testCustomFunctions};
+    const int numTests = sizeof(tests)/sizeof(tests[0]);
+    try {
+        for (int test = 0; test < numTests; test++)
+            tests[test]();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
--- a/platforms/cuda2/tests/TestCudaCustomIntegrator.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomIntegrator.cpp
--- a/platforms/cuda2/tests/TestCudaCustomNonbondedForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomNonbondedForce.cpp