Merge https://github.com/openmm/openmm

5a06df78 · tic20 · 8dd60914 · a9223eea · 5a06df78 · 5a06df78
Commit 5a06df78 authored Mar 04, 2020 by tic20
20 changed files
--- a/platforms/cpu/src/CpuKernelFactory.cpp
+++ b/platforms/cpu/src/CpuKernelFactory.cpp
@@ -61,7 +61,7 @@ KernelImpl* CpuKernelFactory::createKernelImpl(std::string name, const Platform&
        return new CpuCalcGayBerneForceKernel(name, platform, data);
    if (name == IntegrateLangevinStepKernel::Name())
        return new CpuIntegrateLangevinStepKernel(name, platform, data);
-    if (name == IntegrateBAOABStepKernel::Name())
-        return new CpuIntegrateBAOABStepKernel(name, platform, data);
+    if (name == IntegrateLangevinMiddleStepKernel::Name())
+        return new CpuIntegrateLangevinMiddleStepKernel(name, platform, data);
    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '") + name + "'").c_str());
 }
--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -609,12 +609,16 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
        ewaldDispersionAlpha = alpha;
        useSwitchingFunction = false;
    }
+    if (nonbondedMethod == NoCutoff || nonbondedMethod == CutoffNonPeriodic)
+        exceptionsArePeriodic = false;
+    else
+        exceptionsArePeriodic = force.getExceptionsUsePeriodicBoundaryConditions();
    rfDielectric = force.getReactionFieldDielectric();
    if (force.getUseDispersionCorrection())
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
    else
        dispersionCoefficient = 0.0;
-    data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME);
+    data.isPeriodic |= (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME);
 }

 double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
@@ -699,6 +703,10 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
    energy += nonbondedEnergy;
    if (includeDirect) {
        ReferenceLJCoulomb14 nonbonded14;
+        if (exceptionsArePeriodic) {
+            Vec3* boxVectors = extractBoxVectors(context);
+            nonbonded14.setPeriodic(boxVectors);
+        }
        bondForce.calculateForce(posData, bonded14ParamArray, forceData, includeEnergy ? &energy : NULL, nonbonded14);
        if (data.isPeriodic && nonbondedMethod != LJPME)
            energy += dispersionCoefficient/(boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2]);
@@ -923,7 +931,7 @@ void CpuCalcCustomNonbondedForceKernel::initialize(const System& system, const C
        force.getInteractionGroupParameters(i, set1, set2);
        interactionGroups.push_back(make_pair(set1, set2));
    }
-    data.isPeriodic = (nonbondedMethod == CutoffPeriodic);
+    data.isPeriodic |= (nonbondedMethod == CutoffPeriodic);
    nonbonded = new CpuCustomNonbondedForce(energyExpression, forceExpression, parameterNames, exclusions, energyParamDerivExpressions, data.threads);
    if (interactionGroups.size() > 0)
        nonbonded->setInteractionGroups(interactionGroups);
@@ -1016,7 +1024,7 @@ void CpuCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCFo
    obc.setSurfaceAreaEnergy((float) force.getSurfaceAreaEnergy());
    if (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff)
        obc.setUseCutoff((float) force.getCutoffDistance());
-    data.isPeriodic = (force.getNonbondedMethod() == GBSAOBCForce::CutoffPeriodic);
+    data.isPeriodic |= (force.getNonbondedMethod() == GBSAOBCForce::CutoffPeriodic);
 }

 double CpuCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
@@ -1190,7 +1198,7 @@ void CpuCalcCustomGBForceKernel::initialize(const System& system, const CustomGB
    ixn = new CpuCustomGBForce(numParticles, exclusions, valueExpressions, valueDerivExpressions, valueGradientExpressions, valueParamDerivExpressions,
        valueNames, valueTypes, energyExpressions, energyDerivExpressions, energyGradientExpressions, energyParamDerivExpressions, energyTypes,
        particleParameterNames, data.threads);
-    data.isPeriodic = (force.getNonbondedMethod() == CustomGBForce::CutoffPeriodic);
+    data.isPeriodic |= (force.getNonbondedMethod() == CustomGBForce::CutoffPeriodic);
 }

 double CpuCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
@@ -1252,7 +1260,7 @@ void CpuCalcCustomManyParticleForceKernel::initialize(const System& system, cons
    ixn = new CpuCustomManyParticleForce(force, data.threads);
    nonbondedMethod = CalcCustomManyParticleForceKernel::NonbondedMethod(force.getNonbondedMethod());
    cutoffDistance = force.getCutoffDistance();
-    data.isPeriodic = (nonbondedMethod == CutoffPeriodic);
+    data.isPeriodic |= (nonbondedMethod == CutoffPeriodic);
 }

 double CpuCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
@@ -1295,7 +1303,7 @@ CpuCalcGayBerneForceKernel::~CpuCalcGayBerneForceKernel() {

 void CpuCalcGayBerneForceKernel::initialize(const System& system, const GayBerneForce& force) {
    ixn = new CpuGayBerneForce(force);
-    data.isPeriodic = (force.getNonbondedMethod() == GayBerneForce::CutoffPeriodic);
+    data.isPeriodic |= (force.getNonbondedMethod() == GayBerneForce::CutoffPeriodic);
    if (force.getNonbondedMethod() != GayBerneForce::NoCutoff) {
        double cutoff = force.getCutoffDistance();
        data.requestNeighborList(cutoff, 0.1*cutoff, true, ixn->getExclusions());
@@ -1353,12 +1361,12 @@ double CpuIntegrateLangevinStepKernel::computeKineticEnergy(ContextImpl& context
    return computeShiftedKineticEnergy(context, masses, 0.5*integrator.getStepSize());
 }

-CpuIntegrateBAOABStepKernel::~CpuIntegrateBAOABStepKernel() {
+CpuIntegrateLangevinMiddleStepKernel::~CpuIntegrateLangevinMiddleStepKernel() {
    if (dynamics)
        delete dynamics;
 }

-void CpuIntegrateBAOABStepKernel::initialize(const System& system, const BAOABLangevinIntegrator& integrator) {
+void CpuIntegrateLangevinMiddleStepKernel::initialize(const System& system, const LangevinMiddleIntegrator& integrator) {
    int numParticles = system.getNumParticles();
    masses.resize(numParticles);
    for (int i = 0; i < numParticles; ++i)
@@ -1366,7 +1374,7 @@ void CpuIntegrateBAOABStepKernel::initialize(const System& system, const BAOABLa
    data.random.initialize(integrator.getRandomNumberSeed(), data.threads.getNumThreads());
 }

-void CpuIntegrateBAOABStepKernel::execute(ContextImpl& context, const BAOABLangevinIntegrator& integrator, bool& forcesAreValid) {
+void CpuIntegrateLangevinMiddleStepKernel::execute(ContextImpl& context, const LangevinMiddleIntegrator& integrator) {
    double temperature = integrator.getTemperature();
    double friction = integrator.getFriction();
    double stepSize = integrator.getStepSize();
@@ -1377,18 +1385,18 @@ void CpuIntegrateBAOABStepKernel::execute(ContextImpl& context, const BAOABLange
        
        if (dynamics)
            delete dynamics;
-        dynamics = new CpuBAOABDynamics(context.getSystem().getNumParticles(), stepSize, friction, temperature, data.threads, data.random);
+        dynamics = new CpuLangevinMiddleDynamics(context.getSystem().getNumParticles(), stepSize, friction, temperature, data.threads, data.random);
        dynamics->setReferenceConstraintAlgorithm(&extractConstraints(context));
        prevTemp = temperature;
        prevFriction = friction;
        prevStepSize = stepSize;
    }
-    dynamics->update(context, posData, velData, masses, forcesAreValid, integrator.getConstraintTolerance());
+    dynamics->update(context, posData, velData, masses, integrator.getConstraintTolerance());
    ReferencePlatform::PlatformData* refData = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
    refData->time += stepSize;
    refData->stepCount++;
 }

-double CpuIntegrateBAOABStepKernel::computeKineticEnergy(ContextImpl& context, const BAOABLangevinIntegrator& integrator) {
+double CpuIntegrateLangevinMiddleStepKernel::computeKineticEnergy(ContextImpl& context, const LangevinMiddleIntegrator& integrator) {
    return computeShiftedKineticEnergy(context, masses, 0.0);
 }
--- a/platforms/cpu/src/CpuBAOABDynamics.cpp
+++ b/platforms/cpu/src/CpuBAOABDynamics.cpp
-
-/* Portions copyright (c) 2006-2019 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2020 Stanford University and Simbios.
 * Authors: Peter Eastman
 * Contributors: 
 *
@@ -24,28 +23,25 @@
 */

 #include "SimTKOpenMMUtilities.h"
-#include "CpuBAOABDynamics.h"
+#include "CpuLangevinMiddleDynamics.h"

 using namespace OpenMM;
 using namespace std;

-CpuBAOABDynamics::CpuBAOABDynamics(int numberOfAtoms, double deltaT, double friction, double temperature, ThreadPool& threads, CpuRandom& random) : 
-           ReferenceBAOABDynamics(numberOfAtoms, deltaT, friction, temperature), threads(threads), random(random) {
+CpuLangevinMiddleDynamics::CpuLangevinMiddleDynamics(int numberOfAtoms, double deltaT, double friction, double temperature, ThreadPool& threads, CpuRandom& random) : 
+           ReferenceLangevinMiddleDynamics(numberOfAtoms, deltaT, friction, temperature), threads(threads), random(random) {
 }

-CpuBAOABDynamics::~CpuBAOABDynamics() {
+CpuLangevinMiddleDynamics::~CpuLangevinMiddleDynamics() {
 }

-void CpuBAOABDynamics::updatePart1(int numberOfAtoms, vector<Vec3>& atomCoordinates, vector<Vec3>& velocities,
-                                   vector<Vec3>& forces, vector<double>& inverseMasses, vector<Vec3>& xPrime) {
+void CpuLangevinMiddleDynamics::updatePart1(int numberOfAtoms, vector<Vec3>& velocities, vector<Vec3>& forces, vector<double>& inverseMasses) {
    // Record the parameters for the threads.
    
    this->numberOfAtoms = numberOfAtoms;
-    this->atomCoordinates = &atomCoordinates[0];
    this->velocities = &velocities[0];
    this->forces = &forces[0];
    this->inverseMasses = &inverseMasses[0];
-    this->xPrime = &xPrime[0];
    
    // Signal the threads to start running and wait for them to finish.
    
@@ -53,7 +49,7 @@ void CpuBAOABDynamics::updatePart1(int numberOfAtoms, vector<Vec3>& atomCoordina
    threads.waitForThreads();
 }

-void CpuBAOABDynamics::updatePart2(int numberOfAtoms, vector<Vec3>& atomCoordinates, vector<Vec3>& velocities,
+void CpuLangevinMiddleDynamics::updatePart2(int numberOfAtoms, vector<Vec3>& atomCoordinates, vector<Vec3>& velocities,
                                   vector<double>& inverseMasses, vector<Vec3>& xPrime) {
    // Record the parameters for the threads.
    
@@ -69,7 +65,7 @@ void CpuBAOABDynamics::updatePart2(int numberOfAtoms, vector<Vec3>& atomCoordina
    threads.waitForThreads();
 }

-void CpuBAOABDynamics::updatePart3(ContextImpl& context, int numberOfAtoms, vector<Vec3>& atomCoordinates, vector<Vec3>& velocities,
+void CpuLangevinMiddleDynamics::updatePart3(ContextImpl& context, int numberOfAtoms, vector<Vec3>& atomCoordinates, vector<Vec3>& velocities,
                                   vector<double>& inverseMasses, vector<Vec3>& xPrime) {
    // Record the parameters for the threads.
    
@@ -83,26 +79,18 @@ void CpuBAOABDynamics::updatePart3(ContextImpl& context, int numberOfAtoms, vect
    
    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate3(threadIndex); });
    threads.waitForThreads();
-    context.calcForcesAndEnergy(true, false);
-    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate4(threadIndex); });
-    threads.waitForThreads();
 }

-void CpuBAOABDynamics::threadUpdate1(int threadIndex) {
-    const double halfdt = 0.5*getDeltaT();
+void CpuLangevinMiddleDynamics::threadUpdate1(int threadIndex) {
    int start = threadIndex*numberOfAtoms/threads.getNumThreads();
    int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();

-    for (int i = start; i < end; i++) {
-        if (inverseMasses[i] != 0.0) {
-            velocities[i] += (halfdt*inverseMasses[i])*forces[i];
-            xPrime[i] = atomCoordinates[i] + velocities[i]*halfdt;
-            oldx[i] = xPrime[i];
-        }
-    }
+    for (int i = start; i < end; i++)
+        if (inverseMasses[i] != 0.0)
+            velocities[i] += (getDeltaT()*inverseMasses[i])*forces[i];
 }

-void CpuBAOABDynamics::threadUpdate2(int threadIndex) {
+void CpuLangevinMiddleDynamics::threadUpdate2(int threadIndex) {
    const double halfdt = 0.5*getDeltaT();
    const double kT = BOLTZ*getTemperature();
    const double friction = getFriction();
@@ -113,34 +101,22 @@ void CpuBAOABDynamics::threadUpdate2(int threadIndex) {

    for (int i = start; i < end; i++) {
        if (inverseMasses[i] != 0.0) {
-            velocities[i] += (xPrime[i]-oldx[i])/halfdt;
+            xPrime[i] = atomCoordinates[i] + velocities[i]*halfdt;
            Vec3 noise(random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex));
            velocities[i] = vscale*velocities[i] + noisescale*sqrt(kT*inverseMasses[i])*noise;
-            atomCoordinates[i] = xPrime[i];
-            xPrime[i] = atomCoordinates[i] + velocities[i]*halfdt;
+            xPrime[i] = xPrime[i] + velocities[i]*halfdt;
            oldx[i] = xPrime[i];
        }
    }
 }

-void CpuBAOABDynamics::threadUpdate3(int threadIndex) {
-    const double halfdt = 0.5*getDeltaT();
+void CpuLangevinMiddleDynamics::threadUpdate3(int threadIndex) {
    int start = threadIndex*numberOfAtoms/threads.getNumThreads();
    int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();

    for (int i = start; i < end; ++i)
        if (inverseMasses[i] != 0.0) {
-            velocities[i] += (xPrime[i]-oldx[i])/halfdt;
+            velocities[i] += (xPrime[i]-oldx[i])/getDeltaT();
            atomCoordinates[i] = xPrime[i];
        }
 }
-
-void CpuBAOABDynamics::threadUpdate4(int threadIndex) {
-    const double halfdt = 0.5*getDeltaT();
-    int start = threadIndex*numberOfAtoms/threads.getNumThreads();
-    int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();
-
-    for (int i = start; i < end; ++i)
-        if (inverseMasses[i] != 0.0)
-            velocities[i] += (halfdt*inverseMasses[i])*forces[i];
-}
--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -74,7 +74,7 @@ CpuPlatform::CpuPlatform() {
    registerKernelFactory(CalcCustomGBForceKernel::Name(), factory);
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
-    registerKernelFactory(IntegrateBAOABStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateLangevinMiddleStepKernel::Name(), factory);
    platformProperties.push_back(CpuThreads());
    platformProperties.push_back(CpuDeterministicForces());
    int threads = getNumProcessors();

--- a/platforms/cpu/tests/TestCpuBAOABLangevinIntegrator.cpp
+++ b/platforms/cpu/tests/TestCpuBAOABLangevinIntegrator.cpp
@@ -30,7 +30,7 @@
 * -------------------------------------------------------------------------- */

 #include "CpuTests.h"
-#include "TestBAOABLangevinIntegrator.h"
+#include "TestLangevinMiddleIntegrator.h"

 void runPlatformTests() {
 }
--- a/platforms/cuda/CMakeLists.txt
+++ b/platforms/cuda/CMakeLists.txt
@@ -19,7 +19,7 @@ endif(BUILD_TESTING AND OPENMM_BUILD_CUDA_TESTS)

 # The source is organized into subdirectories, but we handle them all from
 # this CMakeLists file rather than letting CMake visit them as SUBDIRS.
-SET(OPENMM_SOURCE_SUBDIRS .)
+SET(OPENMM_SOURCE_SUBDIRS . ../common)


 # Collect up information about the version of the OpenMM library we're building
@@ -76,12 +76,14 @@ INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)

 # Set variables needed for encoding kernel sources into a C++ class

-SET(CUDA_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
-SET(CUDA_SOURCE_CLASS CudaKernelSources)
-SET(CUDA_KERNELS_CPP ${CMAKE_CURRENT_BINARY_DIR}/src/${CUDA_SOURCE_CLASS}.cpp)
-SET(CUDA_KERNELS_H ${CMAKE_CURRENT_BINARY_DIR}/src/${CUDA_SOURCE_CLASS}.h)
-SET(SOURCE_FILES ${SOURCE_FILES} ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H})
+SET(KERNEL_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
+SET(KERNEL_SOURCE_CLASS CudaKernelSources)
+SET(KERNELS_CPP ${CMAKE_CURRENT_BINARY_DIR}/src/${KERNEL_SOURCE_CLASS}.cpp)
+SET(KERNELS_H ${CMAKE_CURRENT_BINARY_DIR}/src/${KERNEL_SOURCE_CLASS}.h)
+SET(COMMON_KERNELS_CPP ${CMAKE_CURRENT_BINARY_DIR}/../common/src/CommonKernelSources.cpp)
+SET(SOURCE_FILES ${SOURCE_FILES} ${KERNELS_CPP} ${KERNELS_H} ${COMMON_KERNELS_CPP})
 INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/src)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/../common/src)

 # Install headers


--- a/platforms/cuda/EncodeCUDAFiles.cmake
+++ b/platforms/cuda/EncodeCUDAFiles.cmake
-FILE(GLOB CUDA_KERNELS ${CUDA_SOURCE_DIR}/kernels/*.cu)
-SET(CUDA_FILE_DECLARATIONS)
-SET(CUDA_FILE_DEFINITIONS)
-CONFIGURE_FILE(${CUDA_SOURCE_DIR}/${CUDA_SOURCE_CLASS}.cpp.in ${CUDA_KERNELS_CPP})
-FOREACH(file ${CUDA_KERNELS})
-    # Load the file contents and process it.
-    FILE(STRINGS ${file} file_content NEWLINE_CONSUME)
-    # Replace all backslashes by double backslashes as they are being put in a C string.
-    # Be careful not to replace the backslash before a semicolon as that is the CMAKE
-    # internal escaping of a semicolon to prevent it from acting as a list seperator.
-    STRING(REGEX REPLACE "\\\\([^;])" "\\\\\\\\\\1" file_content "${file_content}")
-    # Escape double quotes as being put in a C string.
-    STRING(REPLACE "\"" "\\\"" file_content "${file_content}")
-    # Split in separate C strings for each line.
-    STRING(REPLACE "\n" "\\n\"\n\"" file_content "${file_content}")
-
-    # Determine a name for the variable that will contain this file's contents
-    FILE(RELATIVE_PATH filename ${CUDA_SOURCE_DIR}/kernels ${file})
-    STRING(LENGTH ${filename} filename_length)
-    MATH(EXPR filename_length ${filename_length}-3)
-    STRING(SUBSTRING ${filename} 0 ${filename_length} variable_name)
-
-    # Record the variable declaration and definition.
-    SET(CUDA_FILE_DECLARATIONS ${CUDA_FILE_DECLARATIONS}static\ const\ std::string\ ${variable_name};\n)
-    FILE(APPEND ${CUDA_KERNELS_CPP} const\ string\ ${CUDA_SOURCE_CLASS}::${variable_name}\ =\ \"${file_content}\"\;\n)
-ENDFOREACH(file)
-CONFIGURE_FILE(${CUDA_SOURCE_DIR}/${CUDA_SOURCE_CLASS}.h.in ${CUDA_KERNELS_H})
--- a/platforms/cuda/include/CudaArray.h
+++ b/platforms/cuda/include/CudaArray.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -28,7 +28,8 @@
 * -------------------------------------------------------------------------- */

 #include "openmm/OpenMMException.h"
-#include "windowsExportCuda.h"
+#include "openmm/common/windowsExportCommon.h"
+#include "openmm/common/ArrayInterface.h"
 #include <cuda.h>
 #include <iostream>
 #include <sstream>
@@ -43,7 +44,7 @@ class CudaContext;
 * for working with it and for copying data to and from device memory.
 */

-class OPENMM_EXPORT_CUDA CudaArray {
+class OPENMM_EXPORT_COMMON CudaArray : public ArrayInterface {
 public:
    /**
     * Create a CudaArray object.  The object is allocated on the heap with the "new" operator.
@@ -80,7 +81,7 @@ public:
     * @param elementSize       the size of each element in bytes
     * @param name              the name of the array
     */
-    void initialize(CudaContext& context, int size, int elementSize, const std::string& name);
+    void initialize(ComputeContext& context, int size, int elementSize, const std::string& name);
    /**
     * Initialize this object.  The template argument is the data type of each array element.
     *
@@ -89,7 +90,7 @@ public:
     * @param name              the name of the array
     */
    template <class T>
-    void initialize(CudaContext& context, int size, const std::string& name) {
+    void initialize(ComputeContext& context, int size, const std::string& name) {
        initialize(context, size, sizeof(T), name);
    }
    /**
@@ -120,6 +121,10 @@ public:
    const std::string& getName() const {
        return name;
    }
+    /**
+     * Get the context this array belongs to.
+     */
+    ComputeContext& getContext();
    /**
     * Get a pointer to the device memory.
     */
@@ -130,41 +135,15 @@ public:
     * Copy the values in a vector to the device memory.
     */
    template <class T>
-    void upload(const std::vector<T>& data, bool convert = true) {
-        if (convert && data.size() == size && sizeof(T) != elementSize) {
-            if (sizeof(T) == 2*elementSize) {
-                // Convert values from double to single precision.
-                const double* d = reinterpret_cast<const double*>(&data[0]);
-                std::vector<float> v(elementSize*size/sizeof(float));
-                for (int i = 0; i < v.size(); i++)
-                    v[i] = (float) d[i];
-                upload(&v[0], true);
-                return;
-            }
-            if (2*sizeof(T) == elementSize) {
-                // Convert values from single to double precision.
-                const float* d = reinterpret_cast<const float*>(&data[0]);
-                std::vector<double> v(elementSize*size/sizeof(double));
-                for (int i = 0; i < v.size(); i++)
-                    v[i] = (double) d[i];
-                upload(&v[0], true);
-                return;
-            }
-        }
-        if (sizeof(T) != elementSize || data.size() != size)
-            throw OpenMMException("Error uploading array "+name+": The specified vector does not match the size of the array");
-        upload(&data[0], true);
+    void upload(const std::vector<T>& data, bool convert=false) {
+        ArrayInterface::upload(data, convert);
    }
    /**
     * Copy the values in the Buffer to a vector.
     */
    template <class T>
    void download(std::vector<T>& data) const {
-        if (sizeof(T) != elementSize)
-            throw OpenMMException("Error downloading array "+name+": The specified vector has the wrong element size");
-        if (data.size() != size)
-            data.resize(size);
-        download(&data[0], true);
+        ArrayInterface::download(data);
    }
    /**
     * Copy the values in an array to the device memory.
@@ -173,7 +152,7 @@ public:
     * @param blocking if true, this call will block until the transfer is complete.  If false,
     *                 the source array  must be in page-locked memory.
     */
-    void upload(const void* data, bool blocking = true);
+    void upload(const void* data, bool blocking=true);
    /**
     * Copy the values in the device memory to an array.
     * 
@@ -181,13 +160,13 @@ public:
     * @param blocking if true, this call will block until the transfer is complete.  If false,
     *                 the destination array must be in page-locked memory.
     */
-    void download(void* data, bool blocking = true) const;
+    void download(void* data, bool blocking=true) const;
    /**
     * Copy the values in the device memory to a second array.
     * 
     * @param dest     the destination array to copy to
     */
-    void copyTo(CudaArray& dest) const;
+    void copyTo(ArrayInterface& dest) const;
 private:
    CudaContext* context;
    CUdeviceptr pointer;

--- a/platforms/cuda/include/CudaBondedUtilities.h
+++ b/platforms/cuda/include/CudaBondedUtilities.h
@@ -28,13 +28,15 @@
 * -------------------------------------------------------------------------- */

 #include "CudaArray.h"
-#include "CudaContext.h"
 #include "openmm/System.h"
+#include "openmm/common/BondedUtilities.h"
 #include <string>
 #include <vector>

 namespace OpenMM {

+class CudaContext;
+    
 /**
 * This class provides a generic mechanism for evaluating bonded interactions.  You write only
 * the source code needed to compute one interaction, and this class takes care of creating
@@ -78,7 +80,7 @@ namespace OpenMM {
 * from your interaction code.
 */

-class OPENMM_EXPORT_CUDA CudaBondedUtilities {
+class OPENMM_EXPORT_COMMON CudaBondedUtilities : public BondedUtilities {
 public:
    CudaBondedUtilities(CudaContext& context);
    /**
@@ -99,6 +101,15 @@ public:
     * refer to it by this name.
     */
    std::string addArgument(CUdeviceptr data, const std::string& type);
+    /**
+     * Add an argument that should be passed to the interaction kernel.
+     * 
+     * @param data    the array containing the data to pass
+     * @param type    the data type contained in the memory (e.g. "float4")
+     * @return the name that will be used for the argument.  Any code you pass to addInteraction() should
+     * refer to it by this name.
+     */
+    std::string addArgument(ArrayInterface& data, const std::string& type);
    /**
     * Register that the interaction kernel will be computing the derivative of the potential energy
     * with respect to a parameter.

--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -28,7 +28,6 @@
 * -------------------------------------------------------------------------- */

 #include <map>
-#include <queue>
 #include <string>
 #include <utility>
 #define __CL_ENABLE_EXCEPTIONS
@@ -40,22 +39,21 @@
 #include <cuda.h>
 #include <builtin_types.h>
 #include <vector_functions.h>
-#include "windowsExportCuda.h"
+#include "openmm/common/windowsExportCommon.h"
 #include "CudaArray.h"
+#include "CudaBondedUtilities.h"
+#include "CudaExpressionUtilities.h"
+#include "CudaIntegrationUtilities.h"
+#include "CudaNonbondedUtilities.h"
 #include "CudaPlatform.h"
+#include "openmm/OpenMMException.h"
+#include "openmm/common/ComputeContext.h"
 #include "openmm/Kernel.h"

 typedef unsigned int tileflags;

 namespace OpenMM {

-class CudaForceInfo;
-class CudaExpressionUtilities;
-class CudaIntegrationUtilities;
-class CudaBondedUtilities;
-class CudaNonbondedUtilities;
-class System;
-
 /**
 * This class contains the information associated with a Context by the CUDA Platform.  Each CudaContext is
 * specific to a particular device, and manages data structures and kernels for that device.  When running a simulation
@@ -67,7 +65,7 @@ class System;
 * thread is not used and calculations are performed on the main application thread.
 */

-class OPENMM_EXPORT_CUDA CudaContext {
+class OPENMM_EXPORT_COMMON CudaContext : public ComputeContext {
 public:
    class WorkTask;
    class WorkThread;
@@ -85,14 +83,6 @@ public:
     * have been initialized.
     */
    void initialize();
-    /**
-     * Add a CudaForceInfo to this context.
-     */
-    void addForce(CudaForceInfo* force);
-    /**
-     * Get all CudaForceInfos that have been added to this context.
-     */
-    std::vector<CudaForceInfo*>& getForceInfos();
    /**
     * Get the CUcontext associated with this object.
     */
@@ -134,6 +124,14 @@ public:
    CudaPlatform::PlatformData& getPlatformData() {
        return platformData;
    }
+    /**
+     * Get the number of contexts being used for the current simulation.
+     * This is relevant when a simulation is parallelized across multiple devices.  In that case,
+     * one CudaContext is created for each device.
+     */
+    int getNumContexts() const {
+        return platformData.contexts.size();
+    }
    /**
     * Get the index of this context in the list stored in the PlatformData.
     */
@@ -152,6 +150,28 @@ public:
     * Reset the context to using the default stream for execution.
     */
    void restoreDefaultStream();
+    /**
+     * Construct an uninitialized array of the appropriate class for this platform.  The returned
+     * value should be created on the heap with the "new" operator.
+     */
+    CudaArray* createArray();
+    /**
+     * Construct a ComputeEvent object of the appropriate class for this platform.
+     */
+    ComputeEvent createEvent();
+    /**
+     * Compile source code to create a ComputeProgram.
+     *
+     * @param source             the source code of the program
+     * @param defines            a set of preprocessor definitions (name, value) to define when compiling the program
+     */
+    ComputeProgram compileProgram(const std::string source, const std::map<std::string, std::string>& defines=std::map<std::string, std::string>());
+    /**
+     * Convert an array to a CudaArray.  If the argument is already a CudaArray, this simply casts it.
+     * If the argument is a ComputeArray that wraps a CudaArray, this returns the wrapped array.  For any
+     * other argument, this throws an exception.
+     */
+    CudaArray& unwrap(ArrayInterface& array) const;
    /**
     * Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
     */
@@ -176,6 +196,20 @@ public:
    CudaArray& getForce() {
        return force;
    }
+    /**
+     * Get the array which contains a contribution to each force represented as 64 bit fixed point.
+     * This is a synonym for getForce().  It exists to satisfy the ComputeContext interface.
+     */
+    CudaArray& getLongForceBuffer() {
+        return force;
+    }
+    /**
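+     * All CUDA devices support 64 bit atomics, so the CUDA platform has no floating point force buffers and this method always throws an OpenMMException.
+     * @return never returns normally; the call always throws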
+     */
+    ArrayInterface& getForceBuffers() {
+        throw OpenMMException("CUDA platform does not use floating point force buffers");
+    }
    /**
     * Get the array which contains the buffer in which energy is computed.
     */
@@ -196,10 +230,14 @@ public:
        return pinnedBuffer;
    }
    /**
-     * Get the host-side vector which contains the index of each atom.
+     * Get a shared ThreadPool that code can use to parallelize operations.
+     * 
+     * Because this object is freely available to all code, care is needed to avoid conflicts.  Only use it
+     * from the main thread, and make sure all operations are complete before you invoke any other code that
+     * might make use of it.
     */
-    const std::vector<int>& getAtomIndex() const {
-        return atomIndex;
+    ThreadPool& getThreadPool() {
+        return getPlatformData().threads;
    }
    /**
     * Get the array which contains the index of each atom.
@@ -207,20 +245,6 @@ public:
    CudaArray& getAtomIndexArray() {
        return atomIndexDevice;
    }
-    /**
-     * Get the number of cells by which the positions are offset.
-     */
-    std::vector<int4>& getPosCellOffsets() {
-        return posCellOffsets;
-    }
-    /**
-     * Replace all occurrences of a list of substrings.
-     *
-     * @param input   a string to process
-     * @param replacements a set of strings that should be replaced with new strings wherever they appear in the input string
-     * @return a new string produced by performing the replacements
-     */
-    std::string replaceStrings(const std::string& input, const std::map<std::string, std::string>& replacements) const;
    /**
     * Create a CUDA module from source code.
     *
@@ -266,7 +290,7 @@ public:
    /**
     * Set all elements of an array to 0.
     */
-    void clearBuffer(CudaArray& array);
+    void clearBuffer(ArrayInterface& array);
    /**
     * Set all elements of an array to 0.
     *
@@ -277,7 +301,7 @@ public:
    /**
     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
     */
-    void addAutoclearBuffer(CudaArray& array);
+    void addAutoclearBuffer(ArrayInterface& array);
    /**
     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
     *
@@ -294,89 +318,47 @@ public:
     */
    double reduceEnergy();
    /**
-     * Get the current simulation time.
-     */
-    double getTime() {
-        return time;
-    }
-    /**
-     * Set the current simulation time.
-     */
-    void setTime(double t) {
-        time = t;
-    }
-    /**
-     * Get the number of integration steps that have been taken.
-     */
-    int getStepCount() {
-        return stepCount;
-    }
-    /**
-     * Set the number of integration steps that have been taken.
-     */
-    void setStepCount(int steps) {
-        stepCount = steps;
-    }
-    /**
-     * Get the number of times forces or energy has been computed.
-     */
-    int getComputeForceCount() {
-        return computeForceCount;
-    }
-    /**
-     * Set the number of times forces or energy has been computed.
-     */
-    void setComputeForceCount(int count) {
-        computeForceCount = count;
-    }
-    /**
-     * Get the number of time steps since the atoms were reordered.
-     */
-    int getStepsSinceReorder() const {
-        return stepsSinceReorder;
-    }
-    /**
-     * Set the number of time steps since the atoms were reordered.
+     * Get the number of blocks of TileSize atoms.
     */
-    void setStepsSinceReorder(int steps) {
-        stepsSinceReorder = steps;
+    int getNumAtomBlocks() const {
+        return numAtomBlocks;
    }
    /**
-     * Get the flag that marks whether the current force evaluation is valid.
+     * Get the standard number of thread blocks to use when executing kernels.
     */
-    bool getForcesValid() const {
-        return forcesValid;
+    int getNumThreadBlocks() const {
+        return numThreadBlocks;
    }
    /**
-     * Get the flag that marks whether the current force evaluation is valid.
+     * Get the maximum number of threads in a thread block supported by this device.
     */
-    void setForcesValid(bool valid) {
-        forcesValid = valid;
+    int getMaxThreadBlockSize() const {
+        return 1024;
    }
    /**
-     * Get the number of atoms.
+     * Get whether the device being used is a CPU.  In some cases, different algorithms
+     * may be more efficient on CPUs and GPUs.
     */
-    int getNumAtoms() const {
-        return numAtoms;
+    bool getIsCPU() const {
+        return false;
    }
    /**
-     * Get the number of atoms, rounded up to a multiple of TileSize.  This is the actual size of
-     * most arrays with one element per atom.
+     * Get the SIMD width of the device being used.
     */
-    int getPaddedNumAtoms() const {
-        return paddedNumAtoms;
+    int getSIMDWidth() const {
+        return 32;
    }
    /**
-     * Get the number of blocks of TileSize atoms.
+     * Get whether the device being used supports 64 bit atomic operations on global memory.
     */
-    int getNumAtomBlocks() const {
-        return numAtomBlocks;
+    bool getSupports64BitGlobalAtomics() const {
+        return true;
    }
    /**
-     * Get the standard number of thread blocks to use when executing kernels.
+     * Get whether the device being used supports double precision math.
     */
-    int getNumThreadBlocks() const {
-        return numThreadBlocks;
+    bool getSupportsDoublePrecision() const {
+        return true;
    }
    /**
     * Get whether double precision is being used.
@@ -396,15 +378,6 @@ public:
    bool getBoxIsTriclinic() const {
        return boxIsTriclinic;
    }
-    /**
-     * Convert a number to a string in a format suitable for including in a kernel.
-     * This takes into account whether the context uses single or double precision.
-     */
-    std::string doubleToString(double value) const;
-    /**
-     * Convert a number to a string in a format suitable for including in a kernel.
-     */
-    std::string intToString(int value) const;
    /**
     * Convert a CUDA result code to the corresponding string description.
     */
@@ -503,6 +476,11 @@ public:
    CudaNonbondedUtilities& getNonbondedUtilities() {
        return *nonbonded;
    }
+    /**
+     * This should be called by the Integrator from its own initialize() method.
+     * It ensures all contexts are fully initialized.
+     */
+    void initializeContexts();
    /**
     * Set the particle charges.  These are packed into the fourth element of the posq array.
     */
@@ -512,62 +490,6 @@ public:
     * do that, this returns true the first time it is called, and false on all subsequent calls.
     */
    bool requestPosqCharges();
-    /**
-     * Get the thread used by this context for executing parallel computations.
-     */
-    WorkThread& getWorkThread() {
-        return *thread;
-    }
-    /**
-     * Get whether atoms were reordered during the most recent force/energy computation.
-     */
-    bool getAtomsWereReordered() const {
-        return atomsWereReordered;
-    }
-    /**
-     * Set whether atoms were reordered during the most recent force/energy computation.
-     */
-    void setAtomsWereReordered(bool wereReordered) {
-        atomsWereReordered = wereReordered;
-    }
-    /**
-     * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
-     * together in the arrays.
-     */
-    void reorderAtoms();
-    /**
-     * Add a listener that should be called whenever atoms get reordered.  The CudaContext
-     * assumes ownership of the object, and deletes it when the context itself is deleted.
-     */
-    void addReorderListener(ReorderListener* listener);
-    /**
-     * Get the list of ReorderListeners.
-     */
-    std::vector<ReorderListener*>& getReorderListeners() {
-        return reorderListeners;
-    }
-    /**
-     * Add a pre-computation that should be called at the very start of force and energy evaluations.
-     * The CudaContext assumes ownership of the object, and deletes it when the context itself is deleted.
-     */
-    void addPreComputation(ForcePreComputation* computation);
-    /**
-     * Get the list of ForcePreComputations.
-     */
-    std::vector<ForcePreComputation*>& getPreComputations() {
-        return preComputations;
-    }
-    /**
-     * Add a post-computation that should be called at the very end of force and energy evaluations.
-     * The CudaContext assumes ownership of the object, and deletes it when the context itself is deleted.
-     */
-    void addPostComputation(ForcePostComputation* computation);
-    /**
-     * Get the list of ForcePostComputations.
-     */
-    std::vector<ForcePostComputation*>& getPostComputations() {
-        return postComputations;
-    }
    /**
     * Get the names of all parameters with respect to which energy derivatives are computed.
     */
@@ -590,52 +512,25 @@ public:
     */
    void addEnergyParameterDerivative(const std::string& param);
    /**
-     * Mark that the current molecule definitions (and hence the atom order) may be invalid.
-     * This should be called whenever force field parameters change.  It will cause the definitions
-     * and order to be revalidated.
-     */
-    void invalidateMolecules();
-    /**
-     * Mark that the current molecule definitions from one particular force (and hence the atom order)
-     * may be invalid.  This should be called whenever force field parameters change.  It will cause the
-     * definitions and order to be revalidated.
+     * Wait until all work that has been queued (kernel executions, asynchronous data transfers, etc.)
+     * has been submitted to the device.  This does not mean it has necessarily been completed.
+     * Calling this periodically may improve the responsiveness of the computer's GUI, but at the
+     * expense of reduced simulation performance.
     */
-    bool invalidateMolecules(CudaForceInfo* force);
+    void flushQueue();
 private:
    /**
     * Compute a sorted list of device indices in decreasing order of desirability
     */
    std::vector<int> getDevicePrecedence();
-
-    struct Molecule;
-    struct MoleculeGroup;
-    class VirtualSiteInfo;
-    void findMoleculeGroups();
-    /**
-     * Ensure that all molecules marked as "identical" really are identical.  This should be
-     * called whenever force field parameters change.  If necessary, it will rebuild the list
-     * of molecules and resort the atoms.
-     */
-    void validateMolecules();
-    /**
-     * This is the internal implementation of reorderAtoms(), templatized by the numerical precision in use.
-     */
-    template <class Real, class Real4, class Mixed, class Mixed4>
-    void reorderAtomsImpl();
    static bool hasInitializedCuda;
-    const System& system;
-    double time, computeCapability;
+    double computeCapability;
    CudaPlatform::PlatformData& platformData;
    int deviceIndex;
    int contextIndex;
-    int stepCount;
-    int computeForceCount;
-    int stepsSinceReorder;
-    int numAtoms;
-    int paddedNumAtoms;
    int numAtomBlocks;
    int numThreadBlocks;
-    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered, boxIsTriclinic, hasCompilerKernel, isNvccAvailable, forcesValid, hasAssignedPosqCharges;
+    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, boxIsTriclinic, hasCompilerKernel, isNvccAvailable, hasAssignedPosqCharges;
    bool isLinkedContext;
    std::string compiler, tempDir, cacheDir, gpuArchitecture;
    float4 periodicBoxVecXFloat, periodicBoxVecYFloat, periodicBoxVecZFloat, periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
@@ -653,10 +548,6 @@ private:
    CUfunction clearSixBuffersKernel;
    CUfunction reduceEnergyKernel;
    CUfunction setChargesKernel;
-    std::vector<CudaForceInfo*> forces;
-    std::vector<Molecule> molecules;
-    std::vector<MoleculeGroup> moleculeGroups;
-    std::vector<int4> posCellOffsets;
    void* pinnedBuffer;
    CudaArray posq;
    CudaArray posqCorrection;
@@ -669,119 +560,37 @@ private:
    CudaArray chargeBuffer;
    std::vector<std::string> energyParamDerivNames;
    std::map<std::string, double> energyParamDerivWorkspace;
-    std::vector<int> atomIndex;
    std::vector<CUdeviceptr> autoclearBuffers;
    std::vector<int> autoclearBufferSizes;
-    std::vector<ReorderListener*> reorderListeners;
-    std::vector<ForcePreComputation*> preComputations;
-    std::vector<ForcePostComputation*> postComputations;
    CudaIntegrationUtilities* integration;
    CudaExpressionUtilities* expression;
    CudaBondedUtilities* bonded;
    CudaNonbondedUtilities* nonbonded;
-    WorkThread* thread;
    Kernel compilerKernel;
 };

-struct CudaContext::Molecule {
-    std::vector<int> atoms;
-    std::vector<int> constraints;
-    std::vector<std::vector<int> > groups;
-};
-
-struct CudaContext::MoleculeGroup {
-    std::vector<int> atoms;
-    std::vector<int> instances;
-    std::vector<int> offsets;
-};
-
 /**
- * This abstract class defines a task to be executed on the worker thread.
+ * This class exists only for backward compatibility.  Use ComputeContext::WorkTask instead.
 */
-class OPENMM_EXPORT_CUDA CudaContext::WorkTask {
-public:
-    virtual void execute() = 0;
-    virtual ~WorkTask() {
-    }
-};
-
-class OPENMM_EXPORT_CUDA CudaContext::WorkThread {
-public:
-    struct ThreadData;
-    WorkThread();
-    ~WorkThread();
-    /**
-     * Request that a task be executed on the worker thread.  The argument should have been allocated on the
-     * heap with the "new" operator.  After its execute() method finishes, the object will be deleted automatically.
-     */
-    void addTask(CudaContext::WorkTask* task);
-    /**
-     * Get whether the worker thread is idle, waiting for a task to be added.
-     */
-    bool isWaiting();
-    /**
-     * Get whether the worker thread has exited.
-     */
-    bool isFinished();
-    /**
-     * Block until all tasks have finished executing and the worker thread is idle.
-     */
-    void flush();
-private:
-    std::queue<CudaContext::WorkTask*> tasks;
-    bool waiting, finished;
-    pthread_mutex_t queueLock;
-    pthread_cond_t waitForTaskCondition, queueEmptyCondition;
-    pthread_t thread;
+class OPENMM_EXPORT_COMMON CudaContext::WorkTask : public ComputeContext::WorkTask {
 };

 /**
- * This abstract class defines a function to be executed whenever atoms get reordered.
- * Objects that need to know when reordering happens should create a ReorderListener
- * and register it by calling addReorderListener().
+ * This class exists only for backward compatibility.  Use ComputeContext::ReorderListener instead.
 */
-class OPENMM_EXPORT_CUDA CudaContext::ReorderListener {
-public:
-    virtual void execute() = 0;
-    virtual ~ReorderListener() {
-    }
+class OPENMM_EXPORT_COMMON CudaContext::ReorderListener : public ComputeContext::ReorderListener {
 };

 /**
- * This abstract class defines a function to be executed at the very beginning of force and
- * energy evaluation, before any other calculation has been done.  It is useful for operations
- * that need to be performed at a nonstandard point in the process.  After creating a
- * ForcePreComputation, register it by calling addForcePreComputation().
+ * This class exists only for backward compatibility.  Use ComputeContext::ForcePreComputation instead.
 */
-class OPENMM_EXPORT_CUDA CudaContext::ForcePreComputation {
-public:
-    virtual ~ForcePreComputation() {
-    }
-    /**
-     * @param includeForce  true if forces should be computed
-     * @param includeEnergy true if potential energy should be computed
-     * @param groups        a set of bit flags for which force groups to include
-     */
-    virtual void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) = 0;
+class OPENMM_EXPORT_COMMON CudaContext::ForcePreComputation : public ComputeContext::ForcePreComputation {
 };

 /**
- * This abstract class defines a function to be executed at the very end of force and
- * energy evaluation, after all other calculations have been done.  It is useful for operations
- * that need to be performed at a nonstandard point in the process.  After creating a
- * ForcePostComputation, register it by calling addForcePostComputation().
+ * This class exists only for backward compatibility.  Use ComputeContext::ForcePostComputation instead.
 */
-class OPENMM_EXPORT_CUDA CudaContext::ForcePostComputation {
-public:
-    virtual ~ForcePostComputation() {
-    }
-    /**
-     * @param includeForce  true if forces should be computed
-     * @param includeEnergy true if potential energy should be computed
-     * @param groups        a set of bit flags for which force groups to include
-     * @return an optional contribution to add to the potential energy.
-     */
-    virtual double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) = 0;
+class OPENMM_EXPORT_COMMON CudaContext::ForcePostComputation : public ComputeContext::ForcePostComputation {
 };

 } // namespace OpenMM

--- a/platforms/cuda/include/CudaEvent.h
+++ b/platforms/cuda/include/CudaEvent.h
+#ifndef OPENMM_CUDAEVENT_H_
+#define OPENMM_CUDAEVENT_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaContext.h"
+#include "openmm/common/ComputeEvent.h"
+
+namespace OpenMM {
+
+/**
+ * This is the CUDA implementation of the ComputeEventImpl interface.
+ */
+
+class CudaEvent : public ComputeEventImpl {
+public:
+    CudaEvent(CudaContext& context);
+    ~CudaEvent();
+    /**
+     * Place the event into the device's execution queue.
+     */
+    void enqueue();
+    /**
+     * Block until all operations started before the call to enqueue() have completed.
+     */
+    void wait();
+private:
+    CudaContext& context;
+    CUevent event;
+    bool eventCreated;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CUDAEVENT_H_*/
--- a/platforms/cuda/include/CudaExpressionUtilities.h
+++ b/platforms/cuda/include/CudaExpressionUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -27,106 +27,20 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-#include "CudaContext.h"
-#include "openmm/TabulatedFunction.h"
-#include "lepton/CustomFunction.h"
-#include "lepton/ExpressionTreeNode.h"
-#include "lepton/ParsedExpression.h"
-#include <map>
-#include <sstream>
-#include <string>
-#include <utility>
+#include "openmm/common/ExpressionUtilities.h"
+#include "openmm/common/windowsExportCommon.h"

 namespace OpenMM {

 /**
- * This class is used by various classes to generate CUDA source code implementing
- * user defined mathematical expressions.
+ * This class exists only for backward compatibility.  It adds no features beyond
+ * the base ExpressionUtilities class.
 */

-class OPENMM_EXPORT_CUDA CudaExpressionUtilities {
+class OPENMM_EXPORT_COMMON CudaExpressionUtilities : public ExpressionUtilities {
 public:
-    CudaExpressionUtilities(CudaContext& context);
-    /**
-     * Generate the source code for calculating a set of expressions.
-     *
-     * @param expressions    the expressions to generate code for (keys are the variables to store the output values in)
-     * @param variables      defines the source code to generate for each variable that may appear in the expressions.  Keys are
-     *                       variable names, and the values are the code to generate for them.
-     * @param functions      the tabulated functions that may appear in the expressions
-     * @param functionNames  defines the variable name for each tabulated function that may appear in the expressions
-     * @param prefix         a prefix to put in front of temporary variables
-     * @param tempType       the type of value to use for temporary variables (defaults to "real")
-     */
-    std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables,
-            const std::vector<const TabulatedFunction*>& functions, const std::vector<std::pair<std::string, std::string> >& functionNames,
-            const std::string& prefix, const std::string& tempType="real");
-    /**
-     * Generate the source code for calculating a set of expressions.
-     *
-     * @param expressions    the expressions to generate code for (keys are the variables to store the output values in)
-     * @param variables      defines the source code to generate for each variable or precomputed sub-expression that may appear in the expressions.
-     *                       Each entry is an ExpressionTreeNode, and the code to generate wherever an identical node appears.
-     * @param functions      the tabulated functions that may appear in the expressions
-     * @param functionNames  defines the variable name for each tabulated function that may appear in the expressions
-     * @param prefix         a prefix to put in front of temporary variables
-     * @param tempType       the type of value to use for temporary variables (defaults to "real")
-     */
-    std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
-            const std::vector<const TabulatedFunction*>& functions, const std::vector<std::pair<std::string, std::string> >& functionNames,
-            const std::string& prefix, const std::string& tempType="real");
-    /**
-     * Calculate the spline coefficients for a tabulated function that appears in expressions.
-     *
-     * @param function   the function for which to compute coefficients
-     * @param width      on output, the number of floats used for each value
-     * @return the spline coefficients
-     */
-    std::vector<float> computeFunctionCoefficients(const TabulatedFunction& function, int& width);
-    /**
-     * Get a Lepton::CustomFunction that can be used to represent a TabulatedFunction when parsing expressions.
-     * 
-     * @param function   the function for which to get a placeholder
-     */
-    Lepton::CustomFunction* getFunctionPlaceholder(const TabulatedFunction& function);
-    /**
-     * Get a Lepton::CustomFunction that can be used to represent the periodicdistance() function when parsing expressions.
-     */
-    Lepton::CustomFunction* getPeriodicDistancePlaceholder();
-private:
-    class FunctionPlaceholder : public Lepton::CustomFunction {
-        public:
-            FunctionPlaceholder(int numArgs) : numArgs(numArgs) {
-            }
-            int getNumArguments() const {
-                return numArgs;
-            }
-            double evaluate(const double* arguments) const {
-                return 0.0;
-            }
-            double evaluateDerivative(const double* arguments, const int* derivOrder) const {
-                return 0.0;
-            }
-            CustomFunction* clone() const {
-                return new FunctionPlaceholder(numArgs);
-            }
-        private:
-            int numArgs;
-    };
-    void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node,
-            std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps,
-            const std::vector<const TabulatedFunction*>& functions, const std::vector<std::pair<std::string, std::string> >& functionNames,
-            const std::string& prefix, const std::vector<std::vector<double> >& functionParams, const std::vector<Lepton::ParsedExpression>& allExpressions, const std::string& tempType);
-    std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
-    void findRelatedCustomFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
-            std::vector<const Lepton::ExpressionTreeNode*>& nodes);
-    void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
-            std::map<int, const Lepton::ExpressionTreeNode*>& powers);
-    void callFunction(std::stringstream& out, std::string singleFn, std::string doubleFn, const std::string& arg, const std::string& tempType);
-    void callFunction2(std::stringstream& out, std::string singleFn, std::string doubleFn, const std::string& arg1, const std::string& arg2, const std::string& tempType);
-    std::vector<std::vector<double> > computeFunctionParameters(const std::vector<const TabulatedFunction*>& functions);
-    CudaContext& context;
-    FunctionPlaceholder fp1, fp2, fp3, periodicDistance;
+    CudaExpressionUtilities(ComputeContext& context) : ExpressionUtilities(context) {
+    }
 };

 } // namespace OpenMM

--- a/platforms/cuda/include/CudaFFT3D.h
+++ b/platforms/cuda/include/CudaFFT3D.h
@@ -52,7 +52,7 @@ namespace OpenMM {
 * multiply every value of the original data set by the total number of data points.
 */

-class OPENMM_EXPORT_CUDA CudaFFT3D {
+class OPENMM_EXPORT_COMMON CudaFFT3D {
 public:
    /**
     * Create an CudaFFT3D object for performing transforms of a particular size.

--- a/platforms/cuda/include/CudaForceInfo.h
+++ b/platforms/cuda/include/CudaForceInfo.h
@@ -27,36 +27,18 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-#include "windowsExportCuda.h"
+#include "openmm/common/ComputeForceInfo.h"
+#include "openmm/common/windowsExportCommon.h"
 #include <vector>

 namespace OpenMM {

 /**
- * This class is used by the Cuda implementation of a Force class to convey information
- * about the behavior and requirements of that force.
+ * This class exists solely for backward compatibility.  It adds no features beyond the ones
+ * in ComputeForceInfo.
 */

-class OPENMM_EXPORT_CUDA CudaForceInfo {
-public:
-    CudaForceInfo() {
-    }
-    /**
-     * Get whether or not two particles have identical force field parameters.
-     */
-    virtual bool areParticlesIdentical(int particle1, int particle2);
-    /**
-     * Get the number of particle groups defined by this force.
-     */
-    virtual int getNumParticleGroups();
-    /**
-     * Get the list of particles in a particular group.
-     */
-    virtual void getParticlesInGroup(int index, std::vector<int>& particles);
-    /**
-     * Get whether two particle groups are identical.
-     */
-    virtual bool areGroupsIdentical(int group1, int group2);
+class OPENMM_EXPORT_COMMON CudaForceInfo : public ComputeForceInfo {
 };

 } // namespace OpenMM

--- a/platforms/cuda/include/CudaIntegrationUtilities.h
+++ b/platforms/cuda/include/CudaIntegrationUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -27,147 +27,48 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

+#include "CudaArray.h"
 #include "openmm/System.h"
-#include "CudaContext.h"
-#include "windowsExportCuda.h"
-#include <iosfwd>
+#include "openmm/common/IntegrationUtilities.h"
+#include "openmm/common/windowsExportCommon.h"
+#include <cuda.h>
+#include <builtin_types.h>

 namespace OpenMM {

+class CudaContext;
+
 /**
 * This class implements features that are used by many different integrators, including
 * common workspace arrays, random number generation, and enforcing constraints.
 */

-class OPENMM_EXPORT_CUDA CudaIntegrationUtilities {
+class OPENMM_EXPORT_COMMON CudaIntegrationUtilities : public IntegrationUtilities {
 public:
    CudaIntegrationUtilities(CudaContext& context, const System& system);
    ~CudaIntegrationUtilities();
    /**
     * Get the array which contains position deltas.
     */
-    CudaArray& getPosDelta() {
-        return posDelta;
-    }
+    CudaArray& getPosDelta();
    /**
     * Get the array which contains random values.  Each element is a float4, whose components
     * are independent, normally distributed random numbers with mean 0 and variance 1.
     */
-    CudaArray& getRandom() {
-        return random;
-    }
+    CudaArray& getRandom();
    /**
     * Get the array which contains the current step size.
     */
-    CudaArray& getStepSize() {
-        return stepSize;
-    }
-    /**
-     * Set the size to use for the next step.
-     */
-    void setNextStepSize(double size);
-    /**
-     * Get the size that was used for the last step.
-     */
-    double getLastStepSize();
-    /**
-     * Apply constraints to the atom positions.
-     *
-     * @param tol             the constraint tolerance
-     */
-    void applyConstraints(double tol);
-    /**
-     * Apply constraints to the atom velocities.
-     *
-     * @param tol             the constraint tolerance
-     */
-    void applyVelocityConstraints(double tol);
-    /**
-     * Initialize the random number generator.
-     */
-    void initRandomNumberGenerator(unsigned int randomNumberSeed);
-    /**
-     * Ensure that sufficient random numbers are available in the array, and generate new ones if not.
-     *
-     * @param numValues     the number of random float4's that will be required
-     * @return the index in the array at which to start reading
-     */
-    int prepareRandomNumbers(int numValues);
-    /**
-     * Compute the positions of virtual sites.
-     */
-    void computeVirtualSites();
+    CudaArray& getStepSize();
    /**
     * Distribute forces from virtual sites to the atoms they are based on.
     */
    void distributeForcesFromVirtualSites();
-    /**
-     * Create a checkpoint recording the current state of the random number generator.
-     * 
-     * @param stream    an output stream the checkpoint data should be written to
-     */
-    void createCheckpoint(std::ostream& stream);
-    /**
-     * Load a checkpoint that was written by createCheckpoint().
-     * 
-     * @param stream    an input stream the checkpoint data should be read from
-     */
-    void loadCheckpoint(std::istream& stream);
-    /**
-     * Compute the kinetic energy of the system, possibly shifting the velocities in time to account
-     * for a leapfrog integrator.
-     * 
-     * @param timeShift   the amount by which to shift the velocities in time
-     */
-    double computeKineticEnergy(double timeShift);
 private:
-    void applyConstraints(bool constrainVelocities, double tol);
-    CudaContext& context;
-    CUfunction settlePosKernel, settleVelKernel;
-    CUfunction shakePosKernel, shakeVelKernel;
-    CUfunction ccmaDirectionsKernel;
-    CUfunction ccmaPosForceKernel, ccmaVelForceKernel;
-    CUfunction ccmaMultiplyKernel;
-    CUfunction ccmaUpdateKernel;
-    CUfunction vsitePositionKernel, vsiteForceKernel;
-    CUfunction randomKernel, timeShiftKernel;
-    CudaArray posDelta;
-    CudaArray settleAtoms;
-    CudaArray settleParams;
-    CudaArray shakeAtoms;
-    CudaArray shakeParams;
-    CudaArray random;
-    CudaArray randomSeed;
-    CudaArray stepSize;
-    CudaArray ccmaAtoms;
-    CudaArray ccmaDistance;
-    CudaArray ccmaReducedMass;
-    CudaArray ccmaAtomConstraints;
-    CudaArray ccmaNumAtomConstraints;
-    CudaArray ccmaConstraintMatrixColumn;
-    CudaArray ccmaConstraintMatrixValue;
-    CudaArray ccmaDelta1;
-    CudaArray ccmaDelta2;
-    CudaArray ccmaConverged;
+    void applyConstraintsImpl(bool constrainVelocities, double tol);
    int* ccmaConvergedMemory;
    CUdeviceptr ccmaConvergedDeviceMemory;
    CUevent ccmaEvent;
-    CudaArray vsite2AvgAtoms;
-    CudaArray vsite2AvgWeights;
-    CudaArray vsite3AvgAtoms;
-    CudaArray vsite3AvgWeights;
-    CudaArray vsiteOutOfPlaneAtoms;
-    CudaArray vsiteOutOfPlaneWeights;
-    CudaArray vsiteLocalCoordsIndex;
-    CudaArray vsiteLocalCoordsAtoms;
-    CudaArray vsiteLocalCoordsWeights;
-    CudaArray vsiteLocalCoordsPos;
-    CudaArray vsiteLocalCoordsStartIndex;
-    int randomPos;
-    int lastSeed, numVsites;
-    double2 lastStepSize;
-    struct ShakeCluster;
-    struct ConstraintOrderer;
 };

 } // namespace OpenMM

--- a/platforms/cuda/include/CudaKernel.h
+++ b/platforms/cuda/include/CudaKernel.h
+#ifndef OPENMM_CUDAKERNEL_H_
+#define OPENMM_CUDAKERNEL_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaArray.h"
+#include "CudaContext.h"
+#include <string>
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * This is the CUDA implementation of the ComputeKernelImpl interface.  It holds a
+ * CUfunction together with the CudaContext it belongs to and the name of the kernel
+ * function.  Arguments are supplied through the protected add*Arg() and set*Arg() methods.
+ */
+
+class CudaKernel : public ComputeKernelImpl {
+public:
+    /**
+     * Create a new CudaKernel.
+     *
+     * @param context      the context this kernel belongs to
+     * @param kernel       the kernel to be invoked
+     * @param name         the name of the kernel function
+     */
+    CudaKernel(CudaContext& context, CUfunction kernel, const std::string& name);
+    /**
+     * Get the name of this kernel.
+     */
+    std::string getName() const;
+    /**
+     * Execute this kernel.
+     *
+     * @param threads      the maximum number of threads that should be used.  Depending on the
+     *                     computing device, it may choose to use fewer threads than this number.
+     * @param blockSize    the number of threads in each thread block.  If this is omitted, a
+     *                     default size that is appropriate for the computing device is used.
+     */
+    void execute(int threads, int blockSize=-1);
+protected:
+    /**
+     * Add an argument to pass to the kernel when it is invoked, where the value is a
+     * subclass of ArrayInterface.
+     *
+     * @param value     the value to pass to the kernel
+     */
+    void addArrayArg(ArrayInterface& value);
+    /**
+     * Add an argument to pass to the kernel when it is invoked, where the value is a primitive type.
+     *
+     * @param value    a pointer to the argument value
+     * @param size     the size of the value in bytes
+     */
+    void addPrimitiveArg(const void* value, int size);
+    /**
+     * Add a placeholder for an argument without specifying its value.
+     */
+    void addEmptyArg();
+    /**
+     * Set the value of an argument to pass to the kernel when it is invoked, where the
+     * value is a subclass of ArrayInterface.
+     *
+     * @param index     the index of the argument to set
+     * @param value     the value to pass to the kernel
+     */
+    void setArrayArg(int index, ArrayInterface& value);
+    /**
+     * Set the value of an argument to pass to the kernel when it is invoked, where the value is a primitive type.
+     *
+     * @param index     the index of the argument to set
+     * @param value     a pointer to the argument value
+     * @param size      the size of the value in bytes
+     */
+    void setPrimitiveArg(int index, const void* value, int size);
+private:
+    CudaContext& context;
+    CUfunction kernel;
+    std::string name;
+    std::vector<double4> primitiveArgs;
+    std::vector<CudaArray*> arrayArgs;
+    std::vector<void*> argPointers;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CUDAKERNEL_H_*/
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -259,350 +259,6 @@ private:
    CudaContext& cu;
 };

-/**
- * This kernel is invoked by HarmonicBondForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcHarmonicBondForceKernel : public CalcHarmonicBondForceKernel {
-public:
-    CudaCalcHarmonicBondForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcHarmonicBondForceKernel(name, platform),
-            hasInitializedKernel(false), cu(cu), system(system) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the HarmonicBondForce this kernel will be used for
-     */
-    void initialize(const System& system, const HarmonicBondForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the HarmonicBondForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force);
-private:
-    class ForceInfo;
-    int numBonds;
-    bool hasInitializedKernel;
-    CudaContext& cu;
-    ForceInfo* info;
-    const System& system;
-    CudaArray params;
-};
-
-/**
- * This kernel is invoked by CustomBondForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcCustomBondForceKernel : public CalcCustomBondForceKernel {
-public:
-    CudaCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomBondForceKernel(name, platform),
-            hasInitializedKernel(false), cu(cu), system(system), params(NULL) {
-    }
-    ~CudaCalcCustomBondForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomBondForce this kernel will be used for
-     */
-    void initialize(const System& system, const CustomBondForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomBondForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomBondForce& force);
-private:
-    class ForceInfo;
-    int numBonds;
-    bool hasInitializedKernel;
-    CudaContext& cu;
-    ForceInfo* info;
-    const System& system;
-    CudaParameterSet* params;
-    CudaArray globals;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-};
-
-/**
- * This kernel is invoked by HarmonicAngleForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcHarmonicAngleForceKernel : public CalcHarmonicAngleForceKernel {
-public:
-    CudaCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcHarmonicAngleForceKernel(name, platform),
-            hasInitializedKernel(false), cu(cu), system(system) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the HarmonicAngleForce this kernel will be used for
-     */
-    void initialize(const System& system, const HarmonicAngleForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the HarmonicAngleForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force);
-private:
-    class ForceInfo;
-    int numAngles;
-    bool hasInitializedKernel;
-    CudaContext& cu;
-    ForceInfo* info;
-    const System& system;
-    CudaArray params;
-};
-
-/**
- * This kernel is invoked by CustomAngleForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcCustomAngleForceKernel : public CalcCustomAngleForceKernel {
-public:
-    CudaCalcCustomAngleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomAngleForceKernel(name, platform),
-            hasInitializedKernel(false), cu(cu), system(system), params(NULL) {
-    }
-    ~CudaCalcCustomAngleForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomAngleForce this kernel will be used for
-     */
-    void initialize(const System& system, const CustomAngleForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomAngleForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomAngleForce& force);
-private:
-    class ForceInfo;
-    int numAngles;
-    bool hasInitializedKernel;
-    CudaContext& cu;
-    ForceInfo* info;
-    const System& system;
-    CudaParameterSet* params;
-    CudaArray globals;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-};
-
-/**
- * This kernel is invoked by PeriodicTorsionForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcPeriodicTorsionForceKernel : public CalcPeriodicTorsionForceKernel {
-public:
-    CudaCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcPeriodicTorsionForceKernel(name, platform),
-            hasInitializedKernel(false), cu(cu), system(system) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the PeriodicTorsionForce this kernel will be used for
-     */
-    void initialize(const System& system, const PeriodicTorsionForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the PeriodicTorsionForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force);
-private:
-    class ForceInfo;
-    int numTorsions;
-    bool hasInitializedKernel;
-    CudaContext& cu;
-    ForceInfo* info;
-    const System& system;
-    CudaArray params;
-};
-
-/**
- * This kernel is invoked by RBTorsionForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcRBTorsionForceKernel : public CalcRBTorsionForceKernel {
-public:
-    CudaCalcRBTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcRBTorsionForceKernel(name, platform),
-            hasInitializedKernel(false), cu(cu), system(system) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the RBTorsionForce this kernel will be used for
-     */
-    void initialize(const System& system, const RBTorsionForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the RBTorsionForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const RBTorsionForce& force);
-private:
-    class ForceInfo;
-    int numTorsions;
-    bool hasInitializedKernel;
-    CudaContext& cu;
-    ForceInfo* info;
-    const System& system;
-    CudaArray params1;
-    CudaArray params2;
-};
-
-/**
- * This kernel is invoked by CMAPTorsionForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcCMAPTorsionForceKernel : public CalcCMAPTorsionForceKernel {
-public:
-    CudaCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCMAPTorsionForceKernel(name, platform),
-            hasInitializedKernel(false), cu(cu), system(system) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CMAPTorsionForce this kernel will be used for
-     */
-    void initialize(const System& system, const CMAPTorsionForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CMAPTorsionForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force);
-private:
-    class ForceInfo;
-    int numTorsions;
-    bool hasInitializedKernel;
-    CudaContext& cu;
-    ForceInfo* info;
-    const System& system;
-    std::vector<int2> mapPositionsVec;
-    CudaArray coefficients;
-    CudaArray mapPositions;
-    CudaArray torsionMaps;
-};
-
-/**
- * This kernel is invoked by CustomTorsionForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcCustomTorsionForceKernel : public CalcCustomTorsionForceKernel {
-public:
-    CudaCalcCustomTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomTorsionForceKernel(name, platform),
-            hasInitializedKernel(false), cu(cu), system(system), params(NULL) {
-    }
-    ~CudaCalcCustomTorsionForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomTorsionForce this kernel will be used for
-     */
-    void initialize(const System& system, const CustomTorsionForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomTorsionForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force);
-private:
-    class ForceInfo;
-    int numTorsions;
-    bool hasInitializedKernel;
-    CudaContext& cu;
-    ForceInfo* info;
-    const System& system;
-    CudaParameterSet* params;
-    CudaArray globals;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-};
-
 /**
 * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
 */
@@ -737,539 +393,48 @@ private:
 };

 /**
- * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
+ * This kernel is invoked by CustomCVForce to calculate the forces acting on the system and the energy of the system.
 */
-class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
+class CudaCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
 public:
-    CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomNonbondedForceKernel(name, platform),
-            cu(cu), params(NULL), forceCopy(NULL), system(system), hasInitializedKernel(false) {
+    CudaCalcCustomCVForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcCustomCVForceKernel(name, platform),
+            cu(cu), hasInitializedListeners(false) {
    }
-    ~CudaCalcCustomNonbondedForceKernel();
    /**
     * Initialize the kernel.
     *
     * @param system     the System this kernel will be applied to
-     * @param force      the CustomNonbondedForce this kernel will be used for
+     * @param force      the CustomCVForce this kernel will be used for
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
     */
-    void initialize(const System& system, const CustomNonbondedForce& force);
+    void initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext);
    /**
     * Execute the kernel to calculate the forces and/or energy.
     *
     * @param context        the context in which to execute this kernel
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
     * @param includeForces  true if forces should be calculated
     * @param includeEnergy  true if the energy should be calculated
     * @return the potential energy due to the force
     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    double execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy);
+    /**
+     * Copy state information to the inner context.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     */
+    void copyState(ContextImpl& context, ContextImpl& innerContext);
    /**
     * Copy changed parameters over to a context.
     *
     * @param context    the context to copy parameters to
-     * @param force      the CustomNonbondedForce to copy the parameters from
+     * @param force      the CustomCVForce to copy the parameters from
     */
-    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
+    void copyParametersToContext(ContextImpl& context, const CustomCVForce& force);
 private:
    class ForceInfo;
-    void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource, const std::vector<std::string>& tableTypes);
-    CudaContext& cu;
-    ForceInfo* info;
-    CudaParameterSet* params;
-    CudaArray globals;
-    CudaArray interactionGroupData, filteredGroupData, numGroupTiles;
-    CUfunction interactionGroupKernel, prepareNeighborListKernel, buildNeighborListKernel;
-    std::vector<void*> interactionGroupArgs, prepareNeighborListArgs, buildNeighborListArgs;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-    std::vector<CudaArray> tabulatedFunctions;
-    double longRangeCoefficient;
-    std::vector<double> longRangeCoefficientDerivs;
-    bool hasInitializedLongRangeCorrection, hasInitializedKernel, hasParamDerivs, useNeighborList;
-    int numGroupThreadBlocks;
-    CustomNonbondedForce* forceCopy;
-    const System& system;
-};
-
-/**
- * This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
- */
-class CudaCalcGBSAOBCForceKernel : public CalcGBSAOBCForceKernel {
-public:
-    CudaCalcGBSAOBCForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcGBSAOBCForceKernel(name, platform), cu(cu),
-            hasCreatedKernels(false) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the GBSAOBCForce this kernel will be used for
-     */
-    void initialize(const System& system, const GBSAOBCForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the GBSAOBCForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
-private:
-    class ForceInfo;
-    double prefactor, surfaceAreaFactor, cutoff;
-    bool hasCreatedKernels;
-    int maxTiles;
-    CudaContext& cu;
-    ForceInfo* info;
-    CudaArray charges;
-    CudaArray params;
-    CudaArray bornSum;
-    CudaArray bornRadii;
-    CudaArray bornForce;
-    CudaArray obcChain;
-    CUfunction computeBornSumKernel;
-    CUfunction reduceBornSumKernel;
-    CUfunction force1Kernel;
-    CUfunction reduceBornForceKernel;
-    std::vector<void*> computeSumArgs, force1Args;
-};
-
-/**
- * This kernel is invoked by CustomGBForce to calculate the forces acting on the system.
- */
-class CudaCalcCustomGBForceKernel : public CalcCustomGBForceKernel {
-public:
-    CudaCalcCustomGBForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomGBForceKernel(name, platform),
-            hasInitializedKernels(false), cu(cu), params(NULL), computedValues(NULL), energyDerivs(NULL), energyDerivChain(NULL), system(system) {
-    }
-    ~CudaCalcCustomGBForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomGBForce this kernel will be used for
-     */
-    void initialize(const System& system, const CustomGBForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomGBForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
-private:
-    class ForceInfo;
-    double cutoff;
-    bool hasInitializedKernels, needParameterGradient, needEnergyParamDerivs;
-    int maxTiles, numComputedValues;
-    CudaContext& cu;
-    ForceInfo* info;
-    CudaParameterSet* params;
-    CudaParameterSet* computedValues;
-    CudaParameterSet* energyDerivs;
-    CudaParameterSet* energyDerivChain;
-    std::vector<CudaParameterSet*> dValuedParam;
-    std::vector<CudaArray> dValue0dParam;
-    CudaArray longEnergyDerivs;
-    CudaArray globals;
-    CudaArray valueBuffers;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-    std::vector<CudaArray> tabulatedFunctions;
-    std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue;
-    const System& system;
-    CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
-    std::vector<void*> pairValueArgs, perParticleValueArgs, pairEnergyArgs, perParticleEnergyArgs, gradientChainRuleArgs;
-    std::string pairValueSrc, pairEnergySrc;
-    std::map<std::string, std::string> pairValueDefines, pairEnergyDefines;
-};
-
-/**
- * This kernel is invoked by CustomExternalForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcCustomExternalForceKernel : public CalcCustomExternalForceKernel {
-public:
-    CudaCalcCustomExternalForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomExternalForceKernel(name, platform),
-            hasInitializedKernel(false), cu(cu), system(system), params(NULL) {
-    }
-    ~CudaCalcCustomExternalForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomExternalForce this kernel will be used for
-     */
-    void initialize(const System& system, const CustomExternalForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomExternalForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomExternalForce& force);
-private:
-    class ForceInfo;
-    int numParticles;
-    bool hasInitializedKernel;
-    CudaContext& cu;
-    ForceInfo* info;
-    const System& system;
-    CudaParameterSet* params;
-    CudaArray globals;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-};
-
-/**
- * This kernel is invoked by CustomHbondForce to calculate the forces acting on the system.
- */
-class CudaCalcCustomHbondForceKernel : public CalcCustomHbondForceKernel {
-public:
-    CudaCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomHbondForceKernel(name, platform),
-            hasInitializedKernel(false), cu(cu), donorParams(NULL), acceptorParams(NULL), system(system) {
-    }
-    ~CudaCalcCustomHbondForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomHbondForce this kernel will be used for
-     */
-    void initialize(const System& system, const CustomHbondForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomHbondForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomHbondForce& force);
-private:
-    class ForceInfo;
-    int numDonors, numAcceptors;
-    bool hasInitializedKernel;
-    CudaContext& cu;
-    ForceInfo* info;
-    CudaParameterSet* donorParams;
-    CudaParameterSet* acceptorParams;
-    CudaArray globals;
-    CudaArray donors;
-    CudaArray acceptors;
-    CudaArray donorExclusions;
-    CudaArray acceptorExclusions;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-    std::vector<CudaArray> tabulatedFunctions;
-    std::vector<void*> donorArgs, acceptorArgs;
-    const System& system;
-    CUfunction donorKernel, acceptorKernel;
-};
-
-/**
- * This kernel is invoked by CustomCentroidBondForce to calculate the forces acting on the system.
- */
-class CudaCalcCustomCentroidBondForceKernel : public CalcCustomCentroidBondForceKernel {
-public:
-    CudaCalcCustomCentroidBondForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomCentroidBondForceKernel(name, platform),
-            cu(cu), params(NULL), system(system) {
-    }
-    ~CudaCalcCustomCentroidBondForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomCentroidBondForce this kernel will be used for
-     */
-    void initialize(const System& system, const CustomCentroidBondForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomCentroidBondForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomCentroidBondForce& force);
-
-private:
-    class ForceInfo;
-    int numGroups, numBonds;
-    bool needEnergyParamDerivs;
-    CudaContext& cu;
-    ForceInfo* info;
-    CudaParameterSet* params;
-    CudaArray globals;
-    CudaArray groupParticles;
-    CudaArray groupWeights;
-    CudaArray groupOffsets;
-    CudaArray groupForces;
-    CudaArray bondGroups;
-    CudaArray centerPositions;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-    std::vector<CudaArray> tabulatedFunctions;
-    std::vector<void*> groupForcesArgs;
-    CUfunction computeCentersKernel, groupForcesKernel, applyForcesKernel;
-    const System& system;
-};
-
-/**
- * This kernel is invoked by CustomCompoundBondForce to calculate the forces acting on the system.
- */
-class CudaCalcCustomCompoundBondForceKernel : public CalcCustomCompoundBondForceKernel {
-public:
-    CudaCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomCompoundBondForceKernel(name, platform),
-            cu(cu), params(NULL), system(system) {
-    }
-    ~CudaCalcCustomCompoundBondForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomCompoundBondForce this kernel will be used for
-     */
-    void initialize(const System& system, const CustomCompoundBondForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomCompoundBondForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force);
-
-private:
-    class ForceInfo;
-    int numBonds;
-    CudaContext& cu;
-    ForceInfo* info;
-    CudaParameterSet* params;
-    CudaArray globals;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-    std::vector<CudaArray> tabulatedFunctions;
-    const System& system;
-};
-
-/**
- * This kernel is invoked by CustomManyParticleForce to calculate the forces acting on the system.
- */
-class CudaCalcCustomManyParticleForceKernel : public CalcCustomManyParticleForceKernel {
-public:
-    CudaCalcCustomManyParticleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomManyParticleForceKernel(name, platform),
-            hasInitializedKernel(false), cu(cu), params(NULL), system(system) {
-    }
-    ~CudaCalcCustomManyParticleForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomManyParticleForce this kernel will be used for
-     */
-    void initialize(const System& system, const CustomManyParticleForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomManyParticleForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomManyParticleForce& force);
-
-private:
-    class ForceInfo;
-    CudaContext& cu;
-    ForceInfo* info;
-    bool hasInitializedKernel;
-    NonbondedMethod nonbondedMethod;
-    int maxNeighborPairs, forceWorkgroupSize, findNeighborsWorkgroupSize;
-    CudaParameterSet* params;
-    CudaArray particleTypes;
-    CudaArray orderIndex;
-    CudaArray particleOrder;
-    CudaArray exclusions;
-    CudaArray exclusionStartIndex;
-    CudaArray blockCenter;
-    CudaArray blockBoundingBox;
-    CudaArray neighborPairs;
-    CudaArray numNeighborPairs;
-    CudaArray neighborStartIndex;
-    CudaArray numNeighborsForAtom;
-    CudaArray neighbors;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-    std::vector<CudaArray> tabulatedFunctions;
-    std::vector<void*> forceArgs, blockBoundsArgs, neighborsArgs, startIndicesArgs, copyPairsArgs;
-    const System& system;
-    CUfunction forceKernel, blockBoundsKernel, neighborsKernel, startIndicesKernel, copyPairsKernel;
-    CUdeviceptr globalsPtr;
-    CUevent event;
-};
-
-/**
- * This kernel is invoked by GayBerneForce to calculate the forces acting on the system.
- */
-class CudaCalcGayBerneForceKernel : public CalcGayBerneForceKernel {
-public:
-    CudaCalcGayBerneForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcGayBerneForceKernel(name, platform), cu(cu),
-            hasInitializedKernels(false) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the GayBerneForce this kernel will be used for
-     */
-    void initialize(const System& system, const GayBerneForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the GayBerneForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const GayBerneForce& force);
-private:
-    class ForceInfo;
-    class ReorderListener;
-    void sortAtoms();
-    CudaContext& cu;
-    ForceInfo* info;
-    bool hasInitializedKernels;
-    int numRealParticles, numExceptions, maxNeighborBlocks;
-    GayBerneForce::NonbondedMethod nonbondedMethod;
-    CudaArray sortedParticles;
-    CudaArray axisParticleIndices;
-    CudaArray sigParams;
-    CudaArray epsParams;
-    CudaArray scale;
-    CudaArray exceptionParticles;
-    CudaArray exceptionParams;
-    CudaArray aMatrix;
-    CudaArray bMatrix;
-    CudaArray gMatrix;
-    CudaArray exclusions;
-    CudaArray exclusionStartIndex;
-    CudaArray blockCenter;
-    CudaArray blockBoundingBox;
-    CudaArray neighbors;
-    CudaArray neighborIndex;
-    CudaArray neighborBlockCount;
-    CudaArray sortedPos;
-    CudaArray torque;
-    std::vector<bool> isRealParticle;
-    std::vector<std::pair<int, int> > exceptionAtoms;
-    std::vector<std::pair<int, int> > excludedPairs;
-    std::vector<void*> framesArgs, blockBoundsArgs, neighborsArgs, forceArgs, torqueArgs;
-    CUfunction framesKernel, blockBoundsKernel, neighborsKernel, forceKernel, torqueKernel;
-    CUevent event;
-};
-
-/**
- * This kernel is invoked by CustomCVForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
-public:
-    CudaCalcCustomCVForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcCustomCVForceKernel(name, platform),
-            cu(cu), hasInitializedListeners(false) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomCVForce this kernel will be used for
-     * @param innerContext   the context created by the CustomCVForce for computing collective variables
-     */
-    void initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param innerContext   the context created by the CustomCVForce for computing collective variables
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy);
-    /**
-     * Copy state information to the inner context.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param innerContext   the context created by the CustomCVForce for computing collective variables
-     */
-    void copyState(ContextImpl& context, ContextImpl& innerContext);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomCVForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomCVForce& force);
-private:
-    class ForceInfo;
-    class ReorderListener;
+    class ReorderListener;
    CudaContext& cu;
    bool hasInitializedListeners;
    Lepton::ExpressionProgram energyExpression;
@@ -1282,438 +447,107 @@ private:
    CUfunction copyStateKernel, copyForcesKernel, addForcesKernel;
 };

-/**
- * This kernel is invoked by RMSDForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcRMSDForceKernel : public CalcRMSDForceKernel {
-public:
-    CudaCalcRMSDForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcRMSDForceKernel(name, platform), cu(cu) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the RMSDForce this kernel will be used for
-     */
-    void initialize(const System& system, const RMSDForce& force);
-    /**
-     * Record the reference positions and particle indices.
-     */
-    void recordParameters(const RMSDForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * This is the internal implementation of execute(), templatized on whether we're
-     * using single or double precision.
-     */
-    template <class REAL>
-    double executeImpl(ContextImpl& context);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the RMSDForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const RMSDForce& force);
-private:
-    class ForceInfo;
-    CudaContext& cu;
-    ForceInfo* info;
-    double sumNormRef;
-    CudaArray referencePos;
-    CudaArray particles;
-    CudaArray buffer;
-    CUfunction kernel1, kernel2;
-};
-
-/**
- * This kernel is invoked by VerletIntegrator to take one time step.
+/**
+ * This kernel is invoked by NoseHooverIntegrator to take one time step.
 */
-class CudaIntegrateVerletStepKernel : public IntegrateVerletStepKernel {
+class CudaIntegrateVelocityVerletStepKernel : public IntegrateVelocityVerletStepKernel {
 public:
-    CudaIntegrateVerletStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateVerletStepKernel(name, platform), cu(cu) {
-    }
+    CudaIntegrateVelocityVerletStepKernel(std::string name, const Platform& platform, CudaContext& cu) :
+                                  IntegrateVelocityVerletStepKernel(name, platform), cu(cu) { }
+    ~CudaIntegrateVelocityVerletStepKernel() {}
    /**
     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param integrator the VerletIntegrator this kernel will be used for
-     */
-    void initialize(const System& system, const VerletIntegrator& integrator);
-    /**
-     * Execute the kernel.
-     *
-     * @param context    the context in which to execute this kernel
-     * @param integrator the VerletIntegrator this kernel is being used for
-     */
-    void execute(ContextImpl& context, const VerletIntegrator& integrator);
-    /**
-     * Compute the kinetic energy.
-     * 
-     * @param context    the context in which to execute this kernel
-     * @param integrator the VerletIntegrator this kernel is being used for
-     */
-    double computeKineticEnergy(ContextImpl& context, const VerletIntegrator& integrator);
-private:
-    CudaContext& cu;
-    CUfunction kernel1, kernel2;
-};
-
-/**
- * This kernel is invoked by LangevinIntegrator to take one time step.
- */
-class CudaIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
-public:
-    CudaIntegrateLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateLangevinStepKernel(name, platform), cu(cu) {
-    }
-    /**
-     * Initialize the kernel, setting up the particle masses.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param integrator the LangevinIntegrator this kernel will be used for
-     */
-    void initialize(const System& system, const LangevinIntegrator& integrator);
-    /**
-     * Execute the kernel.
-     *
-     * @param context    the context in which to execute this kernel
-     * @param integrator the LangevinIntegrator this kernel is being used for
-     */
-    void execute(ContextImpl& context, const LangevinIntegrator& integrator);
-    /**
-     * Compute the kinetic energy.
-     * 
-     * @param context    the context in which to execute this kernel
-     * @param integrator the LangevinIntegrator this kernel is being used for
-     */
-    double computeKineticEnergy(ContextImpl& context, const LangevinIntegrator& integrator);
-private:
-    CudaContext& cu;
-    double prevTemp, prevFriction, prevStepSize;
-    CudaArray params;
-    CUfunction kernel1, kernel2;
-};
-
-/**
- * This kernel is invoked by BAOABLangevinIntegrator to take one time step.
- */
-class CudaIntegrateBAOABStepKernel : public IntegrateBAOABStepKernel {
-public:
-    CudaIntegrateBAOABStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateBAOABStepKernel(name, platform), cu(cu) {
-    }
-    /**
-     * Initialize the kernel, setting up the particle masses.
     * 
     * @param system     the System this kernel will be applied to
-     * @param integrator the BAOABLangevinIntegrator this kernel will be used for
+     * @param integrator the NoseHooverIntegrator this kernel will be used for
     */
-    void initialize(const System& system, const BAOABLangevinIntegrator& integrator);
+    void initialize(const System& system, const NoseHooverIntegrator& integrator);
    /**
     * Execute the kernel.
     * 
     * @param context    the context in which to execute this kernel
-     * @param integrator the BAOABLangevinIntegrator this kernel is being used for
-     * @param forcesAreValid if the context has been modified since the last time step, this will be
-     *                       false to show that cached forces are invalid and must be recalculated.
-     *                       On exit, this should specify whether the cached forces are valid at the
-     *                       end of the step.
+     * @param integrator the NoseHooverIntegrator this kernel is being used for
+     * @param forcesAreValid a reference to the parent integrator's boolean for keeping
+     *                       track of the validity of the current forces.
     */
-    void execute(ContextImpl& context, const BAOABLangevinIntegrator& integrator, bool& forcesAreValid);
+    void execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid);
    /**
     * Compute the kinetic energy.
     * 
     * @param context    the context in which to execute this kernel
-     * @param integrator the BAOABLangevinIntegrator this kernel is being used for
+     * @param integrator the NoseHooverIntegrator this kernel is being used for
     */
-    double computeKineticEnergy(ContextImpl& context, const BAOABLangevinIntegrator& integrator);
+    double computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator);
 private:
    CudaContext& cu;
-    double prevTemp, prevFriction, prevStepSize;
-    CudaArray params, oldDelta;
-    CUfunction kernel1, kernel2, kernel3, kernel4;
+    float prevMaxPairDistance;
+    CudaArray maxPairDistanceBuffer, pairListBuffer, atomListBuffer, pairTemperatureBuffer;
+    CUfunction kernel1, kernel2, kernel3, kernelHardWall;
 };

 /**
- * This kernel is invoked by BrownianIntegrator to take one time step.
+ * This kernel is invoked by NoseHooverChain at the start of each time step to adjust the thermostat
+ * and update the associated particle velocities.
 */
-class CudaIntegrateBrownianStepKernel : public IntegrateBrownianStepKernel {
+class CudaNoseHooverChainKernel : public NoseHooverChainKernel {
 public:
-    CudaIntegrateBrownianStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateBrownianStepKernel(name, platform), cu(cu) {
+    CudaNoseHooverChainKernel(std::string name, const Platform& platform, CudaContext& cu) : NoseHooverChainKernel(name, platform), cu(cu) {
    }
+    ~CudaNoseHooverChainKernel() {}
    /**
     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param integrator the BrownianIntegrator this kernel will be used for
-     */
-    void initialize(const System& system, const BrownianIntegrator& integrator);
-    /**
-     * Execute the kernel.
-     *
-     * @param context    the context in which to execute this kernel
-     * @param integrator the BrownianIntegrator this kernel is being used for
     */
-    void execute(ContextImpl& context, const BrownianIntegrator& integrator);
+    void initialize();
    /**
-     * Compute the kinetic energy.
+     * Execute the kernel that propagates the Nose Hoover chain and determines the velocity scale factor.
     * 
-     * @param context    the context in which to execute this kernel
-     * @param integrator the BrownianIntegrator this kernel is being used for
-     */
-    double computeKineticEnergy(ContextImpl& context, const BrownianIntegrator& integrator);
-private:
-    CudaContext& cu;
-    double prevTemp, prevFriction, prevStepSize;
-    CUfunction kernel1, kernel2;
-};
-
-/**
- * This kernel is invoked by VariableVerletIntegrator to take one time step.
- */
-class CudaIntegrateVariableVerletStepKernel : public IntegrateVariableVerletStepKernel {
-public:
-    CudaIntegrateVariableVerletStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateVariableVerletStepKernel(name, platform), cu(cu) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param integrator the VariableVerletIntegrator this kernel will be used for
+     * @param context  the context in which to execute this kernel
+     * @param nhc the object describing the chain to be propagated.
+     * @param kineticEnergies the {absolute, relative} kineticEnergy of the particles being thermostated by this chain.
+     * @param timeStep the time step used by the integrator.
+     * @return the {absolute, relative} velocity scale factor to apply to the particles associated with this heat bath.
     */
-    void initialize(const System& system, const VariableVerletIntegrator& integrator);
+    std::pair<double, double> propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep);
    /**
-     * Execute the kernel.
+     * Execute the kernel that computes the total (kinetic + potential) heat bath energy.
     *
-     * @param context    the context in which to execute this kernel
-     * @param integrator the VariableVerletIntegrator this kernel is being used for
-     * @param maxTime    the maximum time beyond which the simulation should not be advanced
-     * @return the size of the step that was taken
+     * @param context the context in which to execute this kernel
+     * @param nhc the chain whose energy is to be determined.
+     * @return the total heat bath energy.
     */
-    double execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime);
-    /**
-     * Compute the kinetic energy.
-     * 
-     * @param context    the context in which to execute this kernel
-     * @param integrator the VariableVerletIntegrator this kernel is being used for
-     */
-    double computeKineticEnergy(ContextImpl& context, const VariableVerletIntegrator& integrator);
-private:
-    CudaContext& cu;
-    int blockSize;
-    CUfunction kernel1, kernel2, selectSizeKernel;
-};
-
-/**
- * This kernel is invoked by VariableLangevinIntegrator to take one time step.
- */
-class CudaIntegrateVariableLangevinStepKernel : public IntegrateVariableLangevinStepKernel {
-public:
-    CudaIntegrateVariableLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateVariableLangevinStepKernel(name, platform), cu(cu) {
-    }
+    double computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc);
    /**
-     * Initialize the kernel, setting up the particle masses.
+     * Execute the kernel that computes the kinetic energy for a subset of atoms,
+     * or the relative kinetic energy of Drude particles with respect to their parent atoms
     *
-     * @param system     the System this kernel will be applied to
-     * @param integrator the VariableLangevinIntegrator this kernel will be used for
-     */
-    void initialize(const System& system, const VariableLangevinIntegrator& integrator);
-    /**
-     * Execute the kernel.
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose energy is to be determined.
+     * @param downloadValue whether the computed value should be downloaded and returned.
     *
-     * @param context    the context in which to execute this kernel
-     * @param integrator the VariableLangevinIntegrator this kernel is being used for
-     * @param maxTime    the maximum time beyond which the simulation should not be advanced
-     * @return the size of the step that was taken
-     */
-    double execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime);
-    /**
-     * Compute the kinetic energy.
-     * 
-     * @param context    the context in which to execute this kernel
-     * @param integrator the VariableLangevinIntegrator this kernel is being used for
     */
-    double computeKineticEnergy(ContextImpl& context, const VariableLangevinIntegrator& integrator);
-private:
-    CudaContext& cu;
-    int blockSize;
-    CudaArray params;
-    CUfunction kernel1, kernel2, selectSizeKernel;
-    double prevTemp, prevFriction, prevErrorTol;
-};
+    std::pair<double,double> computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain, bool downloadValue);

-/**
- * This kernel is invoked by CustomIntegrator to take one time step.
- */
-class CudaIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
-public:
-    enum GlobalTargetType {DT, VARIABLE, PARAMETER};
-    CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateCustomStepKernel(name, platform), cu(cu),
-            hasInitializedKernels(false), needsEnergyParamDerivs(false) {
-    }
    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param integrator the CustomIntegrator this kernel will be used for
-     */
-    void initialize(const System& system, const CustomIntegrator& integrator);
-    /**
-     * Execute the kernel.
-     * 
-     * @param context    the context in which to execute this kernel
-     * @param integrator the CustomIntegrator this kernel is being used for
-     * @param forcesAreValid if the context has been modified since the last time step, this will be
-     *                       false to show that cached forces are invalid and must be recalculated.
-     *                       On exit, this should specify whether the cached forces are valid at the
-     *                       end of the step.
-     */
-    void execute(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
-    /**
-     * Compute the kinetic energy.
-     * 
-     * @param context    the context in which to execute this kernel
-     * @param integrator the CustomIntegrator this kernel is being used for
-     * @param forcesAreValid if the context has been modified since the last time step, this will be
-     *                       false to show that cached forces are invalid and must be recalculated.
-     *                       On exit, this should specify whether the cached forces are valid at the
-     *                       end of the step.
-     */
-    double computeKineticEnergy(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
-    /**
-     * Get the values of all global variables.
-     *
-     * @param context   the context in which to execute this kernel
-     * @param values    on exit, this contains the values
-     */
-    void getGlobalVariables(ContextImpl& context, std::vector<double>& values) const;
-    /**
-     * Set the values of all global variables.
-     *
-     * @param context   the context in which to execute this kernel
-     * @param values    a vector containing the values
-     */
-    void setGlobalVariables(ContextImpl& context, const std::vector<double>& values);
-    /**
-     * Get the values of a per-DOF variable.
+     * Execute the kernel that scales the velocities of particles associated with a Nose-Hoover chain.
     *
-     * @param context   the context in which to execute this kernel
-     * @param variable  the index of the variable to get
-     * @param values    on exit, this contains the values
+     * @param context the context in which to execute this kernel
+     * @param noseHooverChain the chain whose associated particles will have their velocities scaled
+     * @param scaleFactors the {absolute, relative} multiplicative factors by which velocities are scaled
     */
-    void getPerDofVariable(ContextImpl& context, int variable, std::vector<Vec3>& values) const;
-    /**
-     * Set the values of a per-DOF variable.
-     *
-     * @param context   the context in which to execute this kernel
-     * @param variable  the index of the variable to get
-     * @param values    a vector containing the values
-     */
-    void setPerDofVariable(ContextImpl& context, int variable, const std::vector<Vec3>& values);
-private:
-    class ReorderListener;
-    class GlobalTarget;
-    class DerivFunction;
-    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator,
-        const std::string& forceName, const std::string& energyName, std::vector<const TabulatedFunction*>& functions,
-        std::vector<std::pair<std::string, std::string> >& functionNames);
-    void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
-    Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context);
-    void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes);
-    void recordGlobalValue(double value, GlobalTarget target, CustomIntegrator& integrator);
-    void recordChangedParameters(ContextImpl& context);
-    bool evaluateCondition(int step);
-    CudaContext& cu;
-    double energy;
-    float energyFloat;
-    int numGlobalVariables, sumWorkGroupSize;
-    bool hasInitializedKernels, deviceGlobalsAreCurrent, modifiesParameters, hasAnyConstraints, needsEnergyParamDerivs;
-    std::vector<bool> deviceValuesAreCurrent;
-    mutable std::vector<bool> localValuesAreCurrent; 
-    CudaArray globalValues;
-    CudaArray sumBuffer;
-    CudaArray summedValue;
-    CudaArray uniformRandoms;
-    CudaArray randomSeed;
-    CudaArray perDofEnergyParamDerivs;
-    std::vector<CudaArray> tabulatedFunctions, perDofValues;
-    std::map<int, double> savedEnergy;
-    std::map<int, CudaArray> savedForces;
-    std::set<int> validSavedForces;
-    mutable std::vector<std::vector<float4> > localPerDofValuesFloat;
-    mutable std::vector<std::vector<double4> > localPerDofValuesDouble;
-    std::map<std::string, double> energyParamDerivs;
-    std::vector<std::string> perDofEnergyParamDerivNames;
-    std::vector<double> localPerDofEnergyParamDerivs;
-    std::vector<double> localGlobalValues;
-    std::vector<double> initialGlobalVariables;
-    std::vector<std::vector<CUfunction> > kernels;
-    std::vector<std::vector<std::vector<void*> > > kernelArgs;
-    std::vector<void*> kineticEnergyArgs;
-    CUfunction randomKernel, kineticEnergyKernel, sumKineticEnergyKernel;
-    std::vector<CustomIntegrator::ComputationType> stepType;
-    std::vector<CustomIntegratorUtilities::Comparison> comparisons;
-    std::vector<std::vector<Lepton::CompiledExpression> > globalExpressions;
-    CompiledExpressionSet expressionSet;
-    std::vector<bool> needsGlobals;
-    std::vector<bool> needsForces;
-    std::vector<bool> needsEnergy;
-    std::vector<bool> computeBothForceAndEnergy;
-    std::vector<bool> invalidatesForces;
-    std::vector<bool> merged;
-    std::vector<int> forceGroupFlags;
-    std::vector<int> blockEnd;
-    std::vector<int> requiredGaussian;
-    std::vector<int> requiredUniform;
-    std::vector<int> stepEnergyVariableIndex;
-    std::vector<int> globalVariableIndex;
-    std::vector<int> parameterVariableIndex;
-    int gaussianVariableIndex, uniformVariableIndex, dtVariableIndex;
-    std::vector<std::string> parameterNames;
-    std::vector<GlobalTarget> stepTarget;
-};
+    void scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, std::pair<double, double> scaleFactors);

-class CudaIntegrateCustomStepKernel::GlobalTarget {
-public:
-    CudaIntegrateCustomStepKernel::GlobalTargetType type;
-    int variableIndex;
-    GlobalTarget() {
-    }
-    GlobalTarget(CudaIntegrateCustomStepKernel::GlobalTargetType type, int variableIndex) : type(type), variableIndex(variableIndex) {
-    }
-};
-
-/**
- * This kernel is invoked by AndersenThermostat at the start of each time step to adjust the particle velocities.
- */
-class CudaApplyAndersenThermostatKernel : public ApplyAndersenThermostatKernel {
-public:
-    CudaApplyAndersenThermostatKernel(std::string name, const Platform& platform, CudaContext& cu) : ApplyAndersenThermostatKernel(name, platform), cu(cu) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param thermostat the AndersenThermostat this kernel will be used for
-     */
-    void initialize(const System& system, const AndersenThermostat& thermostat);
-    /**
-     * Execute the kernel.
-     *
-     * @param context    the context in which to execute this kernel
-     */
-    void execute(ContextImpl& context);
 private:
+    int sumWorkGroupSize;
    CudaContext& cu;
-    int randomSeed;
-    CudaArray atomGroups;
-    CUfunction kernel;
+    CudaArray energyBuffer, scaleFactorBuffer, kineticEnergyBuffer, chainMasses, chainForces, heatBathEnergy;
+    std::map<int, CudaArray> atomlists, pairlists;
+    std::map<int, CUfunction> propagateKernels;
+    CUfunction reduceEnergyKernel;
+    CUfunction computeHeatBathEnergyKernel;
+    CUfunction computeAtomsKineticEnergyKernel;
+    CUfunction computePairsKineticEnergyKernel;
+    CUfunction scaleAtomsVelocitiesKernel;
+    CUfunction scalePairsVelocitiesKernel;
 };

 /**
@@ -1763,33 +597,6 @@ private:
    std::vector<int> lastAtomOrder;
 };

-/**
- * This kernel is invoked to remove center of mass motion from the system.
- */
-class CudaRemoveCMMotionKernel : public RemoveCMMotionKernel {
-public:
-    CudaRemoveCMMotionKernel(std::string name, const Platform& platform, CudaContext& cu) : RemoveCMMotionKernel(name, platform), cu(cu) {
-    }
-    /**
-     * Initialize the kernel, setting up the particle masses.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CMMotionRemover this kernel will be used for
-     */
-    void initialize(const System& system, const CMMotionRemover& force);
-    /**
-     * Execute the kernel.
-     *
-     * @param context    the context in which to execute this kernel
-     */
-    void execute(ContextImpl& context);
-private:
-    CudaContext& cu;
-    int frequency;
-    CudaArray cmMomentum;
-    CUfunction kernel1, kernel2;
-};
-
 } // namespace OpenMM

 #endif /*OPENMM_CUDAKERNELS_H_*/

--- a/platforms/cuda/include/CudaNonbondedUtilities.h
+++ b/platforms/cuda/include/CudaNonbondedUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -27,15 +27,18 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-#include "CudaContext.h"
 #include "openmm/System.h"
+#include "CudaArray.h"
 #include "CudaExpressionUtilities.h"
+#include "openmm/common/NonbondedUtilities.h"
+#include <cuda.h>
 #include <sstream>
 #include <string>
 #include <vector>

 namespace OpenMM {
    
+class CudaContext;
 class CudaSort;

 /**
@@ -63,11 +66,23 @@ class CudaSort;
 * by ForceImpls during calcForcesAndEnergy().
 */

-class OPENMM_EXPORT_CUDA CudaNonbondedUtilities {
+class OPENMM_EXPORT_COMMON CudaNonbondedUtilities : public NonbondedUtilities  {
 public:
    class ParameterInfo;
    CudaNonbondedUtilities(CudaContext& context);
    ~CudaNonbondedUtilities();
+    /**
+     * Add a nonbonded interaction to be evaluated by the default interaction kernel.
+     *
+     * @param usesCutoff       specifies whether a cutoff should be applied to this interaction
+     * @param usesPeriodic     specifies whether periodic boundary conditions should be applied to this interaction
+     * @param usesExclusions   specifies whether this interaction uses exclusions.  If this is true, it must have identical exclusions to every other interaction.
+     * @param cutoffDistance   the cutoff distance for this interaction (ignored if usesCutoff is false)
+     * @param exclusionList    for each atom, specifies the list of other atoms whose interactions should be excluded
+     * @param kernel           the code to evaluate the interaction
+     * @param forceGroup       the force group in which the interaction should be calculated
+     */
+    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup);
    /**
     * Add a nonbonded interaction to be evaluated by the default interaction kernel.
     *
@@ -80,14 +95,26 @@ public:
     * @param forceGroup       the force group in which the interaction should be calculated
     * @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
     */
-    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool supportsPairList=false);
+    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool supportsPairList);
    /**
     * Add a per-atom parameter that the default interaction kernel may depend on.
     */
+    void addParameter(ComputeParameterInfo parameter);
+    /**
+     * Add a per-atom parameter that the default interaction kernel may depend on.
+     * 
+     * @deprecated Use the version that takes a ComputeParameterInfo instead.
+     */
    void addParameter(const ParameterInfo& parameter);
    /**
     * Add an array (other than a per-atom parameter) that should be passed as an argument to the default interaction kernel.
     */
+    void addArgument(ComputeParameterInfo parameter);
+    /**
+     * Add an array (other than a per-atom parameter) that should be passed as an argument to the default interaction kernel.
+     * 
+     * @deprecated Use the version that takes a ComputeParameterInfo instead.
+     */
    void addArgument(const ParameterInfo& parameter);
    /**
     * Register that the interaction kernel will be computing the derivative of the potential energy
@@ -108,6 +135,12 @@ public:
     * Initialize this object in preparation for a simulation.
     */
    void initialize(const System& system);
+    /**
+     * Get the number of force buffers required for nonbonded forces.  This implementation always returns 0.
+     */
+    int getNumForceBuffers() const {
+        return 0;
+    }
    /**
     * Get the number of energy buffers required for nonbonded forces.
     */

--- a/platforms/cuda/include/CudaParallelKernels.h
+++ b/platforms/cuda/include/CudaParallelKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -30,6 +30,7 @@
 #include "CudaPlatform.h"
 #include "CudaContext.h"
 #include "CudaKernels.h"
+#include "openmm/common/CommonKernels.h"

 namespace OpenMM {

@@ -98,8 +99,8 @@ private:
 class CudaParallelCalcHarmonicBondForceKernel : public CalcHarmonicBondForceKernel {
 public:
    CudaParallelCalcHarmonicBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system);
-    CudaCalcHarmonicBondForceKernel& getKernel(int index) {
-        return dynamic_cast<CudaCalcHarmonicBondForceKernel&>(kernels[index].getImpl());
+    CommonCalcHarmonicBondForceKernel& getKernel(int index) {
+        return dynamic_cast<CommonCalcHarmonicBondForceKernel&>(kernels[index].getImpl());
    }
    /**
     * Initialize the kernel.
@@ -136,8 +137,8 @@ private:
 class CudaParallelCalcCustomBondForceKernel : public CalcCustomBondForceKernel {
 public:
    CudaParallelCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system);
-    CudaCalcCustomBondForceKernel& getKernel(int index) {
-        return dynamic_cast<CudaCalcCustomBondForceKernel&>(kernels[index].getImpl());
+    CommonCalcCustomBondForceKernel& getKernel(int index) {
+        return dynamic_cast<CommonCalcCustomBondForceKernel&>(kernels[index].getImpl());
    }
    /**
     * Initialize the kernel.
@@ -174,8 +175,8 @@ private:
 class CudaParallelCalcHarmonicAngleForceKernel : public CalcHarmonicAngleForceKernel {
 public:
    CudaParallelCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system);
-    CudaCalcHarmonicAngleForceKernel& getKernel(int index) {
-        return dynamic_cast<CudaCalcHarmonicAngleForceKernel&>(kernels[index].getImpl());
+    CommonCalcHarmonicAngleForceKernel& getKernel(int index) {
+        return dynamic_cast<CommonCalcHarmonicAngleForceKernel&>(kernels[index].getImpl());
    }
    /**
     * Initialize the kernel.
@@ -212,8 +213,8 @@ private:
 class CudaParallelCalcCustomAngleForceKernel : public CalcCustomAngleForceKernel {
 public:
    CudaParallelCalcCustomAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system);
-    CudaCalcCustomAngleForceKernel& getKernel(int index) {
-        return dynamic_cast<CudaCalcCustomAngleForceKernel&>(kernels[index].getImpl());
+    CommonCalcCustomAngleForceKernel& getKernel(int index) {
+        return dynamic_cast<CommonCalcCustomAngleForceKernel&>(kernels[index].getImpl());
    }
    /**
     * Initialize the kernel.
@@ -250,8 +251,8 @@ private:
 class CudaParallelCalcPeriodicTorsionForceKernel : public CalcPeriodicTorsionForceKernel {
 public:
    CudaParallelCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system);
-    CudaCalcPeriodicTorsionForceKernel& getKernel(int index) {
-        return dynamic_cast<CudaCalcPeriodicTorsionForceKernel&>(kernels[index].getImpl());
+    CommonCalcPeriodicTorsionForceKernel& getKernel(int index) {
+        return dynamic_cast<CommonCalcPeriodicTorsionForceKernel&>(kernels[index].getImpl());
    }
    /**
     * Initialize the kernel.
@@ -288,8 +289,8 @@ private:
 class CudaParallelCalcRBTorsionForceKernel : public CalcRBTorsionForceKernel {
 public:
    CudaParallelCalcRBTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system);
-    CudaCalcRBTorsionForceKernel& getKernel(int index) {
-        return dynamic_cast<CudaCalcRBTorsionForceKernel&>(kernels[index].getImpl());
+    CommonCalcRBTorsionForceKernel& getKernel(int index) {
+        return dynamic_cast<CommonCalcRBTorsionForceKernel&>(kernels[index].getImpl());
    }
    /**
     * Initialize the kernel.
@@ -326,8 +327,8 @@ private:
 class CudaParallelCalcCMAPTorsionForceKernel : public CalcCMAPTorsionForceKernel {
 public:
    CudaParallelCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system);
-    CudaCalcCMAPTorsionForceKernel& getKernel(int index) {
-        return dynamic_cast<CudaCalcCMAPTorsionForceKernel&>(kernels[index].getImpl());
+    CommonCalcCMAPTorsionForceKernel& getKernel(int index) {
+        return dynamic_cast<CommonCalcCMAPTorsionForceKernel&>(kernels[index].getImpl());
    }
    /**
     * Initialize the kernel.
@@ -364,8 +365,8 @@ private:
 class CudaParallelCalcCustomTorsionForceKernel : public CalcCustomTorsionForceKernel {
 public:
    CudaParallelCalcCustomTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system);
-    CudaCalcCustomTorsionForceKernel& getKernel(int index) {
-        return dynamic_cast<CudaCalcCustomTorsionForceKernel&>(kernels[index].getImpl());
+    CommonCalcCustomTorsionForceKernel& getKernel(int index) {
+        return dynamic_cast<CommonCalcCustomTorsionForceKernel&>(kernels[index].getImpl());
    }
    /**
     * Initialize the kernel.
@@ -460,8 +461,8 @@ private:
 class CudaParallelCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
 public:
    CudaParallelCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system);
-    CudaCalcCustomNonbondedForceKernel& getKernel(int index) {
-        return dynamic_cast<CudaCalcCustomNonbondedForceKernel&>(kernels[index].getImpl());
+    CommonCalcCustomNonbondedForceKernel& getKernel(int index) {
+        return dynamic_cast<CommonCalcCustomNonbondedForceKernel&>(kernels[index].getImpl());
    }
    /**
     * Initialize the kernel.
@@ -498,8 +499,8 @@ private:
 class CudaParallelCalcCustomExternalForceKernel : public CalcCustomExternalForceKernel {
 public:
    CudaParallelCalcCustomExternalForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system);
-    CudaCalcCustomExternalForceKernel& getKernel(int index) {
-        return dynamic_cast<CudaCalcCustomExternalForceKernel&>(kernels[index].getImpl());
+    CommonCalcCustomExternalForceKernel& getKernel(int index) {
+        return dynamic_cast<CommonCalcCustomExternalForceKernel&>(kernels[index].getImpl());
    }
    /**
     * Initialize the kernel.
@@ -536,8 +537,8 @@ private:
 class CudaParallelCalcCustomHbondForceKernel : public CalcCustomHbondForceKernel {
 public:
    CudaParallelCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system);
-    CudaCalcCustomHbondForceKernel& getKernel(int index) {
-        return dynamic_cast<CudaCalcCustomHbondForceKernel&>(kernels[index].getImpl());
+    CommonCalcCustomHbondForceKernel& getKernel(int index) {
+        return dynamic_cast<CommonCalcCustomHbondForceKernel&>(kernels[index].getImpl());
    }
    /**
     * Initialize the kernel.
@@ -574,8 +575,8 @@ private:
 class CudaParallelCalcCustomCompoundBondForceKernel : public CalcCustomCompoundBondForceKernel {
 public:
    CudaParallelCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system);
-    CudaCalcCustomCompoundBondForceKernel& getKernel(int index) {
-        return dynamic_cast<CudaCalcCustomCompoundBondForceKernel&>(kernels[index].getImpl());
+    CommonCalcCustomCompoundBondForceKernel& getKernel(int index) {
+        return dynamic_cast<CommonCalcCustomCompoundBondForceKernel&>(kernels[index].getImpl());
    }
    /**
     * Initialize the kernel.

--- a/platforms/cuda/include/CudaParameterSet.h
+++ b/platforms/cuda/include/CudaParameterSet.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -29,18 +29,18 @@

 #include "CudaContext.h"
 #include "CudaNonbondedUtilities.h"
+#include "openmm/common/ComputeParameterSet.h"

 namespace OpenMM {

 class CudaNonbondedUtilities;

 /**
- * This class represents a set of floating point parameter values for a set of objects (particles, bonds, etc.).
- * It automatically creates an appropriate set of device buffers to hold the parameter values, based
- * on the number of parameters required.
+ * This class exists for backward compatibility.  For most purposes you can use
+ * ComputeParameterSet directly instead.
 */

-class OPENMM_EXPORT_CUDA CudaParameterSet {
+class OPENMM_EXPORT_COMMON CudaParameterSet : public ComputeParameterSet {
 public:
    /**
     * Create an CudaParameterSet.
@@ -54,33 +54,6 @@ public:
     * @param useDoublePrecision  whether values should be stored as single or double precision
     */
    CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false, bool useDoublePrecision=false);
-    ~CudaParameterSet();
-    /**
-     * Get the number of parameters.
-     */
-    int getNumParameters() const {
-        return numParameters;
-    }
-    /**
-     * Get the number of objects.
-     */
-    int getNumObjects() const {
-        return numObjects;
-    }
-    /**
-     * Get the values of all parameters.
-     *
-     * @param values on exit, values[i][j] contains the value of parameter j for object i
-     */
-    template <class T>
-    void getParameterValues(std::vector<std::vector<T> >& values);
-    /**
-     * Set the values of all parameters.
-     *
-     * @param values values[i][j] contains the value of parameter j for object i
-     */
-    template <class T>
-    void setParameterValues(const std::vector<std::vector<T> >& values);
    /**
     * Get a set of CudaNonbondedUtilities::ParameterInfo objects which describe the Buffers
     * containing the data.
@@ -88,18 +61,7 @@ public:
    std::vector<CudaNonbondedUtilities::ParameterInfo>& getBuffers() {
        return buffers;
    }
-    /**
-     * Get a suffix to add to variable names when accessing a certain parameter.
-     *
-     * @param index         the index of the parameter
-     * @param extraSuffix   an extra suffix to add to the variable name
-     * @return the suffix to append
-     */
-    std::string getParameterSuffix(int index, const std::string& extraSuffix = "") const;
 private:
-    CudaContext& context;
-    int numParameters, numObjects, elementSize;
-    std::string name;
    std::vector<CudaNonbondedUtilities::ParameterInfo> buffers;
 };