Merge https://github.com/openmm/openmm

5a06df78 · tic20 · 8dd60914 · a9223eea · 5a06df78 · 5a06df78
Commit 5a06df78 authored Mar 04, 2020 by tic20
20 changed files
--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -30,7 +30,7 @@
 #include "openmm/Platform.h"
 #include "openmm/System.h"
 #include "openmm/internal/ThreadPool.h"
-#include "windowsExportCuda.h"
+#include "openmm/common/windowsExportCommon.h"

 namespace OpenMM {
    
@@ -40,7 +40,7 @@ class CudaContext;
 * This Platform subclass uses CUDA implementations of the OpenMM kernels.
 */

-class OPENMM_EXPORT_CUDA CudaPlatform : public Platform {
+class OPENMM_EXPORT_COMMON CudaPlatform : public Platform {
 public:
    class PlatformData;
    CudaPlatform();
@@ -127,7 +127,7 @@ public:
    }
 };

-class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
+class OPENMM_EXPORT_COMMON CudaPlatform::PlatformData {
 public:
    PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
            const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty,

--- a/platforms/cuda/include/CudaProgram.h
+++ b/platforms/cuda/include/CudaProgram.h
+#ifndef OPENMM_CUDAPROGRAM_H_
+#define OPENMM_CUDAPROGRAM_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/common/ComputeProgram.h"
+#include "CudaContext.h"
+
+namespace OpenMM {
+
+/**
+ * This is the CUDA implementation of the ComputeProgramImpl interface. 
+ */
+
+class CudaProgram : public ComputeProgramImpl {
+public:
+    /**
+     * Create a new CudaProgram.
+     * 
+     * @param context      the context this program belongs to
+     * @param module       the compiled module
+     */
+    CudaProgram(CudaContext& context, CUmodule module);
+    /**
+     * Create a ComputeKernel for one of the kernels in this program.
+     * 
+     * @param name    the name of the kernel to get
+     * @return a ComputeKernel for the named kernel
+     */
+    ComputeKernel createKernel(const std::string& name);
+private:
+    CudaContext& context;
+    CUmodule module;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CUDAPROGRAM_H_*/
--- a/platforms/cuda/include/CudaSort.h
+++ b/platforms/cuda/include/CudaSort.h
@@ -28,7 +28,7 @@
 * -------------------------------------------------------------------------- */

 #include "CudaArray.h"
-#include "windowsExportCuda.h"
+#include "openmm/common/windowsExportCommon.h"
 #include "CudaContext.h"

 namespace OpenMM {
@@ -66,7 +66,7 @@ namespace OpenMM {
 * elements).
 */
    
-class OPENMM_EXPORT_CUDA CudaSort {
+class OPENMM_EXPORT_COMMON CudaSort {
 public:
    class SortTrait;
    /**

--- a/platforms/cuda/include/windowsExportCuda.h
+++ b/platforms/cuda/include/windowsExportCuda.h
-#ifndef OPENMM_WINDOWSEXPORTCUDA_H_
-#define OPENMM_WINDOWSEXPORTCUDA_H_
-
-/*
- * Shared libraries are messy in Visual Studio. We have to distinguish three
- * cases:
- *   (1) this header is being used to build the OpenMM shared library
- *       (dllexport)
- *   (2) this header is being used by a *client* of the OpenMM shared
- *       library (dllimport)
- *   (3) we are building the OpenMM static library, or the client is
- *       being compiled with the expectation of linking with the
- *       OpenMM static library (nothing special needed)
- * In the CMake script for building this library, we define one of the symbols
- *     OPENMM_CUDA_BUILDING_{SHARED|STATIC}_LIBRARY
- * Client code normally has no special symbol defined, in which case we'll
- * assume it wants to use the shared library. However, if the client defines
- * the symbol OPENMM_USE_STATIC_LIBRARIES we'll suppress the dllimport so
- * that the client code can be linked with static libraries. Note that
- * the client symbol is not library dependent, while the library symbols
- * affect only the OpenMM library, meaning that other libraries can
- * be clients of this one. However, we are assuming all-static or all-shared.
- */
-
-#ifdef _MSC_VER
-    // We don't want to hear about how sprintf is "unsafe".
-    #pragma warning(disable:4996)
-    // Keep MS VC++ quiet about lack of dll export of private members.
-    #pragma warning(disable:4251)
-    #if defined(OPENMM_CUDA_BUILDING_SHARED_LIBRARY)
-        #define OPENMM_EXPORT_CUDA __declspec(dllexport)
-    #elif defined(OPENMM_CUDA_BUILDING_STATIC_LIBRARY) || defined(OPENMM_CUDA_USE_STATIC_LIBRARIES)
-        #define OPENMM_EXPORT_CUDA
-    #else
-        #define OPENMM_EXPORT_CUDA __declspec(dllimport)   // i.e., a client of a shared library
-    #endif
-#else
-    #define OPENMM_EXPORT_CUDA // Linux, Mac
-#endif
-
-#endif // OPENMM_WINDOWSEXPORTCUDA_H_
--- a/platforms/cuda/sharedTarget/CMakeLists.txt
+++ b/platforms/cuda/sharedTarget/CMakeLists.txt
@@ -4,17 +4,18 @@
 INCLUDE(FindCUDA)
 INCLUDE_DIRECTORIES(${CUDA_TOOLKIT_INCLUDE})

-FILE(GLOB CUDA_KERNELS ${CUDA_SOURCE_DIR}/kernels/*.cu)
-ADD_CUSTOM_COMMAND(OUTPUT ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H}
+FILE(GLOB CUDA_KERNELS ${KERNEL_SOURCE_DIR}/kernels/*.cu)
+ADD_CUSTOM_COMMAND(OUTPUT ${KERNELS_CPP} ${KERNELS_H}
    COMMAND ${CMAKE_COMMAND}
-    ARGS -D CUDA_SOURCE_DIR=${CUDA_SOURCE_DIR} -D CUDA_KERNELS_CPP=${CUDA_KERNELS_CPP} -D CUDA_KERNELS_H=${CUDA_KERNELS_H} -D CUDA_SOURCE_CLASS=${CUDA_SOURCE_CLASS} -P ${CMAKE_CURRENT_SOURCE_DIR}/../EncodeCUDAFiles.cmake
+    ARGS -D KERNEL_SOURCE_DIR=${KERNEL_SOURCE_DIR} -D KERNELS_CPP=${KERNELS_CPP} -D KERNELS_H=${KERNELS_H} -D KERNEL_SOURCE_CLASS=${KERNEL_SOURCE_CLASS} -D KERNEL_FILE_EXTENSION=cu -P ${CMAKE_SOURCE_DIR}/cmake_modules/EncodeKernelFiles.cmake
    DEPENDS ${CUDA_KERNELS}
 )
-SET_SOURCE_FILES_PROPERTIES(${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H} PROPERTIES GENERATED TRUE)
+SET_SOURCE_FILES_PROPERTIES(${KERNELS_CPP} ${KERNELS_H} ${COMMON_KERNELS_CPP} PROPERTIES GENERATED TRUE)
 ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
+ADD_DEPENDENCIES(${SHARED_TARGET} CommonKernels)

 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${CUDA_CUDA_LIBRARY} ${CUDA_cufft_LIBRARY} ${PTHREADS_LIB})
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CUDA_BUILDING_SHARED_LIBRARY")
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_COMMON_BUILDING_SHARED_LIBRARY")
 IF (APPLE)
    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
 ELSE (APPLE)

--- a/platforms/cuda/src/CudaArray.cpp
+++ b/platforms/cuda/src/CudaArray.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2012-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2012-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -51,10 +51,10 @@ CudaArray::~CudaArray() {
    }
 }

-void CudaArray::initialize(CudaContext& context, int size, int elementSize, const std::string& name) {
+void CudaArray::initialize(ComputeContext& context, int size, int elementSize, const std::string& name) {
    if (this->pointer != 0)
        throw OpenMMException("CudaArray has already been initialized");
-    this->context = &context;
+    this->context = &dynamic_cast<CudaContext&>(context);
    this->size = size;
    this->elementSize = elementSize;
    this->name = name;
@@ -82,6 +82,10 @@ void CudaArray::resize(int size) {
    initialize(*context, size, elementSize, name);
 }

+ComputeContext& CudaArray::getContext() { // Get the context this array was initialized with, as a ComputeContext reference.
+    return *context; // context is the CudaContext pointer stored by initialize()
+}
+
 void CudaArray::upload(const void* data, bool blocking) {
    if (pointer == 0)
        throw OpenMMException("CudaArray has not been initialized");
@@ -112,12 +116,13 @@ void CudaArray::download(void* data, bool blocking) const {
    }
 }

-void CudaArray::copyTo(CudaArray& dest) const {
+void CudaArray::copyTo(ArrayInterface& dest) const {
    if (pointer == 0)
        throw OpenMMException("CudaArray has not been initialized");
    if (dest.getSize() != size || dest.getElementSize() != elementSize)
        throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
-    CUresult result = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, context->getCurrentStream());
+    CudaArray& cuDest = context->unwrap(dest);
+    CUresult result = cuMemcpyDtoDAsync(cuDest.getDevicePointer(), pointer, size*elementSize, context->getCurrentStream());
    if (result != CUDA_SUCCESS) {
        std::stringstream str;
        str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";

--- a/platforms/cuda/src/CudaBondedUtilities.cpp
+++ b/platforms/cuda/src/CudaBondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -25,6 +25,7 @@
 * -------------------------------------------------------------------------- */

 #include "CudaBondedUtilities.h"
+#include "CudaContext.h"
 #include "CudaExpressionUtilities.h"
 #include "CudaKernelSources.h"
 #include "openmm/OpenMMException.h"
@@ -52,6 +53,10 @@ string CudaBondedUtilities::addArgument(CUdeviceptr data, const string& type) {
    return "customArg"+context.intToString(arguments.size());
 }

+string CudaBondedUtilities::addArgument(ArrayInterface& data, const string& type) { // Overload for ArrayInterface arguments; returns whatever the CUdeviceptr overload returns.
+    return addArgument(context.unwrap(data).getDevicePointer(), type); // unwrap to the CudaArray and forward its device pointer
+}
+
 string CudaBondedUtilities::addEnergyParameterDerivative(const string& param) {
    // See if the parameter has already been added.
    

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -31,14 +31,14 @@
 #include "CudaContext.h"
 #include "CudaArray.h"
 #include "CudaBondedUtilities.h"
-#include "CudaForceInfo.h"
+#include "CudaEvent.h"
 #include "CudaIntegrationUtilities.h"
 #include "CudaKernels.h"
 #include "CudaKernelSources.h"
 #include "CudaNonbondedUtilities.h"
+#include "CudaProgram.h"
+#include "openmm/common/ComputeArray.h"
 #include "SHA1.h"
-#include "hilbert.h"
-#include "openmm/OpenMMException.h"
 #include "openmm/Platform.h"
 #include "openmm/System.h"
 #include "openmm/VirtualSite.h"
@@ -106,9 +106,9 @@ static int executeInWindows(const string &command) {
 #endif

 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
-        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : system(system), currentStream(0),
-        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasAssignedPosqCharges(false),
-        hasCompilerKernel(false), isNvccAvailable(false), pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
+        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : ComputeContext(system), currentStream(0),
+        platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
+        hasCompilerKernel(false), isNvccAvailable(false), pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL) {
    // Determine what compiler to use.
    
    this->compiler = "\""+compiler+"\"";
@@ -218,7 +218,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    }

    int major, minor;
-    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
+    CHECK_RESULT(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
+    CHECK_RESULT(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
    int numThreadBlocksPerComputeUnit = (major == 6 ? 4 : 6);
    if (cudaDriverVersion < 7000) {
        // This is a workaround to support GTX 980 with CUDA 6.5.  It reports
@@ -257,7 +258,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
-    if (computeCapability >= 7.0) {
+    if (cudaDriverVersion >= 9000) {
        compilationDefines["SYNC_WARPS"] = "__syncwarp();";
        compilationDefines["SHFL(var, srcLane)"] = "__shfl_sync(0xffffffff, var, srcLane);";
        compilationDefines["BALLOT(var)"] = "__ballot_sync(0xffffffff, var);";
@@ -300,7 +301,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["make_mixed3"] = "make_float3";
        compilationDefines["make_mixed4"] = "make_float4";
    }
-    posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));
+    force.initialize<long long>(*this, paddedNumAtoms*3, "force");
+    posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
    atomIndexDevice.initialize<int>(*this, paddedNumAtoms, "atomIndex");
    atomIndex.resize(paddedNumAtoms);
    for (int i = 0; i < paddedNumAtoms; ++i)
@@ -396,10 +398,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
            "pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
    }

-    // Create the work thread used for parallelization when running on multiple devices.
-
-    thread = new WorkThread();
-
    // Create utilities objects.

    bonded = new CudaBondedUtilities(*this);
@@ -428,8 +426,6 @@ CudaContext::~CudaContext() {
        delete bonded;
    if (nonbonded != NULL)
        delete nonbonded;
-    if (thread != NULL)
-        delete thread;
    string errorMessage = "Error deleting Context";
    if (contextIsValid && !isLinkedContext) {
        cuProfilerStop();
@@ -469,7 +465,6 @@ void CudaContext::initialize() {
    }
    velm.upload(pinnedBuffer);
    bonded->initialize(system);
-    force.initialize<long long>(*this, paddedNumAtoms*3, "force");
    addAutoclearBuffer(force.getDevicePointer(), force.getSize()*force.getElementSize());
    addAutoclearBuffer(energyBuffer.getDevicePointer(), energyBuffer.getSize()*energyBuffer.getElementSize());
    int numEnergyParamDerivs = energyParamDerivNames.size();
@@ -484,12 +479,8 @@ void CudaContext::initialize() {
    nonbonded->initialize(system);
 }

-void CudaContext::addForce(CudaForceInfo* force) {
-    forces.push_back(force);
-}
-
-vector<CudaForceInfo*>& CudaContext::getForceInfos() {
-    return forces;
+void CudaContext::initializeContexts() {
+    getPlatformData().initializeContexts(system);
 }

 void CudaContext::setAsCurrent() {
@@ -497,38 +488,6 @@ void CudaContext::setAsCurrent() {
        cuCtxSetCurrent(context);
 }

-string CudaContext::replaceStrings(const string& input, const std::map<std::string, std::string>& replacements) const {
-    static set<char> symbolChars;
-    if (symbolChars.size() == 0) {
-        symbolChars.insert('_');
-        for (char c = 'a'; c <= 'z'; c++)
-            symbolChars.insert(c);
-        for (char c = 'A'; c <= 'Z'; c++)
-            symbolChars.insert(c);
-        for (char c = '0'; c <= '9'; c++)
-            symbolChars.insert(c);
-    }
-    string result = input;
-    for (auto& pair : replacements) {
-        int index = 0;
-        int size = pair.first.size();
-        do {
-            index = result.find(pair.first, index);
-            if (index != result.npos) {
-                if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
-                    // We have found a complete symbol, not part of a longer symbol.
-
-                    result.replace(index, size, pair.second);
-                    index += pair.second.size();
-                }
-                else
-                    index++;
-            }
-        } while (index != result.npos);
-    }
-    return result;
-}
-
 CUmodule CudaContext::createModule(const string source, const char* optimizationFlags) {
    return createModule(source, map<string, string>(), optimizationFlags);
 }
@@ -572,6 +531,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
        src << "typedef float4 mixed4;\n";
    }
    src << "typedef unsigned int tileflags;\n";
+    src << CudaKernelSources::common << endl;
    for (auto& pair : defines) {
        src << "#define " << pair.first;
        if (!pair.second.empty())
@@ -716,19 +676,29 @@ void CudaContext::restoreDefaultStream() {
    setCurrentStream(0);
 }

-string CudaContext::doubleToString(double value) const {
-    stringstream s;
-    s.precision(useDoublePrecision ? 16 : 8);
-    s << scientific << value;
-    if (!useDoublePrecision)
-        s << "f";
-    return s.str();
+CudaArray* CudaContext::createArray() { // Create a new, default-constructed CudaArray.
+    return new CudaArray(); // NOTE(review): no matching delete is visible here; presumably the caller takes ownership - confirm
 }

-string CudaContext::intToString(int value) const {
-    stringstream s;
-    s << value;
-    return s.str();
+ComputeEvent CudaContext::createEvent() { // Create a new CudaEvent constructed from this context.
+    return shared_ptr<ComputeEventImpl>(new CudaEvent(*this)); // returned through a shared_ptr to the ComputeEventImpl base
+}
+
+ComputeProgram CudaContext::compileProgram(const std::string source, const std::map<std::string, std::string>& defines) { // Compile source with the given defines and return it as a ComputeProgram.
+    CUmodule module = createModule(CudaKernelSources::vectorOps+source, defines); // CudaKernelSources::vectorOps is prepended to the caller's source before compiling
+    return shared_ptr<ComputeProgramImpl>(new CudaProgram(*this, module)); // wrap the compiled module in a CudaProgram tied to this context
+}
+
+CudaArray& CudaContext::unwrap(ArrayInterface& array) const { // Resolve an ArrayInterface to the CudaArray behind it; throws OpenMMException if there is none.
+    CudaArray* cuarray;
+    ComputeArray* wrapper = dynamic_cast<ComputeArray*>(&array);
+    if (wrapper != NULL)
+        cuarray = dynamic_cast<CudaArray*>(&wrapper->getArray()); // a ComputeArray wrapper: inspect the array it holds
+    else
+        cuarray = dynamic_cast<CudaArray*>(&array); // not a wrapper: the argument itself must be a CudaArray
+    if (cuarray == NULL)
+        throw OpenMMException("Array argument is not a CudaArray");
+    return *cuarray;
 }

 std::string CudaContext::getErrorString(CUresult result) {
@@ -763,8 +733,8 @@ int CudaContext::computeThreadBlockSize(double memory, bool preferShared) const
    return threads;
 }

-void CudaContext::clearBuffer(CudaArray& array) {
-    clearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize());
+void CudaContext::clearBuffer(ArrayInterface& array) {
+    clearBuffer(unwrap(array).getDevicePointer(), array.getSize()*array.getElementSize());
 }

 void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
@@ -773,8 +743,8 @@ void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
    executeKernel(clearBufferKernel, args, words, 128);
 }

-void CudaContext::addAutoclearBuffer(CudaArray& array) {
-    addAutoclearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize());
+void CudaContext::addAutoclearBuffer(ArrayInterface& array) {
+    addAutoclearBuffer(unwrap(array).getDevicePointer(), array.getSize()*array.getElementSize());
 }

 void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
@@ -855,523 +825,6 @@ bool CudaContext::requestPosqCharges() {
    return allow;
 }

-/**
- * This class ensures that atom reordering doesn't break virtual sites.
- */
-class CudaContext::VirtualSiteInfo : public CudaForceInfo {
-public:
-    VirtualSiteInfo(const System& system) {
-        for (int i = 0; i < system.getNumParticles(); i++) {
-            if (system.isVirtualSite(i)) {
-                const VirtualSite& vsite = system.getVirtualSite(i);
-                siteTypes.push_back(&typeid(vsite));
-                vector<int> particles;
-                particles.push_back(i);
-                for (int j = 0; j < vsite.getNumParticles(); j++)
-                    particles.push_back(vsite.getParticle(j));
-                siteParticles.push_back(particles);
-                vector<double> weights;
-                if (dynamic_cast<const TwoParticleAverageSite*>(&vsite) != NULL) {
-                    // A two particle average.
-
-                    const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(vsite);
-                    weights.push_back(site.getWeight(0));
-                    weights.push_back(site.getWeight(1));
-                }
-                else if (dynamic_cast<const ThreeParticleAverageSite*>(&vsite) != NULL) {
-                    // A three particle average.
-
-                    const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(vsite);
-                    weights.push_back(site.getWeight(0));
-                    weights.push_back(site.getWeight(1));
-                    weights.push_back(site.getWeight(2));
-                }
-                else if (dynamic_cast<const OutOfPlaneSite*>(&vsite) != NULL) {
-                    // An out of plane site.
-
-                    const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(vsite);
-                    weights.push_back(site.getWeight12());
-                    weights.push_back(site.getWeight13());
-                    weights.push_back(site.getWeightCross());
-                }
-                siteWeights.push_back(weights);
-            }
-        }
-    }
-    int getNumParticleGroups() {
-        return siteTypes.size();
-    }
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        particles = siteParticles[index];
-    }
-    bool areGroupsIdentical(int group1, int group2) {
-        if (siteTypes[group1] != siteTypes[group2])
-            return false;
-        int numParticles = siteWeights[group1].size();
-        if (siteWeights[group2].size() != numParticles)
-            return false;
-        for (int i = 0; i < numParticles; i++)
-            if (siteWeights[group1][i] != siteWeights[group2][i])
-                return false;
-        return true;
-    }
-private:
-    vector<const type_info*> siteTypes;
-    vector<vector<int> > siteParticles;
-    vector<vector<double> > siteWeights;
-};
-
-void CudaContext::findMoleculeGroups() {
-    // The first time this is called, we need to identify all the molecules in the system.
-
-    if (moleculeGroups.size() == 0) {
-        // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
-
-        addForce(new VirtualSiteInfo(system));
-
-        // First make a list of every other atom to which each atom is connect by a constraint or force group.
-
-        vector<vector<int> > atomBonds(system.getNumParticles());
-        for (int i = 0; i < system.getNumConstraints(); i++) {
-            int particle1, particle2;
-            double distance;
-            system.getConstraintParameters(i, particle1, particle2, distance);
-            atomBonds[particle1].push_back(particle2);
-            atomBonds[particle2].push_back(particle1);
-        }
-        for (auto force : forces) {
-            for (int j = 0; j < force->getNumParticleGroups(); j++) {
-                vector<int> particles;
-                force->getParticlesInGroup(j, particles);
-                for (int k = 0; k < (int) particles.size(); k++)
-                    for (int m = 0; m < (int) particles.size(); m++)
-                        if (k != m)
-                            atomBonds[particles[k]].push_back(particles[m]);
-            }
-        }
-
-        // Now identify atoms by which molecule they belong to.
-
-        vector<vector<int> > atomIndices = ContextImpl::findMolecules(numAtoms, atomBonds);
-        int numMolecules = atomIndices.size();
-        vector<int> atomMolecule(numAtoms);
-        for (int i = 0; i < (int) atomIndices.size(); i++)
-            for (int j = 0; j < (int) atomIndices[i].size(); j++)
-                atomMolecule[atomIndices[i][j]] = i;
-
-        // Construct a description of each molecule.
-
-        molecules.resize(numMolecules);
-        for (int i = 0; i < numMolecules; i++) {
-            molecules[i].atoms = atomIndices[i];
-            molecules[i].groups.resize(forces.size());
-        }
-        for (int i = 0; i < system.getNumConstraints(); i++) {
-            int particle1, particle2;
-            double distance;
-            system.getConstraintParameters(i, particle1, particle2, distance);
-            molecules[atomMolecule[particle1]].constraints.push_back(i);
-        }
-        for (int i = 0; i < (int) forces.size(); i++)
-            for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
-                vector<int> particles;
-                forces[i]->getParticlesInGroup(j, particles);
-                if (particles.size() > 0)
-                    molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
-            }
-    }
-
-    // Sort them into groups of identical molecules.
-
-    vector<Molecule> uniqueMolecules;
-    vector<vector<int> > moleculeInstances;
-    vector<vector<int> > moleculeOffsets;
-    for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
-        Molecule& mol = molecules[molIndex];
-
-        // See if it is identical to another molecule.
-
-        bool isNew = true;
-        for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
-            Molecule& mol2 = uniqueMolecules[j];
-            bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
-
-            // See if the atoms are identical.
-
-            int atomOffset = mol2.atoms[0]-mol.atoms[0];
-            for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
-                if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
-                    identical = false;
-                for (int k = 0; k < (int) forces.size(); k++)
-                    if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
-                        identical = false;
-            }
-
-            // See if the constraints are identical.
-
-            for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
-                int c1particle1, c1particle2, c2particle1, c2particle2;
-                double distance1, distance2;
-                system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
-                system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
-                if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
-                    identical = false;
-            }
-
-            // See if the force groups are identical.
-
-            for (int i = 0; i < (int) forces.size() && identical; i++) {
-                if (mol.groups[i].size() != mol2.groups[i].size())
-                    identical = false;
-                for (int k = 0; k < (int) mol.groups[i].size() && identical; k++) {
-                    if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
-                        identical = false;
-                    vector<int> p1, p2;
-                    forces[i]->getParticlesInGroup(mol.groups[i][k], p1);
-                    forces[i]->getParticlesInGroup(mol2.groups[i][k], p2);
-                    for (int m = 0; m < p1.size(); m++)
-                        if (p1[m] != p2[m]-atomOffset)
-                            identical = false;
-                }
-            }
-            if (identical) {
-                moleculeInstances[j].push_back(molIndex);
-                moleculeOffsets[j].push_back(mol.atoms[0]);
-                isNew = false;
-            }
-        }
-        if (isNew) {
-            uniqueMolecules.push_back(mol);
-            moleculeInstances.push_back(vector<int>());
-            moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
-            moleculeOffsets.push_back(vector<int>());
-            moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
-        }
-    }
-    moleculeGroups.resize(moleculeInstances.size());
-    for (int i = 0; i < (int) moleculeInstances.size(); i++)
-    {
-        moleculeGroups[i].instances = moleculeInstances[i];
-        moleculeGroups[i].offsets = moleculeOffsets[i];
-        vector<int>& atoms = uniqueMolecules[i].atoms;
-        moleculeGroups[i].atoms.resize(atoms.size());
-        for (int j = 0; j < (int) atoms.size(); j++)
-            moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
-    }
-}
-
-void CudaContext::invalidateMolecules() {
-    for (int i = 0; i < forces.size(); i++)
-        if (invalidateMolecules(forces[i]))
-            return;
-}
-
-bool CudaContext::invalidateMolecules(CudaForceInfo* force) {
-    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
-        return false;
-    bool valid = true;
-    int forceIndex = -1;
-    for (int i = 0; i < forces.size(); i++)
-        if (forces[i] == force)
-            forceIndex = i;
-    getPlatformData().threads.execute([&] (ThreadPool& threads, int threadIndex) {
-        for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
-            MoleculeGroup& mol = moleculeGroups[group];
-            vector<int>& instances = mol.instances;
-            vector<int>& offsets = mol.offsets;
-            vector<int>& atoms = mol.atoms;
-            int numMolecules = instances.size();
-            Molecule& m1 = molecules[instances[0]];
-            int offset1 = offsets[0];
-            int numThreads = threads.getNumThreads();
-            int start = max(1, threadIndex*numMolecules/numThreads);
-            int end = (threadIndex+1)*numMolecules/numThreads;
-            for (int j = start; j < end; j++) {
-                // See if the atoms are identical.
-
-                Molecule& m2 = molecules[instances[j]];
-                int offset2 = offsets[j];
-                for (int i = 0; i < (int) atoms.size() && valid; i++) {
-                    if (!force->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
-                        valid = false;
-                }
-
-                // See if the force groups are identical.
-
-                if (valid && forceIndex > -1) {
-                    for (int k = 0; k < (int) m1.groups[forceIndex].size() && valid; k++)
-                        if (!force->areGroupsIdentical(m1.groups[forceIndex][k], m2.groups[forceIndex][k]))
-                            valid = false;
-                }
-            }
-        }
-    });
-    getPlatformData().threads.waitForThreads();
-    if (valid)
-        return false;
-
-    // The list of which molecules are identical is no longer valid.  We need to restore the
-    // atoms to their original order, rebuild the list of identical molecules, and sort them
-    // again.
-
-    vector<int4> newCellOffsets(numAtoms);
-    if (useDoublePrecision) {
-        vector<double4> oldPosq(paddedNumAtoms);
-        vector<double4> newPosq(paddedNumAtoms, make_double4(0, 0, 0, 0));
-        vector<double4> oldVelm(paddedNumAtoms);
-        vector<double4> newVelm(paddedNumAtoms, make_double4(0, 0, 0, 0));
-        posq.download(oldPosq);
-        velm.download(oldVelm);
-        for (int i = 0; i < numAtoms; i++) {
-            int index = atomIndex[i];
-            newPosq[index] = oldPosq[i];
-            newVelm[index] = oldVelm[i];
-            newCellOffsets[index] = posCellOffsets[i];
-        }
-        posq.upload(newPosq);
-        velm.upload(newVelm);
-    }
-    else if (useMixedPrecision) {
-        vector<float4> oldPosq(paddedNumAtoms);
-        vector<float4> newPosq(paddedNumAtoms, make_float4(0, 0, 0, 0));
-        vector<float4> oldPosqCorrection(paddedNumAtoms);
-        vector<float4> newPosqCorrection(paddedNumAtoms, make_float4(0, 0, 0, 0));
-        vector<double4> oldVelm(paddedNumAtoms);
-        vector<double4> newVelm(paddedNumAtoms, make_double4(0, 0, 0, 0));
-        posq.download(oldPosq);
-        velm.download(oldVelm);
-        for (int i = 0; i < numAtoms; i++) {
-            int index = atomIndex[i];
-            newPosq[index] = oldPosq[i];
-            newPosqCorrection[index] = oldPosqCorrection[i];
-            newVelm[index] = oldVelm[i];
-            newCellOffsets[index] = posCellOffsets[i];
-        }
-        posq.upload(newPosq);
-        posqCorrection.upload(newPosqCorrection);
-        velm.upload(newVelm);
-    }
-    else {
-        vector<float4> oldPosq(paddedNumAtoms);
-        vector<float4> newPosq(paddedNumAtoms, make_float4(0, 0, 0, 0));
-        vector<float4> oldVelm(paddedNumAtoms);
-        vector<float4> newVelm(paddedNumAtoms, make_float4(0, 0, 0, 0));
-        posq.download(oldPosq);
-        velm.download(oldVelm);
-        for (int i = 0; i < numAtoms; i++) {
-            int index = atomIndex[i];
-            newPosq[index] = oldPosq[i];
-            newVelm[index] = oldVelm[i];
-            newCellOffsets[index] = posCellOffsets[i];
-        }
-        posq.upload(newPosq);
-        velm.upload(newVelm);
-    }
-    for (int i = 0; i < numAtoms; i++) {
-        atomIndex[i] = i;
-        posCellOffsets[i] = newCellOffsets[i];
-    }
-    atomIndexDevice.upload(atomIndex);
-    findMoleculeGroups();
-    for (auto listener : reorderListeners)
-        listener->execute();
-    reorderAtoms();
-    return true;
-}
-
-void CudaContext::reorderAtoms() {
-    atomsWereReordered = false;
-    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff() || stepsSinceReorder < 250) {
-        stepsSinceReorder++;
-        return;
-    }
-    atomsWereReordered = true;
-    stepsSinceReorder = 0;
-    if (useDoublePrecision)
-        reorderAtomsImpl<double, double4, double, double4>();
-    else if (useMixedPrecision)
-        reorderAtomsImpl<float, float4, double, double4>();
-    else
-        reorderAtomsImpl<float, float4, float, float4>();
-}
-
-template <class Real, class Real4, class Mixed, class Mixed4>
-void CudaContext::reorderAtomsImpl() {
-    // Find the range of positions and the number of bins along each axis.
-
-    Real4 padding = {0, 0, 0, 0};
-    vector<Real4> oldPosq(paddedNumAtoms, padding);
-    vector<Real4> oldPosqCorrection(paddedNumAtoms, padding);
-    Mixed4 paddingMixed = {0, 0, 0, 0};
-    vector<Mixed4> oldVelm(paddedNumAtoms, paddingMixed);
-    posq.download(oldPosq);
-    velm.download(oldVelm);
-    if (useMixedPrecision)
-        posqCorrection.download(oldPosqCorrection);
-    Real minx = oldPosq[0].x, maxx = oldPosq[0].x;
-    Real miny = oldPosq[0].y, maxy = oldPosq[0].y;
-    Real minz = oldPosq[0].z, maxz = oldPosq[0].z;
-    if (nonbonded->getUsePeriodic()) {
-        minx = miny = minz = 0.0;
-        maxx = periodicBoxSize.x;
-        maxy = periodicBoxSize.y;
-        maxz = periodicBoxSize.z;
-    }
-    else {
-        for (int i = 1; i < numAtoms; i++) {
-            const Real4& pos = oldPosq[i];
-            minx = min(minx, pos.x);
-            maxx = max(maxx, pos.x);
-            miny = min(miny, pos.y);
-            maxy = max(maxy, pos.y);
-            minz = min(minz, pos.z);
-            maxz = max(maxz, pos.z);
-        }
-    }
-
-    // Loop over each group of identical molecules and reorder them.
-
-    vector<int> originalIndex(numAtoms);
-    vector<Real4> newPosq(paddedNumAtoms);
-    vector<Real4> newPosqCorrection(paddedNumAtoms);
-    vector<Mixed4> newVelm(paddedNumAtoms);
-    vector<int4> newCellOffsets(numAtoms);
-    for (auto& mol : moleculeGroups) {
-        // Find the center of each molecule.
-
-        int numMolecules = mol.offsets.size();
-        vector<int>& atoms = mol.atoms;
-        vector<Real4> molPos(numMolecules);
-        Real invNumAtoms = (Real) (1.0/atoms.size());
-        for (int i = 0; i < numMolecules; i++) {
-            molPos[i].x = 0.0f;
-            molPos[i].y = 0.0f;
-            molPos[i].z = 0.0f;
-            for (int j = 0; j < (int)atoms.size(); j++) {
-                int atom = atoms[j]+mol.offsets[i];
-                const Real4& pos = oldPosq[atom];
-                molPos[i].x += pos.x;
-                molPos[i].y += pos.y;
-                molPos[i].z += pos.z;
-            }
-            molPos[i].x *= invNumAtoms;
-            molPos[i].y *= invNumAtoms;
-            molPos[i].z *= invNumAtoms;
-            if (molPos[i].x != molPos[i].x)
-                throw OpenMMException("Particle coordinate is nan");
-        }
-        if (nonbonded->getUsePeriodic()) {
-            // Move each molecule position into the same box.
-
-            for (int i = 0; i < numMolecules; i++) {
-                Real4 center = molPos[i];
-                int zcell = (int) floor(center.z*invPeriodicBoxSize.z);
-                center.x -= zcell*periodicBoxVecZ.x;
-                center.y -= zcell*periodicBoxVecZ.y;
-                center.z -= zcell*periodicBoxVecZ.z;
-                int ycell = (int) floor(center.y*invPeriodicBoxSize.y);
-                center.x -= ycell*periodicBoxVecY.x;
-                center.y -= ycell*periodicBoxVecY.y;
-                int xcell = (int) floor(center.x*invPeriodicBoxSize.x);
-                center.x -= xcell*periodicBoxVecX.x;
-                if (xcell != 0 || ycell != 0 || zcell != 0) {
-                    Real dx = molPos[i].x-center.x;
-                    Real dy = molPos[i].y-center.y;
-                    Real dz = molPos[i].z-center.z;
-                    molPos[i] = center;
-                    for (int j = 0; j < (int) atoms.size(); j++) {
-                        int atom = atoms[j]+mol.offsets[i];
-                        Real4 p = oldPosq[atom];
-                        p.x -= dx;
-                        p.y -= dy;
-                        p.z -= dz;
-                        oldPosq[atom] = p;
-                        posCellOffsets[atom].x -= xcell;
-                        posCellOffsets[atom].y -= ycell;
-                        posCellOffsets[atom].z -= zcell;
-                    }
-                }
-            }
-        }
-
-        // Select a bin for each molecule, then sort them by bin.
-
-        bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
-        Real binWidth;
-        if (useHilbert)
-            binWidth = (Real) (max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
-        else
-            binWidth = (Real) (0.2*nonbonded->getMaxCutoffDistance());
-        Real invBinWidth = (Real) (1.0/binWidth);
-        int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
-        int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
-        vector<pair<int, int> > molBins(numMolecules);
-        bitmask_t coords[3];
-        for (int i = 0; i < numMolecules; i++) {
-            int x = (int) ((molPos[i].x-minx)*invBinWidth);
-            int y = (int) ((molPos[i].y-miny)*invBinWidth);
-            int z = (int) ((molPos[i].z-minz)*invBinWidth);
-            int bin;
-            if (useHilbert) {
-                coords[0] = x;
-                coords[1] = y;
-                coords[2] = z;
-                bin = (int) hilbert_c2i(3, 8, coords);
-            }
-            else {
-                int yodd = y&1;
-                int zodd = z&1;
-                bin = z*xbins*ybins;
-                bin += (zodd ? ybins-y : y)*xbins;
-                bin += (yodd ? xbins-x : x);
-            }
-            molBins[i] = pair<int, int>(bin, i);
-        }
-        sort(molBins.begin(), molBins.end());
-
-        // Reorder the atoms.
-
-        for (int i = 0; i < numMolecules; i++) {
-            for (int atom : atoms) {
-                int oldIndex = mol.offsets[molBins[i].second]+atom;
-                int newIndex = mol.offsets[i]+atom;
-                originalIndex[newIndex] = atomIndex[oldIndex];
-                newPosq[newIndex] = oldPosq[oldIndex];
-                if (useMixedPrecision)
-                    newPosqCorrection[newIndex] = oldPosqCorrection[oldIndex];
-                newVelm[newIndex] = oldVelm[oldIndex];
-                newCellOffsets[newIndex] = posCellOffsets[oldIndex];
-            }
-        }
-    }
-
-    // Update the streams.
-
-    for (int i = 0; i < numAtoms; i++) {
-        atomIndex[i] = originalIndex[i];
-        posCellOffsets[i] = newCellOffsets[i];
-    }
-    posq.upload(newPosq);
-    if (useMixedPrecision)
-        posqCorrection.upload(newPosqCorrection);
-    velm.upload(newVelm);
-    atomIndexDevice.upload(atomIndex);
-    for (auto listener : reorderListeners)
-        listener->execute();
-}
-
-void CudaContext::addReorderListener(ReorderListener* listener) {
-    reorderListeners.push_back(listener);
-}
-
-void CudaContext::addPreComputation(ForcePreComputation* computation) {
-    preComputations.push_back(computation);
-}
-
-void CudaContext::addPostComputation(ForcePostComputation* computation) {
-    postComputations.push_back(computation);
-}
-
 void CudaContext::addEnergyParameterDerivative(const string& param) {
    // See if this parameter has already been registered.
    
@@ -1381,90 +834,10 @@ void CudaContext::addEnergyParameterDerivative(const string& param) {
    energyParamDerivNames.push_back(param);
 }

-struct CudaContext::WorkThread::ThreadData {
-    ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting,  bool& finished,
-            pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :
-        tasks(tasks), waiting(waiting), finished(finished), queueLock(queueLock),
-        waitForTaskCondition(waitForTaskCondition), queueEmptyCondition(queueEmptyCondition) {
-    }
-    std::queue<CudaContext::WorkTask*>& tasks;
-    bool& waiting;
-    bool& finished;
-    pthread_mutex_t& queueLock;
-    pthread_cond_t& waitForTaskCondition;
-    pthread_cond_t& queueEmptyCondition;
-};
-
-static void* threadBody(void* args) {
-    CudaContext::WorkThread::ThreadData& data = *reinterpret_cast<CudaContext::WorkThread::ThreadData*>(args);
-    while (!data.finished || data.tasks.size() > 0) {
-        pthread_mutex_lock(&data.queueLock);
-        while (data.tasks.empty() && !data.finished) {
-            data.waiting = true;
-            pthread_cond_signal(&data.queueEmptyCondition);
-            pthread_cond_wait(&data.waitForTaskCondition, &data.queueLock);
-        }
-        CudaContext::WorkTask* task = NULL;
-        if (!data.tasks.empty()) {
-            data.waiting = false;
-            task = data.tasks.front();
-            data.tasks.pop();
-        }
-        pthread_mutex_unlock(&data.queueLock);
-        if (task != NULL) {
-            task->execute();
-            delete task;
-        }
-    }
-    data.waiting = true;
-    pthread_cond_signal(&data.queueEmptyCondition);
-    delete &data;
-    return 0;
-}
-
-CudaContext::WorkThread::WorkThread() : waiting(true), finished(false) {
-    pthread_mutex_init(&queueLock, NULL);
-    pthread_cond_init(&waitForTaskCondition, NULL);
-    pthread_cond_init(&queueEmptyCondition, NULL);
-    ThreadData* data = new ThreadData(tasks, waiting, finished, queueLock, waitForTaskCondition, queueEmptyCondition);
-    pthread_create(&thread, NULL, threadBody, data);
-}
-
-CudaContext::WorkThread::~WorkThread() {
-    pthread_mutex_lock(&queueLock);
-    finished = true;
-    pthread_cond_broadcast(&waitForTaskCondition);
-    pthread_mutex_unlock(&queueLock);
-    pthread_join(thread, NULL);
-    pthread_mutex_destroy(&queueLock);
-    pthread_cond_destroy(&waitForTaskCondition);
-    pthread_cond_destroy(&queueEmptyCondition);
+void CudaContext::flushQueue() {
+    cuStreamSynchronize(getCurrentStream()); // Synchronizes the stream returned by getCurrentStream(); the CUresult is not checked.
 }

-void CudaContext::WorkThread::addTask(CudaContext::WorkTask* task) {
-    pthread_mutex_lock(&queueLock);
-    tasks.push(task);
-    waiting = false;
-    pthread_cond_signal(&waitForTaskCondition);
-    pthread_mutex_unlock(&queueLock);
-}
-
-bool CudaContext::WorkThread::isWaiting() {
-    return waiting;
-}
-
-bool CudaContext::WorkThread::isFinished() {
-    return finished;
-}
-
-void CudaContext::WorkThread::flush() {
-    pthread_mutex_lock(&queueLock);
-    while (!waiting)
-       pthread_cond_wait(&queueEmptyCondition, &queueLock);
-    pthread_mutex_unlock(&queueLock);
-}
-
-
 vector<int> CudaContext::getDevicePrecedence() {
    int numDevices;
    CUdevice thisDevice;
@@ -1475,7 +848,8 @@ vector<int> CudaContext::getDevicePrecedence() {
    for (int i = 0; i < numDevices; i++) {
        CHECK_RESULT(cuDeviceGet(&thisDevice, i));
        int major, minor, clock, multiprocessors, speed;
-        CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, thisDevice));
+        CHECK_RESULT(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, thisDevice));
+        CHECK_RESULT(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, thisDevice));
        if (major == 1 && minor < 2)
            continue;


--- a/platforms/cuda/src/CudaEvent.cpp
+++ b/platforms/cuda/src/CudaEvent.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaEvent.h"
+#include "openmm/OpenMMException.h"
+
+using namespace OpenMM;
+
+CudaEvent::CudaEvent(CudaContext& context) : context(context), eventCreated(false) {
+    // The event is created with CU_EVENT_DISABLE_TIMING.  eventCreated records
+    // whether creation succeeded, which the destructor consults before destroying it.
+    const CUresult status = cuEventCreate(&event, CU_EVENT_DISABLE_TIMING);
+    eventCreated = (status == CUDA_SUCCESS);
+    if (!eventCreated)
+        throw OpenMMException("Error creating CUDA event:"+CudaContext::getErrorString(status));
+}
+
+CudaEvent::~CudaEvent() {
+    // Nothing to release unless cuEventCreate() succeeded in the constructor.
+    if (!eventCreated)
+        return;
+    cuEventDestroy(event);
+}
+
+void CudaEvent::enqueue() {
+    // Record the event on the stream the owning context is currently using
+    // instead of hard-coding the default stream (0).  CudaContext::flushQueue()
+    // synchronizes getCurrentStream(), so the event should follow the same
+    // stream; otherwise wait() may not cover work queued on a stream that is
+    // not the default one.  The CUresult is not checked, matching wait().
+    cuEventRecord(event, context.getCurrentStream());
+}
+
+// Calls cuEventSynchronize() on this event, which blocks the calling host
+// thread until the event recorded by enqueue() has completed.  The returned
+// CUresult is discarded, so a failed synchronization is not reported.
+void CudaEvent::wait() {
+    cuEventSynchronize(event);
+}
--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -25,18 +25,7 @@
 * -------------------------------------------------------------------------- */

 #include "CudaIntegrationUtilities.h"
-#include "CudaArray.h"
-#include "CudaKernelSources.h"
-#include "openmm/internal/OSRngSeed.h"
-#include "openmm/HarmonicAngleForce.h"
-#include "openmm/VirtualSite.h"
-#include "quern.h"
-#include "CudaExpressionUtilities.h"
-#include "ReferenceCCMAAlgorithm.h"
-#include <algorithm>
-#include <cmath>
-#include <cstdlib>
-#include <map>
+#include "CudaContext.h"

 using namespace OpenMM;
 using namespace std;
@@ -45,539 +34,39 @@ using namespace std;
 #define CHECK_RESULT2(result, prefix) \
    if (result != CUDA_SUCCESS) { \
        std::stringstream m; \
-        m<<prefix<<": "<<context.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        m<<prefix<<": "<<dynamic_cast<CudaContext&>(context).getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
        throw OpenMMException(m.str());\
    }

-struct CudaIntegrationUtilities::ShakeCluster {
-    int centralID;
-    int peripheralID[3];
-    int size;
-    bool valid;
-    double distance;
-    double centralInvMass, peripheralInvMass;
-    ShakeCluster() : valid(true) {
-    }
-    ShakeCluster(int centralID, double invMass) : centralID(centralID), centralInvMass(invMass), size(0), valid(true) {
-    }
-    void addAtom(int id, double dist, double invMass) {
-        if (size == 3 || (size > 0 && abs(dist-distance)/distance > 1e-8) || (size > 0 && abs(invMass-peripheralInvMass)/peripheralInvMass > 1e-8))
-            valid = false;
-        else {
-            peripheralID[size++] = id;
-            distance = dist;
-            peripheralInvMass = invMass;
-        }
-    }
-    void markInvalid(map<int, ShakeCluster>& allClusters, vector<bool>& invalidForShake)
-    {
-        valid = false;
-        invalidForShake[centralID] = true;
-        for (int i = 0; i < size; i++) {
-            invalidForShake[peripheralID[i]] = true;
-            map<int, ShakeCluster>::iterator otherCluster = allClusters.find(peripheralID[i]);
-            if (otherCluster != allClusters.end() && otherCluster->second.valid)
-                otherCluster->second.markInvalid(allClusters, invalidForShake);
-        }
-    }
-};
-
-struct CudaIntegrationUtilities::ConstraintOrderer : public binary_function<int, int, bool> {
-    const vector<int>& atom1;
-    const vector<int>& atom2;
-    const vector<int>& constraints;
-    ConstraintOrderer(const vector<int>& atom1, const vector<int>& atom2, const vector<int>& constraints) : atom1(atom1), atom2(atom2), constraints(constraints) {
-    }
-    bool operator()(int x, int y) {
-        int ix = constraints[x];
-        int iy = constraints[y];
-        if (atom1[ix] != atom1[iy])
-            return atom1[ix] < atom1[iy];
-        return atom2[ix] < atom2[iy];
-    }
-};
-
-CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const System& system) : context(context),
-        randomPos(0), ccmaConvergedMemory(NULL) {
-    // Create workspace arrays.
-
-    lastStepSize = make_double2(0.0, 0.0);
-    if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
-        posDelta.initialize<double4>(context, context.getPaddedNumAtoms(), "posDelta");
-        vector<double4> deltas(posDelta.getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
-        posDelta.upload(deltas);
-        stepSize.initialize<double2>(context, 1, "stepSize");
-        stepSize.upload(&lastStepSize);
-    }
-    else {
-        posDelta.initialize<float4>(context, context.getPaddedNumAtoms(), "posDelta");
-        vector<float4> deltas(posDelta.getSize(), make_float4(0.0f, 0.0f, 0.0f, 0.0f));
-        posDelta.upload(deltas);
-        stepSize.initialize<float2>(context, 1, "stepSize");
-        float2 lastStepSizeFloat = make_float2(0.0f, 0.0f);
-        stepSize.upload(&lastStepSizeFloat);
-    }
-
-    // Record the set of constraints and how many constraints each atom is involved in.
-
-    vector<int> atom1;
-    vector<int> atom2;
-    vector<double> distance;
-    vector<int> constraintCount(context.getNumAtoms(), 0);
-    for (int i = 0; i < system.getNumConstraints(); i++) {
-        int p1, p2;
-        double d;
-        system.getConstraintParameters(i, p1, p2, d);
-        if (system.getParticleMass(p1) != 0 || system.getParticleMass(p2) != 0) {
-            atom1.push_back(p1);
-            atom2.push_back(p2);
-            distance.push_back(d);
-            constraintCount[p1]++;
-            constraintCount[p2]++;
-        }
-    }
-
-    // Identify clusters of three atoms that can be treated with SETTLE.  First, for every
-    // atom that might be part of such a cluster, make a list of the two other atoms it is
-    // connected to.
-
-    int numAtoms = system.getNumParticles();
-    vector<map<int, float> > settleConstraints(numAtoms);
-    for (int i = 0; i < (int)atom1.size(); i++) {
-        if (constraintCount[atom1[i]] == 2 && constraintCount[atom2[i]] == 2) {
-            settleConstraints[atom1[i]][atom2[i]] = (float) distance[i];
-            settleConstraints[atom2[i]][atom1[i]] = (float) distance[i];
-        }
-    }
-
-    // Now remove the ones that don't actually form closed loops of three atoms.
-
-    vector<int> settleClusters;
-    for (int i = 0; i < (int)settleConstraints.size(); i++) {
-        if (settleConstraints[i].size() == 2) {
-            int partner1 = settleConstraints[i].begin()->first;
-            int partner2 = (++settleConstraints[i].begin())->first;
-            if (settleConstraints[partner1].size() != 2 || settleConstraints[partner2].size() != 2 ||
-                    settleConstraints[partner1].find(partner2) == settleConstraints[partner1].end())
-                settleConstraints[i].clear();
-            else if (i < partner1 && i < partner2)
-                settleClusters.push_back(i);
-        }
-        else
-            settleConstraints[i].clear();
-    }
-
-    // Record the SETTLE clusters.
-
-    vector<bool> isShakeAtom(numAtoms, false);
-    if (settleClusters.size() > 0) {
-        vector<int4> atoms;
-        vector<float2> params;
-        for (int i = 0; i < (int) settleClusters.size(); i++) {
-            int atom1 = settleClusters[i];
-            int atom2 = settleConstraints[atom1].begin()->first;
-            int atom3 = (++settleConstraints[atom1].begin())->first;
-            float dist12 = settleConstraints[atom1].find(atom2)->second;
-            float dist13 = settleConstraints[atom1].find(atom3)->second;
-            float dist23 = settleConstraints[atom2].find(atom3)->second;
-            if (dist12 == dist13) {
-                // atom1 is the central atom
-                atoms.push_back(make_int4(atom1, atom2, atom3, 0));
-                params.push_back(make_float2(dist12, dist23));
-            }
-            else if (dist12 == dist23) {
-                // atom2 is the central atom
-                atoms.push_back(make_int4(atom2, atom1, atom3, 0));
-                params.push_back(make_float2(dist12, dist13));
-            }
-            else if (dist13 == dist23) {
-                // atom3 is the central atom
-                atoms.push_back(make_int4(atom3, atom1, atom2, 0));
-                params.push_back(make_float2(dist13, dist12));
-            }
-            else
-                continue; // We can't handle this with SETTLE
-            isShakeAtom[atom1] = true;
-            isShakeAtom[atom2] = true;
-            isShakeAtom[atom3] = true;
-        }
-        if (atoms.size() > 0) {
-            settleAtoms.initialize<int4>(context, atoms.size(), "settleAtoms");
-            settleParams.initialize<float2>(context, params.size(), "settleParams");
-            settleAtoms.upload(atoms);
-            settleParams.upload(params);
-        }
-    }
-
-    // Find clusters consisting of a central atom with up to three peripheral atoms.
-
-    map<int, ShakeCluster> clusters;
-    vector<bool> invalidForShake(numAtoms, false);
-    for (int i = 0; i < (int) atom1.size(); i++) {
-        if (isShakeAtom[atom1[i]])
-            continue; // This is being taken care of with SETTLE.
-
-        // Determine which is the central atom.
-
-        bool firstIsCentral;
-        if (constraintCount[atom1[i]] > 1)
-            firstIsCentral = true;
-        else if (constraintCount[atom2[i]] > 1)
-            firstIsCentral = false;
-        else if (atom1[i] < atom2[i])
-            firstIsCentral = true;
-        else
-            firstIsCentral = false;
-        int centralID, peripheralID;
-        if (firstIsCentral) {
-            centralID = atom1[i];
-            peripheralID = atom2[i];
-        }
-        else {
-            centralID = atom2[i];
-            peripheralID = atom1[i];
-        }
-
-        // Add it to the cluster.
-
-        if (clusters.find(centralID) == clusters.end()) {
-            clusters[centralID] = ShakeCluster(centralID, 1.0/system.getParticleMass(centralID));
-        }
-        ShakeCluster& cluster = clusters[centralID];
-        cluster.addAtom(peripheralID, distance[i], 1.0/system.getParticleMass(peripheralID));
-        if (constraintCount[peripheralID] != 1 || invalidForShake[atom1[i]] || invalidForShake[atom2[i]]) {
-            cluster.markInvalid(clusters, invalidForShake);
-            map<int, ShakeCluster>::iterator otherCluster = clusters.find(peripheralID);
-            if (otherCluster != clusters.end() && otherCluster->second.valid)
-                otherCluster->second.markInvalid(clusters, invalidForShake);
-        }
-    }
-    int validShakeClusters = 0;
-    for (map<int, ShakeCluster>::iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
-        ShakeCluster& cluster = iter->second;
-        if (cluster.valid) {
-            cluster.valid = !invalidForShake[cluster.centralID] && cluster.size == constraintCount[cluster.centralID];
-            for (int i = 0; i < cluster.size; i++)
-                if (invalidForShake[cluster.peripheralID[i]])
-                    cluster.valid = false;
-            if (cluster.valid)
-                ++validShakeClusters;
-        }
-    }
-
-    // Record the SHAKE clusters.
-
-    if (validShakeClusters > 0) {
-        vector<int4> atoms;
-        vector<float4> params;
-        int index = 0;
-        for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
-            const ShakeCluster& cluster = iter->second;
-            if (!cluster.valid)
-                continue;
-            atoms.push_back(make_int4(cluster.centralID, cluster.peripheralID[0], (cluster.size > 1 ? cluster.peripheralID[1] : -1), (cluster.size > 2 ? cluster.peripheralID[2] : -1)));
-            params.push_back(make_float4((float) cluster.centralInvMass, (float) (0.5/(cluster.centralInvMass+cluster.peripheralInvMass)), (float) (cluster.distance*cluster.distance), (float) cluster.peripheralInvMass));
-            isShakeAtom[cluster.centralID] = true;
-            isShakeAtom[cluster.peripheralID[0]] = true;
-            if (cluster.size > 1)
-                isShakeAtom[cluster.peripheralID[1]] = true;
-            if (cluster.size > 2)
-                isShakeAtom[cluster.peripheralID[2]] = true;
-            ++index;
-        }
-        shakeAtoms.initialize<int4>(context, atoms.size(), "shakeAtoms");
-        shakeParams.initialize<float4>(context, params.size(), "shakeParams");
-        shakeAtoms.upload(atoms);
-        shakeParams.upload(params);
-    }
-
-    // Find connected constraints for CCMA.
-
-    vector<int> ccmaConstraints;
-    for (unsigned i = 0; i < atom1.size(); i++)
-        if (!isShakeAtom[atom1[i]])
-            ccmaConstraints.push_back(i);
-
-    // Record the connections between constraints.
-
-    int numCCMA = (int) ccmaConstraints.size();
-    if (numCCMA > 0) {
-        // Record information needed by ReferenceCCMAAlgorithm.
-        
-        vector<pair<int, int> > refIndices(numCCMA);
-        vector<double> refDistance(numCCMA);
-        for (int i = 0; i < numCCMA; i++) {
-            int index = ccmaConstraints[i];
-            refIndices[i] = make_pair(atom1[index], atom2[index]);
-            refDistance[i] = distance[index];
-        }
-        vector<double> refMasses(numAtoms);
-        for (int i = 0; i < numAtoms; ++i)
-            refMasses[i] = system.getParticleMass(i);
-
-        // Look up angles for CCMA.
-        
-        vector<ReferenceCCMAAlgorithm::AngleInfo> angles;
-        for (int i = 0; i < system.getNumForces(); i++) {
-            const HarmonicAngleForce* force = dynamic_cast<const HarmonicAngleForce*>(&system.getForce(i));
-            if (force != NULL) {
-                for (int j = 0; j < force->getNumAngles(); j++) {
-                    int atom1, atom2, atom3;
-                    double angle, k;
-                    force->getAngleParameters(j, atom1, atom2, atom3, angle, k);
-                    angles.push_back(ReferenceCCMAAlgorithm::AngleInfo(atom1, atom2, atom3, angle));
-                }
-            }
-        }
-        
-        // Create a ReferenceCCMAAlgorithm.  It will build and invert the constraint matrix for us.
-        
-        ReferenceCCMAAlgorithm ccma(numAtoms, numCCMA, refIndices, refDistance, refMasses, angles, 0.1);
-        vector<vector<pair<int, double> > > matrix = ccma.getMatrix();
-        int maxRowElements = 0;
-        for (unsigned i = 0; i < matrix.size(); i++)
-            maxRowElements = max(maxRowElements, (int) matrix[i].size());
-        maxRowElements++;
-
-        // Build the list of constraints for each atom.
-
-        vector<vector<int> > atomConstraints(context.getNumAtoms());
-        for (int i = 0; i < numCCMA; i++) {
-            atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
-            atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
-        }
-        int maxAtomConstraints = 0;
-        for (unsigned i = 0; i < atomConstraints.size(); i++)
-            maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
-
-        // Sort the constraints.
-
-        vector<int> constraintOrder(numCCMA);
-        for (int i = 0; i < numCCMA; ++i)
-            constraintOrder[i] = i;
-        sort(constraintOrder.begin(), constraintOrder.end(), ConstraintOrderer(atom1, atom2, ccmaConstraints));
-        vector<int> inverseOrder(numCCMA);
-        for (int i = 0; i < numCCMA; ++i)
-            inverseOrder[constraintOrder[i]] = i;
-        for (int i = 0; i < (int)matrix.size(); ++i)
-            for (int j = 0; j < (int)matrix[i].size(); ++j)
-                matrix[i][j].first = inverseOrder[matrix[i][j].first];
-
-        // Record the CCMA data structures.
-
-        ccmaAtoms.initialize<int2>(context, numCCMA, "CcmaAtoms");
-        ccmaAtomConstraints.initialize<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
-        ccmaNumAtomConstraints.initialize<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
-        ccmaConstraintMatrixColumn.initialize<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
-        ccmaConverged.initialize<int>(context, 2, "ccmaConverged");
+CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const System& system) : IntegrationUtilities(context, system),
+        ccmaConvergedMemory(NULL) {
+        CHECK_RESULT2(cuEventCreate(&ccmaEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for CCMA");
        CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
        CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
-        vector<int2> atomsVec(ccmaAtoms.getSize());
-        vector<int> atomConstraintsVec(ccmaAtomConstraints.getSize());
-        vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints.getSize());
-        vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn.getSize());
-        int elementSize = (context.getUseDoublePrecision() || context.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
-        ccmaDistance.initialize(context, numCCMA, 4*elementSize, "CcmaDistance");
-        ccmaDelta1.initialize(context, numCCMA, elementSize, "CcmaDelta1");
-        ccmaDelta2.initialize(context, numCCMA, elementSize, "CcmaDelta2");
-        ccmaReducedMass.initialize(context, numCCMA, elementSize, "CcmaReducedMass");
-        ccmaConstraintMatrixValue.initialize(context, numCCMA*maxRowElements, elementSize, "ConstraintMatrixValue");
-        vector<double4> distanceVec(ccmaDistance.getSize());
-        vector<double> reducedMassVec(ccmaReducedMass.getSize());
-        vector<double> constraintMatrixValueVec(ccmaConstraintMatrixValue.getSize());
-        for (int i = 0; i < numCCMA; i++) {
-            int index = constraintOrder[i];
-            int c = ccmaConstraints[index];
-            atomsVec[i].x = atom1[c];
-            atomsVec[i].y = atom2[c];
-            distanceVec[i].w = distance[c];
-            reducedMassVec[i] = (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
-            for (unsigned int j = 0; j < matrix[index].size(); j++) {
-                constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
-                constraintMatrixValueVec[i+j*numCCMA] = matrix[index][j].second;
-            }
-            constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
-        }
-        ccmaDistance.upload(distanceVec, true);
-        ccmaReducedMass.upload(reducedMassVec, true);
-        ccmaConstraintMatrixValue.upload(constraintMatrixValueVec, true);
-        for (unsigned int i = 0; i < atomConstraints.size(); i++) {
-            numAtomConstraintsVec[i] = atomConstraints[i].size();
-            for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
-                bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
-                atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
-            }
-        }
-        ccmaAtoms.upload(atomsVec);
-        ccmaAtomConstraints.upload(atomConstraintsVec);
-        ccmaNumAtomConstraints.upload(numAtomConstraintsVec);
-        ccmaConstraintMatrixColumn.upload(constraintMatrixColumnVec);
-    }
-    
-    // Build the list of virtual sites.
-    
-    vector<int4> vsite2AvgAtomVec;
-    vector<double2> vsite2AvgWeightVec;
-    vector<int4> vsite3AvgAtomVec;
-    vector<double4> vsite3AvgWeightVec;
-    vector<int4> vsiteOutOfPlaneAtomVec;
-    vector<double4> vsiteOutOfPlaneWeightVec;
-    vector<int> vsiteLocalCoordsIndexVec;
-    vector<int> vsiteLocalCoordsAtomVec;
-    vector<int> vsiteLocalCoordsStartVec;
-    vector<double> vsiteLocalCoordsWeightVec;
-    vector<double4> vsiteLocalCoordsPosVec;
-    for (int i = 0; i < numAtoms; i++) {
-        if (system.isVirtualSite(i)) {
-            if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
-                // A two particle average.
-                
-                const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
-                vsite2AvgAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), 0));
-                vsite2AvgWeightVec.push_back(make_double2(site.getWeight(0), site.getWeight(1)));
-            }
-            else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
-                // A three particle average.
-                
-                const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
-                vsite3AvgAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
-                vsite3AvgWeightVec.push_back(make_double4(site.getWeight(0), site.getWeight(1), site.getWeight(2), 0.0));
-            }
-            else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
-                // An out of plane site.
-                
-                const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
-                vsiteOutOfPlaneAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
-                vsiteOutOfPlaneWeightVec.push_back(make_double4(site.getWeight12(), site.getWeight13(), site.getWeightCross(), 0.0));
-            }
-            else if (dynamic_cast<const LocalCoordinatesSite*>(&system.getVirtualSite(i)) != NULL) {
-                // A local coordinates site.
-                
-                const LocalCoordinatesSite& site = dynamic_cast<const LocalCoordinatesSite&>(system.getVirtualSite(i));
-                int numParticles = site.getNumParticles();
-                vector<double> origin, x, y;
-                site.getOriginWeights(origin);
-                site.getXWeights(x);
-                site.getYWeights(y);
-                vsiteLocalCoordsIndexVec.push_back(i);
-                vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
-                for (int j = 0; j < numParticles; j++) {
-                    vsiteLocalCoordsAtomVec.push_back(site.getParticle(j));
-                    vsiteLocalCoordsWeightVec.push_back(origin[j]);
-                    vsiteLocalCoordsWeightVec.push_back(x[j]);
-                    vsiteLocalCoordsWeightVec.push_back(y[j]);
-                }
-                Vec3 pos = site.getLocalPosition();
-                vsiteLocalCoordsPosVec.push_back(make_double4(pos[0], pos[1], pos[2], 0.0));
-            }
-        }
-    }
-    vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
-    int num2Avg = vsite2AvgAtomVec.size();
-    int num3Avg = vsite3AvgAtomVec.size();
-    int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
-    int numLocalCoords = vsiteLocalCoordsPosVec.size();
-    vsite2AvgAtoms.initialize<int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
-    vsite3AvgAtoms.initialize<int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
-    vsiteOutOfPlaneAtoms.initialize<int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
-    vsiteLocalCoordsIndex.initialize<int>(context, max(1, (int) vsiteLocalCoordsIndexVec.size()), "vsiteLocalCoordsIndex");
-    vsiteLocalCoordsAtoms.initialize<int>(context, max(1, (int) vsiteLocalCoordsAtomVec.size()), "vsiteLocalCoordsAtoms");
-    vsiteLocalCoordsStartIndex.initialize<int>(context, max(1, (int) vsiteLocalCoordsStartVec.size()), "vsiteLocalCoordsStartIndex");
-    if (num2Avg > 0)
-        vsite2AvgAtoms.upload(vsite2AvgAtomVec);
-    if (num3Avg > 0)
-        vsite3AvgAtoms.upload(vsite3AvgAtomVec);
-    if (numOutOfPlane > 0)
-        vsiteOutOfPlaneAtoms.upload(vsiteOutOfPlaneAtomVec);
-    if (numLocalCoords > 0) {
-        vsiteLocalCoordsIndex.upload(vsiteLocalCoordsIndexVec);
-        vsiteLocalCoordsAtoms.upload(vsiteLocalCoordsAtomVec);
-        vsiteLocalCoordsStartIndex.upload(vsiteLocalCoordsStartVec);
-    }
-    int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-    vsite2AvgWeights.initialize(context, max(1, num2Avg), 2*elementSize, "vsite2AvgWeights");
-    vsite3AvgWeights.initialize(context, max(1, num3Avg), 4*elementSize, "vsite3AvgWeights");
-    vsiteOutOfPlaneWeights.initialize(context, max(1, numOutOfPlane), 4*elementSize, "vsiteOutOfPlaneWeights");
-    vsiteLocalCoordsWeights.initialize(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), elementSize, "vsiteLocalCoordsWeights");
-    vsiteLocalCoordsPos.initialize(context, max(1, (int) vsiteLocalCoordsPosVec.size()), 4*elementSize, "vsiteLocalCoordsPos");
-    if (num2Avg > 0)
-        vsite2AvgWeights.upload(vsite2AvgWeightVec, true);
-    if (num3Avg > 0)
-        vsite3AvgWeights.upload(vsite3AvgWeightVec, true);
-    if (numOutOfPlane > 0)
-        vsiteOutOfPlaneWeights.upload(vsiteOutOfPlaneWeightVec, true);
-    if (numLocalCoords > 0) {
-        vsiteLocalCoordsWeights.upload(vsiteLocalCoordsWeightVec, true);
-        vsiteLocalCoordsPos.upload(vsiteLocalCoordsPosVec, true);
-    }
-
-    // Create the kernels used by this class.
-
-    map<string, string> defines;
-    defines["NUM_CCMA_CONSTRAINTS"] = context.intToString(numCCMA);
-    defines["NUM_ATOMS"] = context.intToString(numAtoms);
-    defines["NUM_2_AVERAGE"] = context.intToString(num2Avg);
-    defines["NUM_3_AVERAGE"] = context.intToString(num3Avg);
-    defines["NUM_OUT_OF_PLANE"] = context.intToString(numOutOfPlane);
-    defines["NUM_LOCAL_COORDS"] = context.intToString(numLocalCoords);
-    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
-    CUmodule module = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::integrationUtilities, defines);
-    settlePosKernel = context.getKernel(module, "applySettleToPositions");
-    settleVelKernel = context.getKernel(module, "applySettleToVelocities");
-    shakePosKernel = context.getKernel(module, "applyShakeToPositions");
-    shakeVelKernel = context.getKernel(module, "applyShakeToVelocities");
-    ccmaDirectionsKernel = context.getKernel(module, "computeCCMAConstraintDirections");
-    ccmaPosForceKernel = context.getKernel(module, "computeCCMAPositionConstraintForce");
-    ccmaVelForceKernel = context.getKernel(module, "computeCCMAVelocityConstraintForce");
-    ccmaMultiplyKernel = context.getKernel(module, "multiplyByCCMAConstraintMatrix");
-    ccmaUpdateKernel = context.getKernel(module, "updateCCMAAtomPositions");
-    CHECK_RESULT2(cuEventCreate(&ccmaEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for CCMA");
-    vsitePositionKernel = context.getKernel(module, "computeVirtualSites");
-    vsiteForceKernel = context.getKernel(module, "distributeVirtualSiteForces");
-    numVsites = num2Avg+num3Avg+numOutOfPlane+numLocalCoords;
-    randomKernel = context.getKernel(module, "generateRandomNumbers");
-    timeShiftKernel = context.getKernel(module, "timeShiftVelocities");
 }

 CudaIntegrationUtilities::~CudaIntegrationUtilities() {
    context.setAsCurrent();
-    if (ccmaConvergedMemory != NULL)
+    if (ccmaConvergedMemory != NULL) {
        cuMemFreeHost(ccmaConvergedMemory);
-}
-
-void CudaIntegrationUtilities::setNextStepSize(double size) {
-    if (size != lastStepSize.x || size != lastStepSize.y) {
-        lastStepSize = make_double2(size, size);
-        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
-            stepSize.upload(&lastStepSize);
-        else {
-            float2 lastStepSizeFloat = make_float2((float) size, (float) size);
-            stepSize.upload(&lastStepSizeFloat);
-        }
+        cuEventDestroy(ccmaEvent);
    }
 }

-double CudaIntegrationUtilities::getLastStepSize() {
-    if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
-        stepSize.download(&lastStepSize);
-    else {
-        float2 lastStepSizeFloat;
-        stepSize.download(&lastStepSizeFloat);
-        lastStepSize = make_double2(lastStepSizeFloat.x, lastStepSizeFloat.y);
-    }
-    return lastStepSize.y;
+CudaArray& CudaIntegrationUtilities::getPosDelta() {
+    return dynamic_cast<CudaContext&>(context).unwrap(posDelta);
 }

-void CudaIntegrationUtilities::applyConstraints(double tol) {
-    applyConstraints(false, tol);
+CudaArray& CudaIntegrationUtilities::getRandom() {
+    return dynamic_cast<CudaContext&>(context).unwrap(random);
 }

-void CudaIntegrationUtilities::applyVelocityConstraints(double tol) {
-    applyConstraints(true, tol);
+CudaArray& CudaIntegrationUtilities::getStepSize() {
+    return dynamic_cast<CudaContext&>(context).unwrap(stepSize);
 }

-void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double tol) {
-    CUfunction settleKernel, shakeKernel, ccmaForceKernel;
+void CudaIntegrationUtilities::applyConstraintsImpl(bool constrainVelocities, double tol) {
+    ComputeKernel settleKernel, shakeKernel, ccmaForceKernel;
    if (constrainVelocities) {
        settleKernel = settleVelKernel;
        shakeKernel = shakeVelKernel;
@@ -588,45 +77,39 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
        shakeKernel = shakePosKernel;
        ccmaForceKernel = ccmaPosForceKernel;
    }
-    float floatTol = (float) tol;
-    void* tolPointer = (context.getUseDoublePrecision() || context.getUseMixedPrecision() ? (void*) &tol : (void*) &floatTol);
-    CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
    if (settleAtoms.isInitialized()) {
-        int numClusters = settleAtoms.getSize();
-        void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), &posCorrection,
-                &posDelta.getDevicePointer(), &context.getVelm().getDevicePointer(),
-                &settleAtoms.getDevicePointer(), &settleParams.getDevicePointer()};
-        context.executeKernel(settleKernel, args, settleAtoms.getSize());
+        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+            settleKernel->setArg(1, tol);
+        else
+            settleKernel->setArg(1, (float) tol);
+        settleKernel->execute(settleAtoms.getSize());
    }
    if (shakeAtoms.isInitialized()) {
-        int numClusters = shakeAtoms.getSize();
-        void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), &posCorrection,
-                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta.getDevicePointer(),
-                &shakeAtoms.getDevicePointer(), &shakeParams.getDevicePointer()};
-        context.executeKernel(shakeKernel, args, shakeAtoms.getSize());
+        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+            shakeKernel->setArg(1, tol);
+        else
+            shakeKernel->setArg(1, (float) tol);
+        shakeKernel->execute(shakeAtoms.getSize());
    }
    if (ccmaAtoms.isInitialized()) {
-        void* directionsArgs[] = {&ccmaAtoms.getDevicePointer(), &ccmaDistance.getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection, &ccmaConverged.getDevicePointer()};
-        context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms.getSize());
-        int i;
-        void* forceArgs[] = {&ccmaAtoms.getDevicePointer(), &ccmaDistance.getDevicePointer(),
-                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta.getDevicePointer(),
-                &ccmaReducedMass.getDevicePointer(), &ccmaDelta1.getDevicePointer(), &ccmaConverged.getDevicePointer(),
-                &ccmaConvergedDeviceMemory, tolPointer, &i};
-        void* multiplyArgs[] = {&ccmaDelta1.getDevicePointer(), &ccmaDelta2.getDevicePointer(),
-                &ccmaConstraintMatrixColumn.getDevicePointer(), &ccmaConstraintMatrixValue.getDevicePointer(), &ccmaConverged.getDevicePointer(), &i};
-        void* updateArgs[] = {&ccmaNumAtomConstraints.getDevicePointer(), &ccmaAtomConstraints.getDevicePointer(), &ccmaDistance.getDevicePointer(),
-                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta.getDevicePointer(),
-                &context.getVelm().getDevicePointer(), &ccmaDelta1.getDevicePointer(), &ccmaDelta2.getDevicePointer(),
-                &ccmaConverged.getDevicePointer(), &i};
+        ccmaForceKernel->setArg(6, ccmaConvergedDeviceMemory);
+        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+            ccmaForceKernel->setArg(7, tol);
+        else
+            ccmaForceKernel->setArg(7, (float) tol);
+        ccmaDirectionsKernel->execute(ccmaAtoms.getSize());
        const int checkInterval = 4;
        ccmaConvergedMemory[0] = 0;
-        for (i = 0; i < 150; i++) {
-            context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms.getSize());
+        ccmaUpdateKernel->setArg(3, constrainVelocities ? context.getVelm() : posDelta);
+        for (int i = 0; i < 150; i++) {
+            ccmaForceKernel->setArg(8, i);
+            ccmaForceKernel->execute(ccmaAtoms.getSize());
            if ((i+1)%checkInterval == 0)
                CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
-            context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms.getSize());
-            context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms());
+            ccmaMultiplyKernel->setArg(5, i);
+            ccmaMultiplyKernel->execute(ccmaAtoms.getSize());
+            ccmaUpdateKernel->setArg(8, i);
+            ccmaUpdateKernel->execute(context.getNumAtoms());
            if ((i+1)%checkInterval == 0) {
                CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
                if (ccmaConvergedMemory[0])
@@ -636,142 +119,9 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
    }
 }

-void CudaIntegrationUtilities::computeVirtualSites() {
-    if (numVsites > 0) {
-        CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
-        void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &vsite2AvgAtoms.getDevicePointer(), &vsite2AvgWeights.getDevicePointer(),
-                &vsite3AvgAtoms.getDevicePointer(), &vsite3AvgWeights.getDevicePointer(),
-                &vsiteOutOfPlaneAtoms.getDevicePointer(), &vsiteOutOfPlaneWeights.getDevicePointer(),
-                &vsiteLocalCoordsIndex.getDevicePointer(), &vsiteLocalCoordsAtoms.getDevicePointer(),
-                &vsiteLocalCoordsWeights.getDevicePointer(), &vsiteLocalCoordsPos.getDevicePointer(),
-                &vsiteLocalCoordsStartIndex.getDevicePointer()};
-        context.executeKernel(vsitePositionKernel, args, numVsites);
-    }
-}
-
 void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
    if (numVsites > 0) {
-        CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
-        void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &context.getForce().getDevicePointer(),
-                &vsite2AvgAtoms.getDevicePointer(), &vsite2AvgWeights.getDevicePointer(),
-                &vsite3AvgAtoms.getDevicePointer(), &vsite3AvgWeights.getDevicePointer(),
-                &vsiteOutOfPlaneAtoms.getDevicePointer(), &vsiteOutOfPlaneWeights.getDevicePointer(),
-                &vsiteLocalCoordsIndex.getDevicePointer(), &vsiteLocalCoordsAtoms.getDevicePointer(),
-                &vsiteLocalCoordsWeights.getDevicePointer(), &vsiteLocalCoordsPos.getDevicePointer(),
-                &vsiteLocalCoordsStartIndex.getDevicePointer()};
-        context.executeKernel(vsiteForceKernel, args, numVsites);
-    }
-}
-
-void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumberSeed) {
-    if (random.isInitialized()) {
-        if (randomNumberSeed != lastSeed)
-           throw OpenMMException("CudaIntegrationUtilities::initRandomNumberGenerator(): Requested two different values for the random number seed");
-        return;
-    }
-
-    // Create the random number arrays.
-
-    lastSeed = randomNumberSeed;
-    random.initialize<float4>(context, 4*context.getPaddedNumAtoms(), "random");
-    randomSeed.initialize<int4>(context, context.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed");
-    randomPos = random.getSize();
-
-    // Use a quick and dirty RNG to pick seeds for the real random number generator.
-
-    vector<int4> seed(randomSeed.getSize());
-    unsigned int r = randomNumberSeed;
-    if (r == 0) r = (unsigned int) osrngseed();
-    for (int i = 0; i < randomSeed.getSize(); i++) {
-        seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
-        seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
-        seed[i].z = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
-        seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
-    }
-    randomSeed.upload(seed);
-}
-
-int CudaIntegrationUtilities::prepareRandomNumbers(int numValues) {
-    if (randomPos+numValues <= random.getSize()) {
-        int oldPos = randomPos;
-        randomPos += numValues;
-        return oldPos;
-    }
-    if (numValues > random.getSize())
-        random.resize(numValues);
-    int size = random.getSize();
-    void* args[] = {&size, &random.getDevicePointer(), &randomSeed.getDevicePointer()};
-    context.executeKernel(randomKernel, args, random.getSize());
-    randomPos = numValues;
-    return 0;
-}
-
-void CudaIntegrationUtilities::createCheckpoint(ostream& stream) {
-    if (!random.isInitialized()) 
-        return;
-    stream.write((char*) &randomPos, sizeof(int));
-    vector<float4> randomVec;
-    random.download(randomVec);
-    stream.write((char*) &randomVec[0], sizeof(float4)*random.getSize());
-    vector<int4> randomSeedVec;
-    randomSeed.download(randomSeedVec);
-    stream.write((char*) &randomSeedVec[0], sizeof(int4)*randomSeed.getSize());
-}
-
-void CudaIntegrationUtilities::loadCheckpoint(istream& stream) {
-    if (!random.isInitialized()) 
-        return;
-    stream.read((char*) &randomPos, sizeof(int));
-    vector<float4> randomVec(random.getSize());
-    stream.read((char*) &randomVec[0], sizeof(float4)*random.getSize());
-    random.upload(randomVec);
-    vector<int4> randomSeedVec(randomSeed.getSize());
-    stream.read((char*) &randomSeedVec[0], sizeof(int4)*randomSeed.getSize());
-    randomSeed.upload(randomSeedVec);
-}
-
-double CudaIntegrationUtilities::computeKineticEnergy(double timeShift) {
-    int numParticles = context.getNumAtoms();
-    if (timeShift != 0) {
-        float timeShiftFloat = (float) timeShift;
-        void* timeShiftPtr = (context.getUseDoublePrecision() ? (void*) &timeShift : (void*) &timeShiftFloat);
-
-        // Copy the velocities into the posDelta array while we temporarily modify them.
-
-        context.getVelm().copyTo(posDelta);
-
-        // Apply the time shift.
-
-        void* args[] = {&context.getVelm().getDevicePointer(), &context.getForce().getDevicePointer(), timeShiftPtr};
-        context.executeKernel(timeShiftKernel, args, numParticles);
-        applyConstraints(true, 1e-4);
-    }
-    
-    // Compute the kinetic energy.
-    
-    double energy = 0.0;
-    if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
-        vector<double4> velm;
-        context.getVelm().download(velm);
-        for (int i = 0; i < numParticles; i++) {
-            double4 v = velm[i];
-            if (v.w != 0)
-                energy += (v.x*v.x+v.y*v.y+v.z*v.z)/v.w;
-        }
-    }
-    else {
-        vector<float4> velm;
-        context.getVelm().download(velm);
-        for (int i = 0; i < numParticles; i++) {
-            float4 v = velm[i];
-            if (v.w != 0)
-                energy += (v.x*v.x+v.y*v.y+v.z*v.z)/v.w;
-        }
+        vsiteForceKernel->setArg(2, context.getLongForceBuffer());
+        vsiteForceKernel->execute(numVsites);
    }
-    
-    // Restore the velocities.
-    
-    if (timeShift != 0)
-        posDelta.copyTo(context.getVelm());
-    return 0.5*energy;
 }
--- a/platforms/cuda/src/CudaKernel.cpp
+++ b/platforms/cuda/src/CudaKernel.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaKernel.h"
+#include "openmm/common/ComputeArray.h"
+#include <cstring>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+/**
+ * Create a kernel wrapper that records the owning context, the CUDA function
+ * handle and the kernel's name.  No other work is done here.
+ */
+CudaKernel::CudaKernel(CudaContext& context, CUfunction kernel, const string& name) :
+        context(context), kernel(kernel), name(name) {
+}
+
+/**
+ * Get the name that was supplied when this kernel was constructed.
+ */
+string CudaKernel::getName() const {
+    return name;
+}
+
+/**
+ * Launch the kernel with the arguments currently stored in this object.
+ *
+ * @param threads    forwarded unchanged to CudaContext::executeKernel()
+ * @param blockSize  forwarded unchanged to CudaContext::executeKernel()
+ */
+void CudaKernel::execute(int threads, int blockSize) {
+    // Rebuild the pointer table on every launch, since any slot may have been
+    // rebound since the previous one.
+
+    const int numSlots = arrayArgs.size();
+    argPointers.resize(numSlots);
+    for (int slot = 0; slot < numSlots; slot++) {
+        if (arrayArgs[slot] == NULL) {
+            // No array is bound to this slot, so pass the stored primitive value.
+            argPointers[slot] = &primitiveArgs[slot];
+        }
+        else {
+            // An array is bound: pass the address of its device pointer.
+            argPointers[slot] = &arrayArgs[slot]->getDevicePointer();
+        }
+    }
+    context.executeKernel(kernel, argPointers.data(), threads, blockSize);
+}
+
+/**
+ * Append a new argument slot and bind an array to it.
+ *
+ * @param value  the array to bind to the newly created slot
+ */
+void CudaKernel::addArrayArg(ArrayInterface& value) {
+    // The new slot's index is the number of slots that existed before appending.
+    const int newSlot = arrayArgs.size();
+    addEmptyArg();
+    setArrayArg(newSlot, value);
+}
+
+/**
+ * Append a new argument slot and store a primitive value in it.
+ *
+ * @param value  pointer to the bytes of the value to store
+ * @param size   number of bytes to copy from value
+ */
+void CudaKernel::addPrimitiveArg(const void* value, int size) {
+    // The new slot's index is the number of slots that existed before appending.
+    const int newSlot = arrayArgs.size();
+    addEmptyArg();
+    setPrimitiveArg(newSlot, value, size);
+}
+
+/**
+ * Append an argument slot with no array bound and a zeroed primitive value.
+ * Both vectors grow together so that a slot index is valid in each of them.
+ */
+void CudaKernel::addEmptyArg() {
+    const double4 zeroValue = make_double4(0, 0, 0, 0);
+    primitiveArgs.push_back(zeroValue);
+    arrayArgs.push_back(NULL);
+}
+
+/**
+ * Bind an array to an existing argument slot.
+ *
+ * @param index  the slot to modify; must refer to a slot that already exists
+ * @param value  the array to bind; it is unwrapped through the owning context
+ * @throws OpenMMException if index does not refer to an existing slot
+ */
+void CudaKernel::setArrayArg(int index, ArrayInterface& value) {
+    // Validate before indexing: operator[] with a bad index is undefined behavior.
+    if (index < 0 || index >= static_cast<int>(arrayArgs.size()))
+        throw OpenMMException("Invalid index for kernel argument");
+    arrayArgs[index] = &context.unwrap(value);
+}
+
+/**
+ * Store a primitive value in an existing argument slot, replacing any array
+ * that was previously bound to it.
+ *
+ * @param index  the slot to modify; must refer to a slot that already exists
+ * @param value  pointer to the bytes of the value to store
+ * @param size   number of bytes to copy; must be between 0 and sizeof(double4)
+ * @throws OpenMMException if index does not refer to an existing slot, or if
+ *         size is negative or larger than a slot can hold
+ */
+void CudaKernel::setPrimitiveArg(int index, const void* value, int size) {
+    // Validate before indexing: operator[] with a bad index is undefined behavior.
+    if (index < 0 || index >= static_cast<int>(primitiveArgs.size()))
+        throw OpenMMException("Invalid index for kernel argument");
+
+    // Check the sign explicitly rather than relying on the signed-to-unsigned
+    // conversion in the comparison against sizeof().
+    if (size < 0 || static_cast<size_t>(size) > sizeof(double4))
+        throw OpenMMException("Unsupported value type for kernel argument");
+    memcpy(&primitiveArgs[index], value, size);
+
+    // Clearing the array pointer marks this slot as holding a primitive value.
+    arrayArgs[index] = NULL;
+}
--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -28,6 +28,7 @@
 #include "CudaKernels.h"
 #include "CudaParallelKernels.h"
 #include "CudaPlatform.h"
+#include "openmm/common/CommonKernels.h"
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/OpenMMException.h"

@@ -77,64 +78,68 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
    if (name == VirtualSitesKernel::Name())
        return new CudaVirtualSitesKernel(name, platform, cu);
    if (name == CalcHarmonicBondForceKernel::Name())
-        return new CudaCalcHarmonicBondForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcHarmonicBondForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomBondForceKernel::Name())
-        return new CudaCalcCustomBondForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcCustomBondForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcHarmonicAngleForceKernel::Name())
-        return new CudaCalcHarmonicAngleForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcHarmonicAngleForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomAngleForceKernel::Name())
-        return new CudaCalcCustomAngleForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcCustomAngleForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcPeriodicTorsionForceKernel::Name())
-        return new CudaCalcPeriodicTorsionForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcPeriodicTorsionForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcRBTorsionForceKernel::Name())
-        return new CudaCalcRBTorsionForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcRBTorsionForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCMAPTorsionForceKernel::Name())
-        return new CudaCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomTorsionForceKernel::Name())
-        return new CudaCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcNonbondedForceKernel::Name())
        return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomNonbondedForceKernel::Name())
-        return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcGBSAOBCForceKernel::Name())
-        return new CudaCalcGBSAOBCForceKernel(name, platform, cu);
+        return new CommonCalcGBSAOBCForceKernel(name, platform, cu);
    if (name == CalcCustomGBForceKernel::Name())
-        return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomExternalForceKernel::Name())
-        return new CudaCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomHbondForceKernel::Name())
-        return new CudaCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomCentroidBondForceKernel::Name())
-        return new CudaCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomCompoundBondForceKernel::Name())
-        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomCVForceKernel::Name())
        return new CudaCalcCustomCVForceKernel(name, platform, cu);
    if (name == CalcRMSDForceKernel::Name())
-        return new CudaCalcRMSDForceKernel(name, platform, cu);
+        return new CommonCalcRMSDForceKernel(name, platform, cu);
    if (name == CalcCustomManyParticleForceKernel::Name())
-        return new CudaCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
+        return new CommonCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcGayBerneForceKernel::Name())
-        return new CudaCalcGayBerneForceKernel(name, platform, cu);
+        return new CommonCalcGayBerneForceKernel(name, platform, cu);
    if (name == IntegrateVerletStepKernel::Name())
-        return new CudaIntegrateVerletStepKernel(name, platform, cu);
+        return new CommonIntegrateVerletStepKernel(name, platform, cu);
    if (name == IntegrateLangevinStepKernel::Name())
-        return new CudaIntegrateLangevinStepKernel(name, platform, cu);
-    if (name == IntegrateBAOABStepKernel::Name())
-        return new CudaIntegrateBAOABStepKernel(name, platform, cu);
+        return new CommonIntegrateLangevinStepKernel(name, platform, cu);
+    if (name == IntegrateLangevinMiddleStepKernel::Name())
+        return new CommonIntegrateLangevinMiddleStepKernel(name, platform, cu);
    if (name == IntegrateBrownianStepKernel::Name())
-        return new CudaIntegrateBrownianStepKernel(name, platform, cu);
+        return new CommonIntegrateBrownianStepKernel(name, platform, cu);
    if (name == IntegrateVariableVerletStepKernel::Name())
-        return new CudaIntegrateVariableVerletStepKernel(name, platform, cu);
+        return new CommonIntegrateVariableVerletStepKernel(name, platform, cu);
    if (name == IntegrateVariableLangevinStepKernel::Name())
-        return new CudaIntegrateVariableLangevinStepKernel(name, platform, cu);
+        return new CommonIntegrateVariableLangevinStepKernel(name, platform, cu);
    if (name == IntegrateCustomStepKernel::Name())
-        return new CudaIntegrateCustomStepKernel(name, platform, cu);
+        return new CommonIntegrateCustomStepKernel(name, platform, cu);
    if (name == ApplyAndersenThermostatKernel::Name())
-        return new CudaApplyAndersenThermostatKernel(name, platform, cu);
+        return new CommonApplyAndersenThermostatKernel(name, platform, cu);
+    if (name == NoseHooverChainKernel::Name())
+        return new CudaNoseHooverChainKernel(name, platform, cu);
+    if (name == IntegrateVelocityVerletStepKernel::Name())
+        return new CudaIntegrateVelocityVerletStepKernel(name, platform, cu);
    if (name == ApplyMonteCarloBarostatKernel::Name())
        return new CudaApplyMonteCarloBarostatKernel(name, platform, cu);
    if (name == RemoveCMMotionKernel::Name())
-        return new CudaRemoveCMMotionKernel(name, platform, cu);
+        return new CommonRemoveCMMotionKernel(name, platform, cu);
    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
 }
--- a/platforms/cuda/src/CudaKernelSources.h.in
+++ b/platforms/cuda/src/CudaKernelSources.h.in
@@ -27,7 +27,7 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-#include "windowsExportCuda.h"
+#include "openmm/common/windowsExportCommon.h"
 #include <string>

 namespace OpenMM {
@@ -38,9 +38,9 @@ namespace OpenMM {
 * kernels subfolder.
 */

-class OPENMM_EXPORT_CUDA CudaKernelSources {
+class OPENMM_EXPORT_COMMON CudaKernelSources {
 public:
-@CUDA_FILE_DECLARATIONS@
+@KERNEL_FILE_DECLARATIONS@
 };

 } // namespace OpenMM

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -27,6 +27,7 @@
 #include "openmm/OpenMMException.h"
 #include "CudaNonbondedUtilities.h"
 #include "CudaArray.h"
+#include "CudaContext.h"
 #include "CudaKernelSources.h"
 #include "CudaExpressionUtilities.h"
 #include "CudaSort.h"
@@ -84,6 +85,10 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
    cuEventDestroy(downloadCountEvent);
 }

+void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
+    addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, false);
+}
+
 void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool supportsPairList) {
    if (groupCutoff.size() > 0) {
        if (usesCutoff != useCutoff)
@@ -110,10 +115,20 @@ void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic,
    }
 }

+void CudaNonbondedUtilities::addParameter(ComputeParameterInfo parameter) {
+    parameters.push_back(ParameterInfo(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
+            parameter.getSize(), context.unwrap(parameter.getArray()).getDevicePointer()));
+}
+
 void CudaNonbondedUtilities::addParameter(const ParameterInfo& parameter) {
    parameters.push_back(parameter);
 }

+void CudaNonbondedUtilities::addArgument(ComputeParameterInfo parameter) {
+    arguments.push_back(ParameterInfo(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
+            parameter.getSize(), context.unwrap(parameter.getArray()).getDevicePointer()));
+}
+
 void CudaNonbondedUtilities::addArgument(const ParameterInfo& parameter) {
    arguments.push_back(parameter);
 }

--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -200,7 +200,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        data.contextEnergy[i] = 0.0;
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, interactionCounts[i]));
    }
 }
@@ -208,7 +208,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
 double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, contextForces, valid, interactionCounts[i]));
    }
    data.syncContexts();
@@ -255,7 +255,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con

 class CudaParallelCalcHarmonicBondForceKernel::Task : public CudaContext::WorkTask {
 public:
-    Task(ContextImpl& context, CudaCalcHarmonicBondForceKernel& kernel, bool includeForce,
+    Task(ContextImpl& context, CommonCalcHarmonicBondForceKernel& kernel, bool includeForce,
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
@@ -264,7 +264,7 @@ public:
    }
 private:
    ContextImpl& context;
-    CudaCalcHarmonicBondForceKernel& kernel;
+    CommonCalcHarmonicBondForceKernel& kernel;
    bool includeForce, includeEnergy;
    double& energy;
 };
@@ -272,7 +272,7 @@ private:
 CudaParallelCalcHarmonicBondForceKernel::CudaParallelCalcHarmonicBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
        CalcHarmonicBondForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
-        kernels.push_back(Kernel(new CudaCalcHarmonicBondForceKernel(name, platform, *data.contexts[i], system)));
+        kernels.push_back(Kernel(new CommonCalcHarmonicBondForceKernel(name, platform, *data.contexts[i], system)));
 }

 void CudaParallelCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) {
@@ -283,7 +283,7 @@ void CudaParallelCalcHarmonicBondForceKernel::initialize(const System& system, c
 double CudaParallelCalcHarmonicBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
@@ -296,7 +296,7 @@ void CudaParallelCalcHarmonicBondForceKernel::copyParametersToContext(ContextImp

 class CudaParallelCalcCustomBondForceKernel::Task : public CudaContext::WorkTask {
 public:
-    Task(ContextImpl& context, CudaCalcCustomBondForceKernel& kernel, bool includeForce,
+    Task(ContextImpl& context, CommonCalcCustomBondForceKernel& kernel, bool includeForce,
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
@@ -305,7 +305,7 @@ public:
    }
 private:
    ContextImpl& context;
-    CudaCalcCustomBondForceKernel& kernel;
+    CommonCalcCustomBondForceKernel& kernel;
    bool includeForce, includeEnergy;
    double& energy;
 };
@@ -313,7 +313,7 @@ private:
 CudaParallelCalcCustomBondForceKernel::CudaParallelCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
        CalcCustomBondForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
-        kernels.push_back(Kernel(new CudaCalcCustomBondForceKernel(name, platform, *data.contexts[i], system)));
+        kernels.push_back(Kernel(new CommonCalcCustomBondForceKernel(name, platform, *data.contexts[i], system)));
 }

 void CudaParallelCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) {
@@ -324,7 +324,7 @@ void CudaParallelCalcCustomBondForceKernel::initialize(const System& system, con
 double CudaParallelCalcCustomBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
@@ -337,7 +337,7 @@ void CudaParallelCalcCustomBondForceKernel::copyParametersToContext(ContextImpl&

 class CudaParallelCalcHarmonicAngleForceKernel::Task : public CudaContext::WorkTask {
 public:
-    Task(ContextImpl& context, CudaCalcHarmonicAngleForceKernel& kernel, bool includeForce,
+    Task(ContextImpl& context, CommonCalcHarmonicAngleForceKernel& kernel, bool includeForce,
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
@@ -346,7 +346,7 @@ public:
    }
 private:
    ContextImpl& context;
-    CudaCalcHarmonicAngleForceKernel& kernel;
+    CommonCalcHarmonicAngleForceKernel& kernel;
    bool includeForce, includeEnergy;
    double& energy;
 };
@@ -354,7 +354,7 @@ private:
 CudaParallelCalcHarmonicAngleForceKernel::CudaParallelCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
        CalcHarmonicAngleForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
-        kernels.push_back(Kernel(new CudaCalcHarmonicAngleForceKernel(name, platform, *data.contexts[i], system)));
+        kernels.push_back(Kernel(new CommonCalcHarmonicAngleForceKernel(name, platform, *data.contexts[i], system)));
 }

 void CudaParallelCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) {
@@ -365,7 +365,7 @@ void CudaParallelCalcHarmonicAngleForceKernel::initialize(const System& system,
 double CudaParallelCalcHarmonicAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
@@ -378,7 +378,7 @@ void CudaParallelCalcHarmonicAngleForceKernel::copyParametersToContext(ContextIm

 class CudaParallelCalcCustomAngleForceKernel::Task : public CudaContext::WorkTask {
 public:
-    Task(ContextImpl& context, CudaCalcCustomAngleForceKernel& kernel, bool includeForce,
+    Task(ContextImpl& context, CommonCalcCustomAngleForceKernel& kernel, bool includeForce,
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
@@ -387,7 +387,7 @@ public:
    }
 private:
    ContextImpl& context;
-    CudaCalcCustomAngleForceKernel& kernel;
+    CommonCalcCustomAngleForceKernel& kernel;
    bool includeForce, includeEnergy;
    double& energy;
 };
@@ -395,7 +395,7 @@ private:
 CudaParallelCalcCustomAngleForceKernel::CudaParallelCalcCustomAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
        CalcCustomAngleForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
-        kernels.push_back(Kernel(new CudaCalcCustomAngleForceKernel(name, platform, *data.contexts[i], system)));
+        kernels.push_back(Kernel(new CommonCalcCustomAngleForceKernel(name, platform, *data.contexts[i], system)));
 }

 void CudaParallelCalcCustomAngleForceKernel::initialize(const System& system, const CustomAngleForce& force) {
@@ -406,7 +406,7 @@ void CudaParallelCalcCustomAngleForceKernel::initialize(const System& system, co
 double CudaParallelCalcCustomAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
@@ -419,7 +419,7 @@ void CudaParallelCalcCustomAngleForceKernel::copyParametersToContext(ContextImpl

 class CudaParallelCalcPeriodicTorsionForceKernel::Task : public CudaContext::WorkTask {
 public:
-    Task(ContextImpl& context, CudaCalcPeriodicTorsionForceKernel& kernel, bool includeForce,
+    Task(ContextImpl& context, CommonCalcPeriodicTorsionForceKernel& kernel, bool includeForce,
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
@@ -428,7 +428,7 @@ public:
    }
 private:
    ContextImpl& context;
-    CudaCalcPeriodicTorsionForceKernel& kernel;
+    CommonCalcPeriodicTorsionForceKernel& kernel;
    bool includeForce, includeEnergy;
    double& energy;
 };
@@ -436,7 +436,7 @@ private:
 CudaParallelCalcPeriodicTorsionForceKernel::CudaParallelCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
        CalcPeriodicTorsionForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
-        kernels.push_back(Kernel(new CudaCalcPeriodicTorsionForceKernel(name, platform, *data.contexts[i], system)));
+        kernels.push_back(Kernel(new CommonCalcPeriodicTorsionForceKernel(name, platform, *data.contexts[i], system)));
 }

 void CudaParallelCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
@@ -447,7 +447,7 @@ void CudaParallelCalcPeriodicTorsionForceKernel::initialize(const System& system
 double CudaParallelCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
@@ -460,7 +460,7 @@ void CudaParallelCalcPeriodicTorsionForceKernel::copyParametersToContext(Context

 class CudaParallelCalcRBTorsionForceKernel::Task : public CudaContext::WorkTask {
 public:
-    Task(ContextImpl& context, CudaCalcRBTorsionForceKernel& kernel, bool includeForce,
+    Task(ContextImpl& context, CommonCalcRBTorsionForceKernel& kernel, bool includeForce,
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
@@ -469,7 +469,7 @@ public:
    }
 private:
    ContextImpl& context;
-    CudaCalcRBTorsionForceKernel& kernel;
+    CommonCalcRBTorsionForceKernel& kernel;
    bool includeForce, includeEnergy;
    double& energy;
 };
@@ -477,7 +477,7 @@ private:
 CudaParallelCalcRBTorsionForceKernel::CudaParallelCalcRBTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
        CalcRBTorsionForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
-        kernels.push_back(Kernel(new CudaCalcRBTorsionForceKernel(name, platform, *data.contexts[i], system)));
+        kernels.push_back(Kernel(new CommonCalcRBTorsionForceKernel(name, platform, *data.contexts[i], system)));
 }

 void CudaParallelCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
@@ -488,7 +488,7 @@ void CudaParallelCalcRBTorsionForceKernel::initialize(const System& system, cons
 double CudaParallelCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
@@ -501,7 +501,7 @@ void CudaParallelCalcRBTorsionForceKernel::copyParametersToContext(ContextImpl&

 class CudaParallelCalcCMAPTorsionForceKernel::Task : public CudaContext::WorkTask {
 public:
-    Task(ContextImpl& context, CudaCalcCMAPTorsionForceKernel& kernel, bool includeForce,
+    Task(ContextImpl& context, CommonCalcCMAPTorsionForceKernel& kernel, bool includeForce,
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
@@ -510,7 +510,7 @@ public:
    }
 private:
    ContextImpl& context;
-    CudaCalcCMAPTorsionForceKernel& kernel;
+    CommonCalcCMAPTorsionForceKernel& kernel;
    bool includeForce, includeEnergy;
    double& energy;
 };
@@ -518,7 +518,7 @@ private:
 CudaParallelCalcCMAPTorsionForceKernel::CudaParallelCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
        CalcCMAPTorsionForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
-        kernels.push_back(Kernel(new CudaCalcCMAPTorsionForceKernel(name, platform, *data.contexts[i], system)));
+        kernels.push_back(Kernel(new CommonCalcCMAPTorsionForceKernel(name, platform, *data.contexts[i], system)));
 }

 void CudaParallelCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAPTorsionForce& force) {
@@ -529,7 +529,7 @@ void CudaParallelCalcCMAPTorsionForceKernel::initialize(const System& system, co
 double CudaParallelCalcCMAPTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
@@ -542,7 +542,7 @@ void CudaParallelCalcCMAPTorsionForceKernel::copyParametersToContext(ContextImpl

 class CudaParallelCalcCustomTorsionForceKernel::Task : public CudaContext::WorkTask {
 public:
-    Task(ContextImpl& context, CudaCalcCustomTorsionForceKernel& kernel, bool includeForce,
+    Task(ContextImpl& context, CommonCalcCustomTorsionForceKernel& kernel, bool includeForce,
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
@@ -551,7 +551,7 @@ public:
    }
 private:
    ContextImpl& context;
-    CudaCalcCustomTorsionForceKernel& kernel;
+    CommonCalcCustomTorsionForceKernel& kernel;
    bool includeForce, includeEnergy;
    double& energy;
 };
@@ -559,7 +559,7 @@ private:
 CudaParallelCalcCustomTorsionForceKernel::CudaParallelCalcCustomTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
        CalcCustomTorsionForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
-        kernels.push_back(Kernel(new CudaCalcCustomTorsionForceKernel(name, platform, *data.contexts[i], system)));
+        kernels.push_back(Kernel(new CommonCalcCustomTorsionForceKernel(name, platform, *data.contexts[i], system)));
 }

 void CudaParallelCalcCustomTorsionForceKernel::initialize(const System& system, const CustomTorsionForce& force) {
@@ -570,7 +570,7 @@ void CudaParallelCalcCustomTorsionForceKernel::initialize(const System& system,
 double CudaParallelCalcCustomTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
@@ -611,7 +611,7 @@ void CudaParallelCalcNonbondedForceKernel::initialize(const System& system, cons
 double CudaParallelCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, includeDirect, includeReciprocal, data.contextEnergy[i]));
    }
    return 0.0;
@@ -632,7 +632,7 @@ void CudaParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int

 class CudaParallelCalcCustomNonbondedForceKernel::Task : public CudaContext::WorkTask {
 public:
-    Task(ContextImpl& context, CudaCalcCustomNonbondedForceKernel& kernel, bool includeForce,
+    Task(ContextImpl& context, CommonCalcCustomNonbondedForceKernel& kernel, bool includeForce,
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
@@ -641,7 +641,7 @@ public:
    }
 private:
    ContextImpl& context;
-    CudaCalcCustomNonbondedForceKernel& kernel;
+    CommonCalcCustomNonbondedForceKernel& kernel;
    bool includeForce, includeEnergy;
    double& energy;
 };
@@ -649,7 +649,7 @@ private:
 CudaParallelCalcCustomNonbondedForceKernel::CudaParallelCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
        CalcCustomNonbondedForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
-        kernels.push_back(Kernel(new CudaCalcCustomNonbondedForceKernel(name, platform, *data.contexts[i], system)));
+        kernels.push_back(Kernel(new CommonCalcCustomNonbondedForceKernel(name, platform, *data.contexts[i], system)));
 }

 void CudaParallelCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
@@ -660,7 +660,7 @@ void CudaParallelCalcCustomNonbondedForceKernel::initialize(const System& system
 double CudaParallelCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
@@ -673,7 +673,7 @@ void CudaParallelCalcCustomNonbondedForceKernel::copyParametersToContext(Context

 class CudaParallelCalcCustomExternalForceKernel::Task : public CudaContext::WorkTask {
 public:
-    Task(ContextImpl& context, CudaCalcCustomExternalForceKernel& kernel, bool includeForce,
+    Task(ContextImpl& context, CommonCalcCustomExternalForceKernel& kernel, bool includeForce,
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
@@ -682,7 +682,7 @@ public:
    }
 private:
    ContextImpl& context;
-    CudaCalcCustomExternalForceKernel& kernel;
+    CommonCalcCustomExternalForceKernel& kernel;
    bool includeForce, includeEnergy;
    double& energy;
 };
@@ -690,7 +690,7 @@ private:
 CudaParallelCalcCustomExternalForceKernel::CudaParallelCalcCustomExternalForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
        CalcCustomExternalForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
-        kernels.push_back(Kernel(new CudaCalcCustomExternalForceKernel(name, platform, *data.contexts[i], system)));
+        kernels.push_back(Kernel(new CommonCalcCustomExternalForceKernel(name, platform, *data.contexts[i], system))); // one Common kernel per entry in data.contexts, each constructed with that context
 }

 void CudaParallelCalcCustomExternalForceKernel::initialize(const System& system, const CustomExternalForce& force) {
@@ -701,7 +701,7 @@ void CudaParallelCalcCustomExternalForceKernel::initialize(const System& system,
 double CudaParallelCalcCustomExternalForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
@@ -714,7 +714,7 @@ void CudaParallelCalcCustomExternalForceKernel::copyParametersToContext(ContextI

 class CudaParallelCalcCustomHbondForceKernel::Task : public CudaContext::WorkTask {
 public:
-    Task(ContextImpl& context, CudaCalcCustomHbondForceKernel& kernel, bool includeForce,
+    Task(ContextImpl& context, CommonCalcCustomHbondForceKernel& kernel, bool includeForce,
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
@@ -723,7 +723,7 @@ public:
    }
 private:
    ContextImpl& context;
-    CudaCalcCustomHbondForceKernel& kernel;
+    CommonCalcCustomHbondForceKernel& kernel;
    bool includeForce, includeEnergy;
    double& energy;
 };
@@ -731,7 +731,7 @@ private:
 CudaParallelCalcCustomHbondForceKernel::CudaParallelCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
        CalcCustomHbondForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
-        kernels.push_back(Kernel(new CudaCalcCustomHbondForceKernel(name, platform, *data.contexts[i], system)));
+        kernels.push_back(Kernel(new CommonCalcCustomHbondForceKernel(name, platform, *data.contexts[i], system))); // one Common kernel per entry in data.contexts, each constructed with that context
 }

 void CudaParallelCalcCustomHbondForceKernel::initialize(const System& system, const CustomHbondForce& force) {
@@ -742,7 +742,7 @@ void CudaParallelCalcCustomHbondForceKernel::initialize(const System& system, co
 double CudaParallelCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
@@ -755,7 +755,7 @@ void CudaParallelCalcCustomHbondForceKernel::copyParametersToContext(ContextImpl

 class CudaParallelCalcCustomCompoundBondForceKernel::Task : public CudaContext::WorkTask {
 public:
-    Task(ContextImpl& context, CudaCalcCustomCompoundBondForceKernel& kernel, bool includeForce,
+    Task(ContextImpl& context, CommonCalcCustomCompoundBondForceKernel& kernel, bool includeForce,
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
@@ -764,7 +764,7 @@ public:
    }
 private:
    ContextImpl& context;
-    CudaCalcCustomCompoundBondForceKernel& kernel;
+    CommonCalcCustomCompoundBondForceKernel& kernel;
    bool includeForce, includeEnergy;
    double& energy;
 };
@@ -772,7 +772,7 @@ private:
 CudaParallelCalcCustomCompoundBondForceKernel::CudaParallelCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
        CalcCustomCompoundBondForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
-        kernels.push_back(Kernel(new CudaCalcCustomCompoundBondForceKernel(name, platform, *data.contexts[i], system)));
+        kernels.push_back(Kernel(new CommonCalcCustomCompoundBondForceKernel(name, platform, *data.contexts[i], system))); // one Common kernel per entry in data.contexts, each constructed with that context
 }

 void CudaParallelCalcCustomCompoundBondForceKernel::initialize(const System& system, const CustomCompoundBondForce& force) {
@@ -783,7 +783,7 @@ void CudaParallelCalcCustomCompoundBondForceKernel::initialize(const System& sys
 double CudaParallelCalcCustomCompoundBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
-        CudaContext::WorkThread& thread = cu.getWorkThread();
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;

--- a/platforms/cuda/src/CudaParameterSet.cpp
+++ b/platforms/cuda/src/CudaParameterSet.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -25,174 +25,12 @@
 * -------------------------------------------------------------------------- */

 #include "CudaParameterSet.h"
-#include "openmm/OpenMMException.h"
-#include <cmath>
-#include <sstream>

 using namespace OpenMM;
 using namespace std;

-#define CHECK_RESULT(result) \
-    if (result != CUDA_SUCCESS) { \
-        std::stringstream m; \
-        m<<errorMessage<<": "<<context.getErrorString(result)<<" ("<<result<<")"; \
-        throw OpenMMException(m.str());\
-    }
-
 CudaParameterSet::CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter, bool useDoublePrecision) :
-            context(context), numParameters(numParameters), numObjects(numObjects), name(name) {
-    int params = numParameters;
-    int bufferCount = 0;
-    elementSize = (useDoublePrecision ? sizeof(double) : sizeof(float));
-    string elementType = (useDoublePrecision ? "double" : "float");
-    CUdeviceptr pointer;
-    string errorMessage = "Error creating parameter set "+name;
-    if (!bufferPerParameter) {
-        while (params > 2) {
-            CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize*4));
-            std::stringstream name;
-            name << "param" << (++bufferCount);
-            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), elementType, 4, elementSize*4, pointer));
-            params -= 4;
-        }
-        if (params > 1) {
-            CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize*2));
-            std::stringstream name;
-            name << "param" << (++bufferCount);
-            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), elementType, 2, elementSize*2, pointer));
-            params -= 2;
-        }
-    }
-    while (params > 0) {
-            CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize));
-        std::stringstream name;
-        name << "param" << (++bufferCount);
-        buffers.push_back(CudaNonbondedUtilities::ParameterInfo(name.str(), elementType, 1, elementSize, pointer));
-        params--;
-    }
-}
-
-CudaParameterSet::~CudaParameterSet() {
-    if (context.getContextIsValid()) {
-        string errorMessage = "Error freeing device memory";
-        for (int i = 0; i < (int) buffers.size(); i++)
-            CHECK_RESULT(cuMemFree(buffers[i].getMemory()));
-    }
-}
-
-template <class T>
-void CudaParameterSet::getParameterValues(vector<vector<T> >& values) {
-    if (sizeof(T) != elementSize)
-        throw OpenMMException("Called getParameterValues() with vector of wrong type");
-    values.resize(numObjects);
-    for (int i = 0; i < numObjects; i++)
-        values[i].resize(numParameters);
-    int base = 0;
-    string errorMessage = "Error downloading parameter set "+name;
-    for (int i = 0; i < (int) buffers.size(); i++) {
-        if (buffers[i].getSize() == 4*elementSize) {
-            vector<T> data(4*numObjects);
-            CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
-            for (int j = 0; j < numObjects; j++) {
-                values[j][base] = data[4*j];
-                if (base+1 < numParameters)
-                    values[j][base+1] = data[4*j+1];
-                if (base+2 < numParameters)
-                    values[j][base+2] = data[4*j+2];
-                if (base+3 < numParameters)
-                    values[j][base+3] = data[4*j+3];
-            }
-            base += 4;
-        }
-        else if (buffers[i].getSize() == 2*elementSize) {
-            vector<T> data(2*numObjects);
-            CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
-            for (int j = 0; j < numObjects; j++) {
-                values[j][base] = data[2*j];
-                if (base+1 < numParameters)
-                    values[j][base+1] = data[2*j+1];
-            }
-            base += 2;
-        }
-        else if (buffers[i].getSize() == elementSize) {
-            vector<T> data(numObjects);
-            CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
-            for (int j = 0; j < numObjects; j++)
-                values[j][base] = data[j];
-            base++;
-        }
-        else
-            throw OpenMMException("Internal error: Unknown buffer type in CudaParameterSet");
-    }
+            ComputeParameterSet(context, numParameters, numObjects, name, bufferPerParameter, useDoublePrecision) { // every constructor argument is forwarded unchanged to ComputeParameterSet
+    for (auto& info : getParameterInfos()) // add one CudaNonbondedUtilities::ParameterInfo to `buffers` per entry returned by getParameterInfos()
+        buffers.push_back(CudaNonbondedUtilities::ParameterInfo(info.getName(), info.getComponentType(), info.getNumComponents(), info.getSize(), context.unwrap(info.getArray()).getDevicePointer()));
 }
-
-template <class T>
-void CudaParameterSet::setParameterValues(const vector<vector<T> >& values) {
-    if (sizeof(T) != elementSize)
-        throw OpenMMException("Called setParameterValues() with vector of wrong type");
-    int base = 0;
-    string errorMessage = "Error uploading parameter set "+name;
-    for (int i = 0; i < (int) buffers.size(); i++) {
-        if (buffers[i].getSize() == 4*elementSize) {
-            vector<T> data(4*numObjects);
-            for (int j = 0; j < numObjects; j++) {
-                data[4*j] = values[j][base];
-                if (base+1 < numParameters)
-                    data[4*j+1] = values[j][base+1];
-                if (base+2 < numParameters)
-                    data[4*j+2] = values[j][base+2];
-                if (base+3 < numParameters)
-                    data[4*j+3] = values[j][base+3];
-            }
-            CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
-            base += 4;
-        }
-        else if (buffers[i].getSize() == 2*elementSize) {
-            vector<T> data(2*numObjects);
-            for (int j = 0; j < numObjects; j++) {
-                data[2*j] = values[j][base];
-                if (base+1 < numParameters)
-                    data[2*j+1] = values[j][base+1];
-            }
-            CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
-            base += 2;
-        }
-        else if (buffers[i].getSize() == elementSize) {
-            vector<T> data(numObjects);
-            for (int j = 0; j < numObjects; j++)
-                data[j] = values[j][base];
-            CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
-            base++;
-        }
-        else
-            throw OpenMMException("Internal error: Unknown buffer type in CudaParameterSet");
-    }
-}
-
-string CudaParameterSet::getParameterSuffix(int index, const std::string& extraSuffix) const {
-    const string suffixes[] = {".x", ".y", ".z", ".w"};
-    int buffer = -1;
-    for (int i = 0; buffer == -1 && i < (int) buffers.size(); i++) {
-        if (index*elementSize < buffers[i].getSize())
-            buffer = i;
-        else
-            index -= buffers[i].getSize()/elementSize;
-    }
-    if (buffer == -1)
-        throw OpenMMException("Internal error: Illegal argument to CudaParameterSet::getParameterSuffix() ("+name+")");
-    stringstream suffix;
-    suffix << (buffer+1) << extraSuffix;
-    if (buffers[buffer].getSize() != elementSize)
-        suffix << suffixes[index];
-    return suffix.str();
-}
-
-/**
- * Define template instantiations for float and double versions of getParameterValues() and setParameterValues().
- */
-namespace OpenMM {
-template OPENMM_EXPORT_CUDA void CudaParameterSet::getParameterValues<float>(vector<vector<float> >& values);
-template OPENMM_EXPORT_CUDA void CudaParameterSet::setParameterValues<float>(const vector<vector<float> >& values);
-template OPENMM_EXPORT_CUDA void CudaParameterSet::getParameterValues<double>(vector<vector<double> >& values);
-template OPENMM_EXPORT_CUDA void CudaParameterSet::setParameterValues<double>(const vector<vector<double> >& values);
-}
\ No newline at end of file
--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -51,12 +51,12 @@ using namespace std;
    }


-#ifdef OPENMM_CUDA_BUILDING_STATIC_LIBRARY
+#ifdef OPENMM_COMMON_BUILDING_STATIC_LIBRARY // when defined, only the unexported registerCudaPlatform() below is compiled
 extern "C" void registerCudaPlatform() {
    Platform::registerPlatform(new CudaPlatform());
 }
 #else
-extern "C" OPENMM_EXPORT_CUDA void registerPlatforms() {
+extern "C" OPENMM_EXPORT_COMMON void registerPlatforms() { // otherwise an exported, C-linkage entry point registers a new CudaPlatform
    Platform::registerPlatform(new CudaPlatform());
 }
 #endif
@@ -96,13 +96,15 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateVelocityVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
-    registerKernelFactory(IntegrateBAOABStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateLangevinMiddleStepKernel::Name(), factory);
    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
    registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
+    registerKernelFactory(NoseHooverChainKernel::Name(), factory);
    registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
    registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
    platformProperties.push_back(CudaDeviceIndex());

--- a/platforms/cuda/src/CudaForceInfo.cpp
+++ b/platforms/cuda/src/CudaForceInfo.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -24,23 +24,16 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-#include "CudaForceInfo.h"
+#include "CudaProgram.h"
+#include "CudaKernel.h"

 using namespace OpenMM;
 using namespace std;

-bool CudaForceInfo::areParticlesIdentical(int particle1, int particle2) {
-    return true;
+CudaProgram::CudaProgram(CudaContext& context, CUmodule module) : context(context), module(module) { // only stores the two arguments in the members of the same name
 }

-int CudaForceInfo::getNumParticleGroups() {
-    return 0;
-}
-
-void CudaForceInfo::getParticlesInGroup(int index, vector<int>& particles) {
-    return;
-}
-
-bool CudaForceInfo::areGroupsIdentical(int group1, int group2) {
-    return true;
-}
+ComputeKernel CudaProgram::createKernel(const string& name) { // builds a kernel object for the function called `name`
+    CUfunction kernel = context.getKernel(module, name.c_str()); // NOTE(review): presumably looks `name` up inside this program's module -- confirm in CudaContext::getKernel
+    return shared_ptr<ComputeKernelImpl>(new CudaKernel(context, kernel, name)); // new CudaKernel held by a shared_ptr<ComputeKernelImpl>, returned as ComputeKernel
+}
\ No newline at end of file
--- a/platforms/cuda/src/kernels/common.cu
+++ b/platforms/cuda/src/kernels/common.cu
+/**
+ * This file contains CUDA definitions for the macros and functions needed for the
+ * common compute framework.
+ *
+ * Each macro below expands to the CUDA spelling of a qualifier, a built-in index
+ * or an intrinsic; LOCAL_ARG and GLOBAL expand to nothing. The index and size
+ * macros (LOCAL_ID, GLOBAL_ID, ...) read only the .x component of the CUDA
+ * built-in variables.
+ *
+ * NOTE(review): the names presumably exist so that kernel source written against
+ * them can be shared with other back ends of the common framework -- confirm
+ * against the corresponding definitions for those back ends.
+ */
+
+#define KERNEL extern "C" __global__ // kernel entry point, declared with C linkage
+#define DEVICE __device__ // qualifier for device-side functions
+#define LOCAL __shared__ // CUDA shared-memory qualifier
+#define LOCAL_ARG // expands to nothing
+#define GLOBAL // expands to nothing
+#define RESTRICT __restrict__
+#define LOCAL_ID threadIdx.x // thread index within its block (x only)
+#define LOCAL_SIZE blockDim.x // threads per block (x only)
+#define GLOBAL_ID (blockIdx.x*blockDim.x+threadIdx.x) // flat thread index across the grid (x only)
+#define GLOBAL_SIZE (blockDim.x*gridDim.x) // total threads in the grid (x only)
+#define GROUP_ID blockIdx.x // block index (x only)
+#define NUM_GROUPS gridDim.x // number of blocks (x only)
+#define SYNC_THREADS __syncthreads(); // the expansion already ends in ';'
+#define MEM_FENCE __threadfence_block(); // the expansion already ends in ';'
+#define ATOMIC_ADD(dest, value) atomicAdd(dest, value) // forwards both arguments to atomicAdd
+
+typedef long long mm_long; // signed long long alias
+typedef unsigned long long mm_ulong; // unsigned long long alias
+
+#define SUPPORTS_64_BIT_ATOMICS 1 // unconditionally 1 in this file
+#define SUPPORTS_DOUBLE_PRECISION 1 // unconditionally 1 in this file