Converted RPMD plugin to common platform (#3079)

* Converted RPMD plugin to common platform * Merged RPMD tests for different platforms * Try to fix errors on CPU OpenCL

Converted RPMD plugin to common platform (#3079)
* Converted RPMD plugin to common platform * Merged RPMD tests for different platforms * Try to fix errors on CPU OpenCL
be19e022 · Peter Eastman · GitHub · 98d81730 · be19e022 · be19e022
Unverified Commit be19e022 authored Mar 22, 2021 by Peter Eastman Committed by GitHub Mar 22, 2021
20 changed files
--- a/platforms/common/include/openmm/common/ArrayInterface.h
+++ b/platforms/common/include/openmm/common/ArrayInterface.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Portions copyright (c) 2019-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -140,6 +140,17 @@ public:
     *                 in page-locked memory.
     */
    virtual void upload(const void* data, bool blocking=true) = 0;
+    /**
+     * Copy values from host memory to a subset of the array.
+     * 
+     * @param data     the data to copy
+     * @param offset   the index of the element within the array at which the copy should begin
+     * @param elements the number of elements to copy
+     * @param blocking if true, this call will block until the transfer is complete.  Subclasses often
+     *                 have restrictions on non-blocking copies, such as that the source data must be
+     *                 in page-locked memory.
+     */
+    virtual void uploadSubArray(const void* data, int offset, int elements, bool blocking=true) = 0;
    /**
     * Copy the values in the array to host memory.
     * 

--- a/platforms/common/include/openmm/common/ComputeArray.h
+++ b/platforms/common/include/openmm/common/ComputeArray.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Portions copyright (c) 2019-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -118,7 +118,20 @@ public:
     *                 have restrictions on non-blocking copies, such as that the source data must be
     *                 in page-locked memory.
     */
-    void upload(const void* data, bool blocking=true);
+    void upload(const void* data, bool blocking=true) {
+        uploadSubArray(data, 0, getSize(), blocking);
+    }
+    /**
+     * Copy values from host memory to a subset of the array.
+     * 
+     * @param data     the data to copy
+     * @param offset   the index of the element within the array at which the copy should begin
+     * @param elements the number of elements to copy
+     * @param blocking if true, this call will block until the transfer is complete.  Subclasses often
+     *                 have restrictions on non-blocking copies, such as that the source data must be
+     *                 in page-locked memory.
+     */
+    void uploadSubArray(const void* data, int offset, int elements, bool blocking=true);
    /**
     * Copy the values in the array to host memory.
     * 

--- a/platforms/common/src/ComputeArray.cpp
+++ b/platforms/common/src/ComputeArray.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Portions copyright (c) 2019-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -84,10 +84,10 @@ ComputeContext& ComputeArray::getContext() {
    return impl->getContext();
 }

-void ComputeArray::upload(const void* data, bool blocking) {
+void ComputeArray::uploadSubArray(const void* data, int offset, int elements, bool blocking) {
    if (impl == NULL)
        throw OpenMMException("ComputeArray has not been initialized");
-    impl->upload(data, blocking);
+    impl->uploadSubArray(data, offset, elements, blocking);
 }

 void ComputeArray::download(void* data, bool blocking) const {

--- a/platforms/cuda/include/CudaArray.h
+++ b/platforms/cuda/include/CudaArray.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -146,13 +146,25 @@ public:
        ArrayInterface::download(data);
    }
    /**
-     * Copy the values in an array to the device memory.
+     * Copy the values from host memory to the array.
     * 
     * @param data     the data to copy
     * @param blocking if true, this call will block until the transfer is complete.  If false,
     *                 the source array  must be in page-locked memory.
     */
-    void upload(const void* data, bool blocking=true);
+    void upload(const void* data, bool blocking=true) {
+        uploadSubArray(data, 0, getSize(), blocking);
+    }
+    /**
+     * Copy values from host memory to a subset of the array.
+     * 
+     * @param data     the data to copy
+     * @param offset   the index of the element within the array at which the copy should begin
+     * @param elements the number of elements to copy
+     * @param blocking if true, this call will block until the transfer is complete.  If false,
+     *                 the source array  must be in page-locked memory.
+     */
+    void uploadSubArray(const void* data, int offset, int elements, bool blocking=true);
    /**
     * Copy the values in the device memory to an array.
     * 

--- a/platforms/cuda/src/CudaArray.cpp
+++ b/platforms/cuda/src/CudaArray.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2012-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2012-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -86,14 +86,16 @@ ComputeContext& CudaArray::getContext() {
    return *context;
 }

-void CudaArray::upload(const void* data, bool blocking) {
+void CudaArray::uploadSubArray(const void* data, int offset, int elements, bool blocking) {
    if (pointer == 0)
        throw OpenMMException("CudaArray has not been initialized");
+    if (offset < 0 || offset+elements > getSize())
+        throw OpenMMException("uploadSubArray: data exceeds range of array");
    CUresult result;
    if (blocking)
-        result = cuMemcpyHtoD(pointer, data, size*elementSize);
+        result = cuMemcpyHtoD(pointer+offset*elementSize, data, elements*elementSize);
    else
-        result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, context->getCurrentStream());
+        result = cuMemcpyHtoDAsync(pointer+offset*elementSize, data, elements*elementSize, context->getCurrentStream());
    if (result != CUDA_SUCCESS) {
        std::stringstream str;
        str<<"Error uploading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";

--- a/platforms/opencl/include/OpenCLArray.h
+++ b/platforms/opencl/include/OpenCLArray.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -208,12 +208,23 @@ public:
        ArrayInterface::download(data);
    }
    /**
-     * Copy the values in an array to the Buffer.
+     * Copy the values from host memory to the array.
     * 
     * @param data     the data to copy
     * @param blocking if true, this call will block until the transfer is complete.
     */
-    void upload(const void* data, bool blocking=true);
+    void upload(const void* data, bool blocking=true) {
+        uploadSubArray(data, 0, getSize(), blocking);
+    }
+    /**
+     * Copy values from host memory to a subset of the array.
+     * 
+     * @param data     the data to copy
+     * @param offset   the index of the element within the array at which the copy should begin
+     * @param elements the number of elements to copy
+     * @param blocking if true, this call will block until the transfer is complete.
+     */
+    void uploadSubArray(const void* data, int offset, int elements, bool blocking=true);
    /**
     * Copy the values in the Buffer to an array.
     * 

--- a/platforms/opencl/src/OpenCLArray.cpp
+++ b/platforms/opencl/src/OpenCLArray.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2012-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2012-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -96,11 +96,13 @@ ComputeContext& OpenCLArray::getContext() {
    return *context;
 }

-void OpenCLArray::upload(const void* data, bool blocking) {
+void OpenCLArray::uploadSubArray(const void* data, int offset, int elements, bool blocking) {
    if (buffer == NULL)
        throw OpenMMException("OpenCLArray has not been initialized");
+    if (offset < 0 || offset+elements > getSize())
+        throw OpenMMException("uploadSubArray: data exceeds range of array");
    try {
-        context->getQueue().enqueueWriteBuffer(*buffer, blocking ? CL_TRUE : CL_FALSE, 0, size*elementSize, data);
+        context->getQueue().enqueueWriteBuffer(*buffer, blocking ? CL_TRUE : CL_FALSE, offset*elementSize, elements*elementSize, data);
    }
    catch (cl::Error err) {
        std::stringstream str;

--- a/plugins/rpmd/CMakeLists.txt
+++ b/plugins/rpmd/CMakeLists.txt
@@ -108,6 +108,7 @@ ENDIF(OPENMM_BUILD_STATIC_LIB)
 # Which hardware platforms to build

 ADD_SUBDIRECTORY(platforms/reference)
+ADD_SUBDIRECTORY(platforms/common)

 IF(OPENMM_BUILD_OPENCL_LIB)
    SET(OPENMM_BUILD_RPMD_OPENCL_LIB ON CACHE BOOL "Build RPMD implementation for OpenCL")

--- a/plugins/rpmd/openmmapi/include/openmm/RpmdKernels.h
+++ b/plugins/rpmd/openmmapi/include/openmm/RpmdKernels.h
@@ -69,11 +69,11 @@ public:
     */
    virtual void execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) = 0;
    /**
-     * Get the positions of all particles in one copy of the system.
+     * Set the positions of all particles in one copy of the system.
     */
    virtual void setPositions(int copy, const std::vector<Vec3>& positions) = 0;
    /**
-     * Get the velocities of all particles in one copy of the system.
+     * Set the velocities of all particles in one copy of the system.
     */
    virtual void setVelocities(int copy, const std::vector<Vec3>& velocities) = 0;
    /**

--- a/plugins/rpmd/platforms/common/CMakeLists.txt
+++ b/plugins/rpmd/platforms/common/CMakeLists.txt
+# Encode the kernel sources into a C++ class.
+
+SET(KERNEL_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
+SET(KERNEL_SOURCE_CLASS CommonRpmdKernelSources)
+SET(KERNELS_CPP ${CMAKE_CURRENT_BINARY_DIR}/src/${KERNEL_SOURCE_CLASS}.cpp)
+SET(KERNELS_H ${CMAKE_CURRENT_BINARY_DIR}/src/${KERNEL_SOURCE_CLASS}.h)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/src)
+FILE(GLOB COMMON_KERNELS ${KERNEL_SOURCE_DIR}/kernels/*.cc)
+ADD_CUSTOM_COMMAND(OUTPUT ${KERNELS_CPP} ${KERNELS_H}
+    COMMAND ${CMAKE_COMMAND}
+    ARGS -D KERNEL_SOURCE_DIR=${KERNEL_SOURCE_DIR} -D KERNELS_CPP=${KERNELS_CPP} -D KERNELS_H=${KERNELS_H} -D KERNEL_SOURCE_CLASS=${KERNEL_SOURCE_CLASS} -D KERNEL_FILE_EXTENSION=cc -P ${CMAKE_SOURCE_DIR}/cmake_modules/EncodeKernelFiles.cmake
+    DEPENDS ${COMMON_KERNELS}
+)
+SET_SOURCE_FILES_PROPERTIES(${KERNELS_CPP} ${KERNELS_H} PROPERTIES GENERATED TRUE)
+ADD_CUSTOM_TARGET(RpmdCommonKernels DEPENDS ${KERNELS_CPP} ${KERNELS_H})
--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernelSources.cpp.in
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernelSources.cpp.in
@@ -24,7 +24,7 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-#include "CudaRpmdKernelSources.h"
+#include "CommonRpmdKernelSources.h"

 using namespace OpenMM;
 using namespace std;

--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernelSources.h.in
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernelSources.h.in
-#ifndef OPENMM_CUDARPMDKERNELSOURCES_H_
-#define OPENMM_CUDARPMDKERNELSOURCES_H_
+#ifndef OPENMM_COMMONRPMDKERNELSOURCES_H_
+#define OPENMM_COMMONRPMDKERNELSOURCES_H_

 /* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
@@ -32,16 +32,16 @@
 namespace OpenMM {

 /**
- * This class is a central holding place for the source code of CUDA kernels.
- * The CMake build script inserts declarations into it based on the .cu files in the
+ * This class is a central holding place for the source code of device kernels.
+ * The CMake build script inserts declarations into it based on the .cc files in the
 * kernels subfolder.
 */

-class CudaRpmdKernelSources {
+class CommonRpmdKernelSources {
 public:
 @KERNEL_FILE_DECLARATIONS@
 };

 } // namespace OpenMM

-#endif /*OPENMM_CUDARPMDKERNELSOURCES_H_*/
+#endif /*OPENMM_COMMONRPMDKERNELSOURCES_H_*/
--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2020 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -29,13 +29,12 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

-#include "CudaRpmdKernels.h"
-#include "CudaRpmdKernelSources.h"
+#include "CommonRpmdKernels.h"
+#include "CommonRpmdKernelSources.h"
 #include "openmm/internal/ContextImpl.h"
-#include "CudaIntegrationUtilities.h"
-#include "CudaExpressionUtilities.h"
-#include "CudaKernelSources.h"
-#include "CudaNonbondedUtilities.h"
+#include "openmm/common/IntegrationUtilities.h"
+#include "openmm/common/ExpressionUtilities.h"
+#include "openmm/common/NonbondedUtilities.h"
 #include "SimTKOpenMMRealType.h"

 using namespace OpenMM;
@@ -62,39 +61,39 @@ static int findFFTDimension(int minimum) {
    }
 }

-void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
-    cu.getPlatformData().initializeContexts(system);
+void CommonIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
+    cc.initializeContexts();
    numCopies = integrator.getNumCopies();
    numParticles = system.getNumParticles();
    workgroupSize = numCopies;
    if (numCopies != findFFTDimension(numCopies))
        throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5.");
-    int paddedParticles = cu.getPaddedNumAtoms();
-    bool useDoublePrecision = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision());
-    int elementSize = (useDoublePrecision ? sizeof(double4) : sizeof(float4));
-    forces.initialize<long long>(cu, numCopies*paddedParticles*3, "rpmdForces");
-    positions.initialize(cu, numCopies*paddedParticles, elementSize, "rpmdPositions");
-    velocities.initialize(cu, numCopies*paddedParticles, elementSize, "rpmdVelocities");
-    cu.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
+    int paddedParticles = cc.getPaddedNumAtoms();
+    bool useDoublePrecision = (cc.getUseDoublePrecision() || cc.getUseMixedPrecision());
+    int elementSize = (useDoublePrecision ? sizeof(mm_double4) : sizeof(mm_float4));
+    forces.initialize<long long>(cc, numCopies*paddedParticles*3, "rpmdForces");
+    positions.initialize(cc, numCopies*paddedParticles, elementSize, "rpmdPositions");
+    velocities.initialize(cc, numCopies*paddedParticles, elementSize, "rpmdVelocities");
+    cc.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
    
    // Fill in the posq and velm arrays with safe values to avoid a risk of nans.
    
    if (useDoublePrecision) {
-        vector<double4> temp(positions.getSize());
+        vector<mm_double4> temp(positions.getSize());
        for (int i = 0; i < positions.getSize(); i++)
-            temp[i] = make_double4(0, 0, 0, 0);
+            temp[i] = mm_double4(0, 0, 0, 0);
        positions.upload(temp);
        for (int i = 0; i < velocities.getSize(); i++)
-            temp[i] = make_double4(0, 0, 0, 1);
+            temp[i] = mm_double4(0, 0, 0, 1);
        velocities.upload(temp);
    }
    else {
-        vector<float4> temp(positions.getSize());
+        vector<mm_float4> temp(positions.getSize());
        for (int i = 0; i < positions.getSize(); i++)
-            temp[i] = make_float4(0, 0, 0, 0);
+            temp[i] = mm_float4(0, 0, 0, 0);
        positions.upload(temp);
        for (int i = 0; i < velocities.getSize(); i++)
-            temp[i] = make_float4(0, 0, 0, 1);
+            temp[i] = mm_float4(0, 0, 0, 1);
        velocities.upload(temp);
    }
    
@@ -125,54 +124,99 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt
    }
    groupsNotContracted &= integrator.getIntegrationForceGroups();
    if (maxContractedCopies > 0) {
-        contractedForces.initialize<long long>(cu, maxContractedCopies*paddedParticles*3, "rpmdContractedForces");
-        contractedPositions.initialize(cu, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
+        contractedForces.initialize<long long>(cc, maxContractedCopies*paddedParticles*3, "rpmdContractedForces");
+        contractedPositions.initialize(cc, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
    }

    // Create kernels.
    
    map<string, string> defines;
-    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-    defines["NUM_COPIES"] = cu.intToString(numCopies);
-    defines["THREAD_BLOCK_SIZE"] = cu.intToString(workgroupSize);
-    defines["HBAR"] = cu.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12));
-    defines["SCALE"] = cu.doubleToString(1.0/sqrt((double) numCopies));
-    defines["M_PI"] = cu.doubleToString(M_PI);
+    defines["NUM_ATOMS"] = cc.intToString(cc.getNumAtoms());
+    defines["PADDED_NUM_ATOMS"] = cc.intToString(cc.getPaddedNumAtoms());
+    defines["NUM_COPIES"] = cc.intToString(numCopies);
+    defines["THREAD_BLOCK_SIZE"] = cc.intToString(workgroupSize);
+    defines["HBAR"] = cc.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12));
+    defines["SCALE"] = cc.doubleToString(1.0/sqrt((double) numCopies));
+    defines["M_PI"] = cc.doubleToString(M_PI);
    map<string, string> replacements;
    replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
    replacements["FFT_Q_BACKWARD"] = createFFT(numCopies, "q", false);
    replacements["FFT_V_FORWARD"] = createFFT(numCopies, "v", true);
    replacements["FFT_V_BACKWARD"] = createFFT(numCopies, "v", false);
-    CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaRpmdKernelSources::rpmd, replacements), defines, "");
-    pileKernel = cu.getKernel(module, "applyPileThermostat");
-    stepKernel = cu.getKernel(module, "integrateStep");
-    velocitiesKernel = cu.getKernel(module, "advanceVelocities");
-    copyToContextKernel = cu.getKernel(module, "copyDataToContext");
-    copyFromContextKernel = cu.getKernel(module, "copyDataFromContext");
-    translateKernel = cu.getKernel(module, "applyCellTranslations");
+    ComputeProgram program = cc.compileProgram(cc.replaceStrings(CommonRpmdKernelSources::rpmd, replacements), defines);
+    pileKernel = program->createKernel("applyPileThermostat");
+    stepKernel = program->createKernel("integrateStep");
+    velocitiesKernel = program->createKernel("advanceVelocities");
+    copyToContextKernel = program->createKernel("copyDataToContext");
+    copyFromContextKernel = program->createKernel("copyDataFromContext");
+    translateKernel = program->createKernel("applyCellTranslations");
    
    // Create kernels for doing contractions.
    
    for (auto& g : groupsByCopies) {
        int copies = g.first;
        replacements.clear();
-        replacements["NUM_CONTRACTED_COPIES"] = cu.intToString(copies);
-        replacements["POS_SCALE"] = cu.doubleToString(1.0/numCopies);
-        replacements["FORCE_SCALE"] = cu.doubleToString(0x100000000/(double) copies);
+        replacements["NUM_CONTRACTED_COPIES"] = cc.intToString(copies);
+        replacements["POS_SCALE"] = cc.doubleToString(1.0/numCopies);
+        replacements["FORCE_SCALE"] = cc.doubleToString(0x100000000/(double) copies);
        replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
        replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false);
        replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true);
        replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false);
-        module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaRpmdKernelSources::rpmdContraction, replacements), defines, "");
-        positionContractionKernels[copies] = cu.getKernel(module, "contractPositions");
-        forceContractionKernels[copies] = cu.getKernel(module, "contractForces");
+        program = cc.compileProgram(cc.replaceStrings(CommonRpmdKernelSources::rpmdContraction, replacements), defines);
+        positionContractionKernels[copies] = program->createKernel("contractPositions");
+        forceContractionKernels[copies] = program->createKernel("contractForces");
    }
 }

-void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
-    cu.setAsCurrent();
-    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
+void CommonIntegrateRPMDStepKernel::initializeKernels(ContextImpl& context) {
+    hasInitializedKernels = true;
+    pileKernel->addArg(velocities);
+    pileKernel->addArg(cc.getIntegrationUtilities().getRandom());
+    pileKernel->addArg();
+    pileKernel->addArg();
+    pileKernel->addArg();
+    pileKernel->addArg();
+    stepKernel->addArg(positions);
+    stepKernel->addArg(velocities);
+    stepKernel->addArg(forces);
+    stepKernel->addArg();
+    stepKernel->addArg();
+    velocitiesKernel->addArg(velocities);
+    velocitiesKernel->addArg(forces);
+    velocitiesKernel->addArg();
+    translateKernel->addArg(positions);
+    translateKernel->addArg(cc.getPosq());
+    translateKernel->addArg(cc.getAtomIndexArray());
+    translateKernel->addArg();
+    copyToContextKernel->addArg(velocities);
+    copyToContextKernel->addArg(cc.getVelm());
+    copyToContextKernel->addArg();
+    copyToContextKernel->addArg(cc.getPosq());
+    copyToContextKernel->addArg(cc.getAtomIndexArray());
+    copyToContextKernel->addArg();
+    copyFromContextKernel->addArg(cc.getLongForceBuffer());
+    copyFromContextKernel->addArg();
+    copyFromContextKernel->addArg(cc.getVelm());
+    copyFromContextKernel->addArg(velocities);
+    copyFromContextKernel->addArg(cc.getPosq());
+    copyFromContextKernel->addArg();
+    copyFromContextKernel->addArg(cc.getAtomIndexArray());
+    copyFromContextKernel->addArg();
+    for (auto& g : groupsByCopies) {
+        int copies = g.first;
+        positionContractionKernels[copies]->addArg(positions);
+        positionContractionKernels[copies]->addArg(contractedPositions);
+        forceContractionKernels[copies]->addArg(forces);
+        forceContractionKernels[copies]->addArg(contractedForces);
+    }
+}
+
+void CommonIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
+    cc.setAsCurrent();
+    if (!hasInitializedKernels)
+        initializeKernels(context);
+    IntegrationUtilities& integration = cc.getIntegrationUtilities();
    
    // Loop over copies and compute the force on each one.
    
@@ -181,25 +225,31 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
    
    // Apply the PILE-L thermostat.
    
-    bool useDoublePrecision = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision());
+    bool useDoublePrecision = (cc.getUseDoublePrecision() || cc.getUseMixedPrecision());
    double dt = integrator.getStepSize();
-    float dtFloat = (float) dt;
-    void* dtPtr = (useDoublePrecision ? (void*) &dt : (void*) &dtFloat);
-    double kT = integrator.getTemperature()*BOLTZ;
-    float kTFloat = (float) kT;
-    void* kTPtr = (useDoublePrecision ? (void*) &kT : (void*) &kTFloat);
-    double friction = integrator.getFriction();
-    float frictionFloat = (float) friction;
-    void* frictionPtr = (useDoublePrecision ? (void*) &friction : (void*) &frictionFloat);
-    int randomIndex = integration.prepareRandomNumbers(numParticles*numCopies);
-    void* pileArgs[] = {&velocities.getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex, dtPtr, kTPtr, frictionPtr};
+    pileKernel->setArg(2, integration.prepareRandomNumbers(numParticles*numCopies));
+    if (useDoublePrecision) {
+        pileKernel->setArg(3, dt);
+        pileKernel->setArg(4, integrator.getTemperature()*BOLTZ);
+        pileKernel->setArg(5, integrator.getFriction());
+        stepKernel->setArg(3, dt);
+        stepKernel->setArg(4, integrator.getTemperature()*BOLTZ);
+        velocitiesKernel->setArg(2, dt);
+    }
+    else {
+        pileKernel->setArg(3, (float) dt);
+        pileKernel->setArg(4, (float) (integrator.getTemperature()*BOLTZ));
+        pileKernel->setArg(5, (float) integrator.getFriction());
+        stepKernel->setArg(3, (float) dt);
+        stepKernel->setArg(4, (float) (integrator.getTemperature()*BOLTZ));
+        velocitiesKernel->setArg(2, (float) dt);
+    }
    if (integrator.getApplyThermostat())
-        cu.executeKernel(pileKernel, pileArgs, numParticles*numCopies, workgroupSize);
+        pileKernel->execute(numParticles*numCopies, workgroupSize);

    // Update positions and velocities.
    
-    void* stepArgs[] = {&positions.getDevicePointer(), &velocities.getDevicePointer(), &forces.getDevicePointer(), dtPtr, kTPtr};
-    cu.executeKernel(stepKernel, stepArgs, numParticles*numCopies, workgroupSize);
+    stepKernel->execute(numParticles*numCopies, workgroupSize);

    // Calculate forces based on the updated positions.
    
@@ -207,38 +257,38 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
    
    // Update velocities.

-    void* velocitiesArgs[] = {&velocities.getDevicePointer(), &forces.getDevicePointer(), dtPtr};
-    cu.executeKernel(velocitiesKernel, velocitiesArgs, numParticles*numCopies, workgroupSize);
+    velocitiesKernel->execute(numParticles*numCopies, workgroupSize);

    // Apply the PILE-L thermostat again.

    if (integrator.getApplyThermostat()) {
-        randomIndex = integration.prepareRandomNumbers(numParticles*numCopies);
-        cu.executeKernel(pileKernel, pileArgs, numParticles*numCopies, workgroupSize);
+        pileKernel->setArg(2, integration.prepareRandomNumbers(numParticles*numCopies));
+        pileKernel->execute(numParticles*numCopies, workgroupSize);
    }

    // Update the time and step count.

-    cu.setTime(cu.getTime()+dt);
-    cu.setStepCount(cu.getStepCount()+1);
-    cu.reorderAtoms();
-    if (cu.getAtomsWereReordered() && cu.getNonbondedUtilities().getUsePeriodic()) {
+    cc.setTime(cc.getTime()+dt);
+    cc.setStepCount(cc.getStepCount()+1);
+    cc.reorderAtoms();
+    if (cc.getAtomsWereReordered() && cc.getNonbondedUtilities().getUsePeriodic()) {
        // Atoms may have been translated into a different periodic box, so apply
        // the same translation to all the beads.

-        int i = numCopies-1;
-        void* args[] = {&positions.getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
-        cu.executeKernel(translateKernel, args, cu.getNumAtoms());
+        translateKernel->setArg(3, numCopies-1);
+        translateKernel->execute(cc.getNumAtoms());
    }
 }

-void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
+void CommonIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
    // Compute forces from all groups that didn't have a specified contraction.

+    copyToContextKernel->setArg(2, positions);
+    copyFromContextKernel->setArg(1, forces);
+    copyFromContextKernel->setArg(5, positions);
    for (int i = 0; i < numCopies; i++) {
-        void* copyToContextArgs[] = {&velocities.getDevicePointer(), &cu.getVelm().getDevicePointer(), &positions.getDevicePointer(),
-                &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
-        cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms());
+        copyToContextKernel->setArg(5, i);
+        copyToContextKernel->execute(cc.getNumAtoms());
        context.computeVirtualSites();
        Vec3 initialBox[3];
        context.getPeriodicBoxVectors(initialBox[0], initialBox[1], initialBox[2]);
@@ -248,55 +298,54 @@ void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
        if (initialBox[0] != finalBox[0] || initialBox[1] != finalBox[1] || initialBox[2] != finalBox[2])
            throw OpenMMException("Standard barostats cannot be used with RPMDIntegrator.  Use RPMDMonteCarloBarostat instead.");
        context.calcForcesAndEnergy(true, false, groupsNotContracted);
-        void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &forces.getDevicePointer(), &cu.getVelm().getDevicePointer(),
-                &velocities.getDevicePointer(), &cu.getPosq().getDevicePointer(), &positions.getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
-        cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms());
+        copyFromContextKernel->setArg(7, i);
+        copyFromContextKernel->execute(cc.getNumAtoms());
    }
    
    // Now loop over contractions and compute forces from them.
    
-    for (auto& g : groupsByCopies) {
-        int copies = g.first;
-        int groupFlags = g.second;
-        
-        // Find the contracted positions.
-        
-        void* contractPosArgs[] = {&positions.getDevicePointer(), &contractedPositions.getDevicePointer()};
-        cu.executeKernel(positionContractionKernels[copies], contractPosArgs, numParticles*numCopies, workgroupSize);
-
-        // Compute forces.
-
-        for (int i = 0; i < copies; i++) {
-            void* copyToContextArgs[] = {&velocities.getDevicePointer(), &cu.getVelm().getDevicePointer(), &contractedPositions.getDevicePointer(),
-                    &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
-            cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms());
-            context.computeVirtualSites();
-            context.calcForcesAndEnergy(true, false, groupFlags);
-            void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &contractedForces.getDevicePointer(), &cu.getVelm().getDevicePointer(),
-                   &velocities.getDevicePointer(), &cu.getPosq().getDevicePointer(), &contractedPositions.getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
-            cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms());
+    if (groupsByCopies.size() > 0) {
+        copyToContextKernel->setArg(2, contractedPositions);
+        copyFromContextKernel->setArg(1, contractedForces);
+        copyFromContextKernel->setArg(5, contractedPositions);
+        for (auto& g : groupsByCopies) {
+            int copies = g.first;
+            int groupFlags = g.second;
+
+            // Find the contracted positions.
+
+           positionContractionKernels[copies]->execute(numParticles*numCopies, workgroupSize);
+
+            // Compute forces.
+
+            for (int i = 0; i < copies; i++) {
+                copyToContextKernel->setArg(5, i);
+                copyToContextKernel->execute(cc.getNumAtoms());
+                context.computeVirtualSites();
+                context.calcForcesAndEnergy(true, false, groupFlags);
+                copyFromContextKernel->setArg(7, i);
+                copyFromContextKernel->execute(cc.getNumAtoms());
+            }
+
+            // Apply the forces to the original copies.
+
+            forceContractionKernels[copies]->execute(numParticles*numCopies, workgroupSize);
        }
-        
-        // Apply the forces to the original copies.
-        
-        void* contractForceArgs[] = {&forces.getDevicePointer(), &contractedForces.getDevicePointer()};
-        cu.executeKernel(forceContractionKernels[copies], contractForceArgs, numParticles*numCopies, workgroupSize);
    }
    if (groupsByCopies.size() > 0) {
        // Ensure the Context contains the positions from the last copy, since we'll assume that later.
        
-        int i = numCopies-1;
-        void* copyToContextArgs[] = {&velocities.getDevicePointer(), &cu.getVelm().getDevicePointer(), &positions.getDevicePointer(),
-                &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
-        cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms());
+        copyToContextKernel->setArg(2, positions);
+        copyToContextKernel->setArg(5, numCopies-1);
+        copyToContextKernel->execute(cc.getNumAtoms());
    }
 }

-double CudaIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {
-    return cu.getIntegrationUtilities().computeKineticEnergy(0);
+double CommonIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {
+    return cc.getIntegrationUtilities().computeKineticEnergy(0);
 }

-void CudaIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos) {
+void CommonIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos) {
    if (!positions.isInitialized())
        throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context");
    if (pos.size() != numParticles)
@@ -304,80 +353,71 @@ void CudaIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos

    // Adjust the positions based on the current cell offsets.
    
-    const vector<int>& order = cu.getAtomIndex();
-    double4 periodicBoxSize = cu.getPeriodicBoxSize();
+    const vector<int>& order = cc.getAtomIndex();
+    Vec3 a, b, c;
+    cc.getPeriodicBoxVectors(a, b, c);
    vector<Vec3> offsetPos(numParticles);
    for (int i = 0; i < numParticles; ++i) {
-        mm_int4 offset = cu.getPosCellOffsets()[i];
-        offsetPos[order[i]] = pos[order[i]] + Vec3(offset.x*periodicBoxSize.x, offset.y*periodicBoxSize.y, offset.z*periodicBoxSize.z);
+        mm_int4 offset = cc.getPosCellOffsets()[i];
+        offsetPos[order[i]] = pos[order[i]] + Vec3(offset.x*a[0], offset.y*b[1], offset.z*c[2]);
    }

    // Record the positions.

-    CUresult result;
-    if (cu.getUseDoublePrecision()) {
-        vector<double4> posq(cu.getPaddedNumAtoms());
-        cu.getPosq().download(posq);
+    if (cc.getUseDoublePrecision()) {
+        vector<mm_double4> posq(cc.getPaddedNumAtoms());
+        cc.getPosq().download(posq);
        for (int i = 0; i < numParticles; i++)
-            posq[i] = make_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posq[i].w);
-        result = cuMemcpyHtoD(positions.getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &posq[0], numParticles*sizeof(double4));
+            posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posq[i].w);
+        positions.uploadSubArray(&posq[0], copy*cc.getPaddedNumAtoms(), numParticles);
    }
-    else if (cu.getUseMixedPrecision()) {
-        vector<float4> posqf(cu.getPaddedNumAtoms());
-        cu.getPosq().download(posqf);
-        vector<double4> posq(cu.getPaddedNumAtoms());
+    else if (cc.getUseMixedPrecision()) {
+        vector<mm_float4> posqf(cc.getPaddedNumAtoms());
+        cc.getPosq().download(posqf);
+        vector<mm_double4> posq(cc.getPaddedNumAtoms());
        for (int i = 0; i < numParticles; i++)
-            posq[i] = make_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posqf[i].w);
-        result = cuMemcpyHtoD(positions.getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &posq[0], numParticles*sizeof(double4));
+            posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posqf[i].w);
+        positions.uploadSubArray(&posq[0], copy*cc.getPaddedNumAtoms(), numParticles);
    }
    else {
-        vector<float4> posq(cu.getPaddedNumAtoms());
-        cu.getPosq().download(posq);
+        vector<mm_float4> posq(cc.getPaddedNumAtoms());
+        cc.getPosq().download(posq);
        for (int i = 0; i < numParticles; i++)
-            posq[i] = make_float4((float) offsetPos[i][0], (float) offsetPos[i][1], (float) offsetPos[i][2], posq[i].w);
-        result = cuMemcpyHtoD(positions.getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(float4), &posq[0], numParticles*sizeof(float4));
-    }
-    if (result != CUDA_SUCCESS) {
-        std::stringstream str;
-        str<<"Error uploading array "<<positions.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
-        throw OpenMMException(str.str());
+            posq[i] = mm_float4((float) offsetPos[i][0], (float) offsetPos[i][1], (float) offsetPos[i][2], posq[i].w);
+        positions.uploadSubArray(&posq[0], copy*cc.getPaddedNumAtoms(), numParticles);
    }
 }

-void CudaIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) {
+void CommonIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) {
    if (!velocities.isInitialized())
        throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context");
    if (vel.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()");
-    CUresult result;
-    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
-        vector<double4> velm(cu.getPaddedNumAtoms());
-        cu.getVelm().download(velm);
+    if (cc.getUseDoublePrecision() || cc.getUseMixedPrecision()) {
+        vector<mm_double4> velm(cc.getPaddedNumAtoms());
+        cc.getVelm().download(velm);
        for (int i = 0; i < numParticles; i++)
-            velm[i] = make_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w);
-        result = cuMemcpyHtoD(velocities.getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &velm[0], numParticles*sizeof(double4));
+            velm[i] = mm_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w);
+        velocities.uploadSubArray(&velm[0], copy*cc.getPaddedNumAtoms(), numParticles);
    }
    else {
-        vector<float4> velm(cu.getPaddedNumAtoms());
-        cu.getVelm().download(velm);
+        vector<mm_float4> velm(cc.getPaddedNumAtoms());
+        cc.getVelm().download(velm);
        for (int i = 0; i < numParticles; i++)
-            velm[i] = make_float4((float) vel[i][0], (float) vel[i][1], (float) vel[i][2], velm[i].w);
-        result = cuMemcpyHtoD(velocities.getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(float4), &velm[0], numParticles*sizeof(float4));
-    }
-    if (result != CUDA_SUCCESS) {
-        std::stringstream str;
-        str<<"Error uploading array "<<velocities.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
-        throw OpenMMException(str.str());
+            velm[i] = mm_float4((float) vel[i][0], (float) vel[i][1], (float) vel[i][2], velm[i].w);
+        velocities.uploadSubArray(&velm[0], copy*cc.getPaddedNumAtoms(), numParticles);
    }
 }

-void CudaIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
-    void* copyArgs[] = {&velocities.getDevicePointer(), &cu.getVelm().getDevicePointer(), &positions.getDevicePointer(),
-            &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &copy};
-    cu.executeKernel(copyToContextKernel, copyArgs, cu.getNumAtoms());
+void CommonIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
+    if (!hasInitializedKernels)
+        initializeKernels(context);
+    copyToContextKernel->setArg(2, positions);
+    copyToContextKernel->setArg(5, copy);
+    copyToContextKernel->execute(cc.getNumAtoms());
 }

-string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) {
+string CommonIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) {
    stringstream source;
    int stage = 0;
    int L = size;
@@ -387,10 +427,10 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
    string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj");

    source<<"{\n";
-    source<<"mixed3* real0 = "<<variable<<"real;\n";
-    source<<"mixed3* imag0 = "<<variable<<"imag;\n";
-    source<<"mixed3* real1 = &temp[blockStart];\n";
-    source<<"mixed3* imag1 = &temp[blockStart+blockDim.x];\n";
+    source<<"LOCAL_ARG mixed3* real0 = "<<variable<<"real;\n";
+    source<<"LOCAL_ARG mixed3* imag0 = "<<variable<<"imag;\n";
+    source<<"LOCAL_ARG mixed3* real1 = &temp[blockStart];\n";
+    source<<"LOCAL_ARG mixed3* imag1 = &temp[blockStart+LOCAL_SIZE];\n";

    // Factor size, generating an appropriate block of code for each factor.

@@ -407,7 +447,7 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
        else if (L%2 == 0)
            radix = 2;
        else
-            throw OpenMMException("Illegal size for FFT: "+cu.intToString(size));
+            throw OpenMMException("Illegal size for FFT: "+cc.intToString(size));
        source<<"{\n";
        L = L/radix;
        source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n";
@@ -429,21 +469,21 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
            source<<"mixed3 d0i = c1i+c4i;\n";
            source<<"mixed3 d1r = c2r+c3r;\n";
            source<<"mixed3 d1i = c2i+c3i;\n";
-            source<<"mixed3 d2r = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n";
-            source<<"mixed3 d2i = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n";
-            source<<"mixed3 d3r = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n";
-            source<<"mixed3 d3i = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n";
+            source<<"mixed3 d2r = "<<cc.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n";
+            source<<"mixed3 d2i = "<<cc.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n";
+            source<<"mixed3 d3r = "<<cc.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n";
+            source<<"mixed3 d3i = "<<cc.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n";
            source<<"mixed3 d4r = d0r+d1r;\n";
            source<<"mixed3 d4i = d0i+d1i;\n";
-            source<<"mixed3 d5r = "<<cu.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n";
-            source<<"mixed3 d5i = "<<cu.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n";
+            source<<"mixed3 d5r = "<<cc.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n";
+            source<<"mixed3 d5i = "<<cc.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n";
            source<<"mixed3 d6r = c0r-0.25f*d4r;\n";
            source<<"mixed3 d6i = c0i-0.25f*d4i;\n";
            source<<"mixed3 d7r = d6r+d5r;\n";
            source<<"mixed3 d7i = d6i+d5i;\n";
            source<<"mixed3 d8r = d6r-d5r;\n";
            source<<"mixed3 d8i = d6i-d5i;\n";
-            string coeff = cu.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
+            string coeff = cc.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
            source<<"mixed3 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n";
            source<<"mixed3 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n";
            source<<"mixed3 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n";
@@ -496,8 +536,8 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
            source<<"mixed3 d0i = c1i+c2i;\n";
            source<<"mixed3 d1r = c0r-0.5f*d0r;\n";
            source<<"mixed3 d1i = c0i-0.5f*d0i;\n";
-            source<<"mixed3 d2r = "<<sign<<"*"<<cu.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n";
-            source<<"mixed3 d2i = "<<sign<<"*"<<cu.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n";
+            source<<"mixed3 d2r = "<<sign<<"*"<<cc.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n";
+            source<<"mixed3 d2i = "<<sign<<"*"<<cc.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n";
            source<<"real"<<output<<"[i+2*j*"<<m<<"] = c0r+d0r;\n";
            source<<"imag"<<output<<"[i+2*j*"<<m<<"] = c0i+d0i;\n";
            source<<"real"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
@@ -517,7 +557,7 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
        }
        source<<"}\n";
        m = m*radix;
-        source<<"__syncthreads();\n";
+        source<<"SYNC_THREADS;\n";
        source<<"}\n";
        ++stage;
    }
@@ -527,6 +567,7 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
    if (stage%2 == 1) {
        source<<"real0[indexInBlock] = real1[indexInBlock];\n";
        source<<"imag0[indexInBlock] = imag1[indexInBlock];\n";
+        source<<"SYNC_WARPS;\n";
    }
    source<<"}\n";
    return source.str();

--- a/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.h
+++ b/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.h
-#ifndef OPENCL_RPMD_KERNELS_H_
-#define OPENCL_RPMD_KERNELS_H_
+#ifndef COMMON_RPMD_KERNELS_H_
+#define COMMON_RPMD_KERNELS_H_

 /* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -33,19 +33,20 @@
 * -------------------------------------------------------------------------- */

 #include "openmm/RpmdKernels.h"
-#include "OpenCLContext.h"
-#include "OpenCLArray.h"
+#include "openmm/common/ComputeContext.h"
+#include "openmm/common/ComputeArray.h"
 #include <map>
+
 namespace OpenMM {

 /**
 * This kernel is invoked by RPMDIntegrator to take one time step, and to get and
 * set the state of system copies.
 */
-class OpenCLIntegrateRPMDStepKernel : public IntegrateRPMDStepKernel {
+class CommonIntegrateRPMDStepKernel : public IntegrateRPMDStepKernel {
 public:
-    OpenCLIntegrateRPMDStepKernel(const std::string& name, const Platform& platform, OpenCLContext& cl) :
-            IntegrateRPMDStepKernel(name, platform), cl(cl), hasInitializedKernel(false) {
+    CommonIntegrateRPMDStepKernel(const std::string& name, const Platform& platform, ComputeContext& cc) :
+            IntegrateRPMDStepKernel(name, platform), cc(cc), hasInitializedKernels(false) {
    }
    /**
     * Initialize the kernel.
@@ -71,11 +72,11 @@ public:
     */
    double computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator);
    /**
-     * Get the positions of all particles in one copy of the system.
+     * Set the positions of all particles in one copy of the system.
     */
    void setPositions(int copy, const std::vector<Vec3>& positions);
    /**
-     * Get the velocities of all particles in one copy of the system.
+     * Set the velocities of all particles in one copy of the system.
     */
    void setVelocities(int copy, const std::vector<Vec3>& velocities);
    /**
@@ -86,21 +87,21 @@ private:
    void initializeKernels(ContextImpl& context);
    void computeForces(ContextImpl& context);
    std::string createFFT(int size, const std::string& variable, bool forward);
-    OpenCLContext& cl;
-    bool hasInitializedKernel;
+    ComputeContext& cc;
+    bool hasInitializedKernels;
    int numCopies, numParticles, workgroupSize;
    std::map<int, int> groupsByCopies;
    int groupsNotContracted;
-    OpenCLArray forces;
-    OpenCLArray positions;
-    OpenCLArray velocities;
-    OpenCLArray contractedForces;
-    OpenCLArray contractedPositions;
-    cl::Kernel pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
-    std::map<int, cl::Kernel> positionContractionKernels;
-    std::map<int, cl::Kernel> forceContractionKernels;
+    ComputeArray forces;
+    ComputeArray positions;
+    ComputeArray velocities;
+    ComputeArray contractedForces;
+    ComputeArray contractedPositions;
+    ComputeKernel pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
+    std::map<int, ComputeKernel> positionContractionKernels;
+    std::map<int, ComputeKernel> forceContractionKernels;
 };

 } // namespace OpenMM

-#endif /*OPENCL_RPMD_KERNELS_H_*/
+#endif /*COMMON_RPMD_KERNELS_H_*/
--- a/plugins/rpmd/platforms/cuda/src/kernels/rpmd.cu
+++ b/plugins/rpmd/platforms/cuda/src/kernels/rpmd.cu
-__device__ mixed3 multiplyComplexRealPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+DEVICE mixed3 multiplyComplexRealPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2r-c1.y*c2i;
 }

-__device__ mixed3 multiplyComplexImagPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+DEVICE mixed3 multiplyComplexImagPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2i+c1.y*c2r;
 }

-__device__ mixed3 multiplyComplexRealPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+DEVICE mixed3 multiplyComplexRealPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2r+c1.y*c2i;
 }

-__device__ mixed3 multiplyComplexImagPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+DEVICE mixed3 multiplyComplexImagPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2i-c1.y*c2r;
 }

 /**
 * Apply the PILE-L thermostat.
 */
-extern "C" __global__ void applyPileThermostat(mixed4* velm, float4* random, unsigned int randomIndex,
+KERNEL void applyPileThermostat(GLOBAL mixed4* velm, GLOBAL float4* random, unsigned int randomIndex,
        mixed dt, mixed kT, mixed friction) {
-    const int numBlocks = blockDim.x*gridDim.x/NUM_COPIES;
-    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
-    const int indexInBlock = threadIdx.x-blockStart;
+    const int numBlocks = GLOBAL_SIZE/NUM_COPIES;
+    const int blockStart = NUM_COPIES*(LOCAL_ID/NUM_COPIES);
+    const int indexInBlock = LOCAL_ID-blockStart;
    const mixed nkT = NUM_COPIES*kT;
    const mixed twown = 2.0f*nkT/HBAR;
-    const mixed c1_0 = EXP(-0.5f*dt*friction);
-    const mixed c2_0 = SQRT(1.0f-c1_0*c1_0);
-    __shared__ mixed3 v[2*THREAD_BLOCK_SIZE];
-    __shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
-    __shared__ mixed2 w[NUM_COPIES];
-    mixed3* vreal = &v[blockStart];
-    mixed3* vimag = &v[blockStart+blockDim.x];
-    if (threadIdx.x < NUM_COPIES)
+    const mixed c1_0 = exp(-0.5f*dt*friction);
+    const mixed c2_0 = sqrt(1.0f-c1_0*c1_0);
+    LOCAL mixed3 v[2*THREAD_BLOCK_SIZE];
+    LOCAL mixed3 temp[2*THREAD_BLOCK_SIZE];
+    LOCAL mixed2 w[NUM_COPIES];
+    LOCAL_ARG mixed3* vreal = &v[blockStart];
+    LOCAL_ARG mixed3* vimag = &v[blockStart+LOCAL_SIZE];
+    if (LOCAL_ID < NUM_COPIES)
        w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
-    __syncthreads();
-    randomIndex += NUM_COPIES*((blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES);
-    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+    SYNC_THREADS;
+    randomIndex += NUM_COPIES*((GLOBAL_ID)/NUM_COPIES);
+    for (int particle = (GLOBAL_ID)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
        mixed invMass = particleVelm.w;
-        mixed c3_0 = c2_0*SQRT(nkT*invMass);
+        mixed c3_0 = c2_0*sqrt(nkT*invMass);
        
        // Forward FFT.
        
        vreal[indexInBlock] = SCALE*make_mixed3(particleVelm.x, particleVelm.y, particleVelm.z);
        vimag[indexInBlock] = make_mixed3(0);
-        __syncthreads();
+        SYNC_THREADS;
        FFT_V_FORWARD

        // Apply the thermostat.
@@ -61,43 +61,43 @@ extern "C" __global__ void applyPileThermostat(mixed4* velm, float4* random, uns
            int k = (indexInBlock <= NUM_COPIES/2 ? indexInBlock : NUM_COPIES-indexInBlock);
            const bool isCenter = (NUM_COPIES%2 == 0 && k == NUM_COPIES/2);
            const mixed wk = twown*sin(k*M_PI/NUM_COPIES);
-            const mixed c1 = EXP(-wk*dt);
-            const mixed c2 = SQRT((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f);
-            const mixed c3 = c2*SQRT(nkT*invMass);
+            const mixed c1 = exp(-wk*dt);
+            const mixed c2 = sqrt((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f);
+            const mixed c3 = c2*sqrt(nkT*invMass);
            float4 rand1 = random[randomIndex+k];
            float4 rand2 = (isCenter ? make_float4(0) : random[randomIndex+NUM_COPIES-k]);
            vreal[indexInBlock] = c1*vreal[indexInBlock] + c3*make_mixed3(rand1.x, rand1.y, rand1.z);
            vimag[indexInBlock] = c1*vimag[indexInBlock] + c3*(indexInBlock < NUM_COPIES/2 ? make_mixed3(rand2.x, rand2.y, rand2.z) : make_mixed3(-rand2.x, -rand2.y, -rand2.z));
        }
-        __syncthreads();
+        SYNC_THREADS;
        
        // Inverse FFT.
        
        FFT_V_BACKWARD
        if (invMass != 0)
            velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w);
-        randomIndex += blockDim.x*gridDim.x;
+        randomIndex += GLOBAL_SIZE;
    }
 }

 /**
 * Advance the positions and velocities.
 */
-extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long* force, mixed dt, mixed kT) {
-    const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
-    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
-    const int indexInBlock = threadIdx.x-blockStart;
+KERNEL void integrateStep(GLOBAL mixed4* posq, GLOBAL mixed4* velm, GLOBAL mm_long* force, mixed dt, mixed kT) {
+    const int numBlocks = (GLOBAL_SIZE)/NUM_COPIES;
+    const int blockStart = NUM_COPIES*(LOCAL_ID/NUM_COPIES);
+    const int indexInBlock = LOCAL_ID-blockStart;
    const mixed nkT = NUM_COPIES*kT;
    const mixed twown = 2.0f*nkT/HBAR;
    const mixed forceScale = 1/(mixed) 0x100000000;
-    __shared__ mixed3 q[2*THREAD_BLOCK_SIZE];
-    __shared__ mixed3 v[2*THREAD_BLOCK_SIZE];
-    __shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
-    __shared__ mixed2 w[NUM_COPIES];
+    LOCAL mixed3 q[2*THREAD_BLOCK_SIZE];
+    LOCAL mixed3 v[2*THREAD_BLOCK_SIZE];
+    LOCAL mixed3 temp[2*THREAD_BLOCK_SIZE];
+    LOCAL mixed2 w[NUM_COPIES];

    // Update velocities.
    
-    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+    for (int particle = (GLOBAL_ID)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        int index = particle+indexInBlock*PADDED_NUM_ATOMS;
        int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
        mixed4 particleVelm = velm[index];
@@ -110,14 +110,14 @@ extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long*
    
    // Evolve the free ring polymer by transforming to the frequency domain.

-    mixed3* qreal = &q[blockStart];
-    mixed3* qimag = &q[blockStart+blockDim.x];
-    mixed3* vreal = &v[blockStart];
-    mixed3* vimag = &v[blockStart+blockDim.x];
-    if (threadIdx.x < NUM_COPIES)
+    LOCAL_ARG mixed3* qreal = &q[blockStart];
+    LOCAL_ARG mixed3* qimag = &q[blockStart+LOCAL_SIZE];
+    LOCAL_ARG mixed3* vreal = &v[blockStart];
+    LOCAL_ARG mixed3* vimag = &v[blockStart+LOCAL_SIZE];
+    if (LOCAL_ID < NUM_COPIES)
        w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
-    __syncthreads();
-    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+    SYNC_THREADS;
+    for (int particle = (GLOBAL_ID)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
        mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
        
@@ -127,7 +127,7 @@ extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long*
        qimag[indexInBlock] = make_mixed3(0);
        vreal[indexInBlock] = SCALE*make_mixed3(particleVelm.x, particleVelm.y, particleVelm.z);
        vimag[indexInBlock] = make_mixed3(0);
-        __syncthreads();
+        SYNC_THREADS;
        FFT_Q_FORWARD
        FFT_V_FORWARD

@@ -149,7 +149,7 @@ extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long*
            vreal[indexInBlock] = vprimereal;
            vimag[indexInBlock] = vprimeimag;
        }
-        __syncthreads();
+        SYNC_THREADS;
        
        // Inverse FFT.
        
@@ -165,15 +165,15 @@ extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long*
 /**
 * Advance the velocities by a half step.
 */
-extern "C" __global__ void advanceVelocities(mixed4* velm, long long* force, mixed dt) {
-    const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
-    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
-    const int indexInBlock = threadIdx.x-blockStart;
+KERNEL void advanceVelocities(GLOBAL mixed4* velm, GLOBAL mm_long* force, mixed dt) {
+    const int numBlocks = (GLOBAL_SIZE)/NUM_COPIES;
+    const int blockStart = NUM_COPIES*(LOCAL_ID/NUM_COPIES);
+    const int indexInBlock = LOCAL_ID-blockStart;
    const mixed forceScale = 1/(mixed) 0x100000000;

    // Update velocities.
    
-    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+    for (int particle = (GLOBAL_ID)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        int index = particle+indexInBlock*PADDED_NUM_ATOMS;
        int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
        mixed4 particleVelm = velm[index];
@@ -188,9 +188,9 @@ extern "C" __global__ void advanceVelocities(mixed4* velm, long long* force, mix
 /**
 * Copy a set of positions and velocities from the integrator's arrays to the context.
 */
-extern "C" __global__ void copyDataToContext(mixed4* srcVel, mixed4* dstVel, mixed4* srcPos, real4* dstPos, int* order, int copy) {
+KERNEL void copyDataToContext(GLOBAL mixed4* srcVel, GLOBAL mixed4* dstVel, GLOBAL mixed4* srcPos, GLOBAL real4* dstPos, GLOBAL int* order, int copy) {
    const int base = copy*PADDED_NUM_ATOMS;
-    for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
+    for (int particle = GLOBAL_ID; particle < NUM_ATOMS; particle += GLOBAL_SIZE) {
        int index = base+order[particle];
        dstVel[particle] = srcVel[index];
        mixed4 posq = srcPos[index];
@@ -203,10 +203,10 @@ extern "C" __global__ void copyDataToContext(mixed4* srcVel, mixed4* dstVel, mix
 /**
 * Copy a set of positions, velocities, and forces from the context to the integrator's arrays.
 */
-extern "C" __global__ void copyDataFromContext(long long* srcForce, long long* dstForce, mixed4* srcVel, mixed4* dstVel,
-        real4* srcPos, mixed4* dstPos, int* order, int copy) {
+KERNEL void copyDataFromContext(GLOBAL mm_long* srcForce, GLOBAL mm_long* dstForce, GLOBAL mixed4* srcVel, GLOBAL mixed4* dstVel,
+        GLOBAL real4* srcPos, GLOBAL mixed4* dstPos, GLOBAL int* order, int copy) {
    const int base = copy*PADDED_NUM_ATOMS;
-    for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
+    for (int particle = GLOBAL_ID; particle < NUM_ATOMS; particle += GLOBAL_SIZE) {
        int index = order[particle];
        dstForce[base*3+index] = srcForce[particle];
        dstForce[base*3+index+PADDED_NUM_ATOMS] = srcForce[particle+PADDED_NUM_ATOMS];
@@ -223,8 +223,8 @@ extern "C" __global__ void copyDataFromContext(long long* srcForce, long long* d
 /**
 * Atom positions in one copy have been modified.  Apply the same offsets to all the other copies.
 */
-extern "C" __global__ void applyCellTranslations(mixed4* posq, real4* movedPos, int* order, int movedCopy) {
-    for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
+KERNEL void applyCellTranslations(GLOBAL mixed4* posq, GLOBAL real4* movedPos, GLOBAL int* order, int movedCopy) {
+    for (int particle = GLOBAL_ID; particle < NUM_ATOMS; particle += GLOBAL_SIZE) {
        int index = order[particle];
        real4 p = movedPos[particle];
        mixed4 delta = make_mixed4(p.x, p.y, p.z, p.w)-posq[movedCopy*PADDED_NUM_ATOMS+index];

--- a/plugins/rpmd/platforms/cuda/src/kernels/rpmdContraction.cu
+++ b/plugins/rpmd/platforms/cuda/src/kernels/rpmdContraction.cu
-__device__ mixed3 multiplyComplexRealPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+DEVICE mixed3 multiplyComplexRealPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2r-c1.y*c2i;
 }

-__device__ mixed3 multiplyComplexImagPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+DEVICE mixed3 multiplyComplexImagPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2i+c1.y*c2r;
 }

-__device__ mixed3 multiplyComplexRealPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+DEVICE mixed3 multiplyComplexRealPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2r+c1.y*c2i;
 }

-__device__ mixed3 multiplyComplexImagPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+DEVICE mixed3 multiplyComplexImagPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2i-c1.y*c2r;
 }

 /**
 * Compute the contracted positions
 */
-extern "C" __global__ void contractPositions(mixed4* posq, mixed4* contracted) {
-    const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
-    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
-    const int indexInBlock = threadIdx.x-blockStart;
-    __shared__ mixed3 q[2*THREAD_BLOCK_SIZE];
-    __shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
-    __shared__ mixed2 w1[NUM_COPIES];
-    __shared__ mixed2 w2[NUM_CONTRACTED_COPIES];
-    mixed3* qreal = &q[blockStart];
-    mixed3* qimag = &q[blockStart+blockDim.x];
-    mixed3* tempreal = &temp[blockStart];
-    mixed3* tempimag = &temp[blockStart+blockDim.x];
-    if (threadIdx.x < NUM_COPIES)
+KERNEL void contractPositions(GLOBAL mixed4* posq, GLOBAL mixed4* contracted) {
+    const int numBlocks = (GLOBAL_SIZE)/NUM_COPIES;
+    const int blockStart = NUM_COPIES*(LOCAL_ID/NUM_COPIES);
+    const int indexInBlock = LOCAL_ID-blockStart;
+    LOCAL mixed3 q[2*THREAD_BLOCK_SIZE];
+    LOCAL mixed3 temp[2*THREAD_BLOCK_SIZE];
+    LOCAL mixed2 w1[NUM_COPIES];
+    LOCAL mixed2 w2[NUM_CONTRACTED_COPIES];
+    LOCAL_ARG mixed3* qreal = &q[blockStart];
+    LOCAL_ARG mixed3* qimag = &q[blockStart+LOCAL_SIZE];
+    LOCAL_ARG mixed3* tempreal = &temp[blockStart];
+    LOCAL_ARG mixed3* tempimag = &temp[blockStart+LOCAL_SIZE];
+    if (LOCAL_ID < NUM_COPIES)
        w1[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
-    if (threadIdx.x < NUM_CONTRACTED_COPIES)
+    if (LOCAL_ID < NUM_CONTRACTED_COPIES)
        w2[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_CONTRACTED_COPIES), sin(-indexInBlock*2*M_PI/NUM_CONTRACTED_COPIES));
-    __syncthreads();
-    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+    SYNC_THREADS;
+    for (int particle = (GLOBAL_ID)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        // Load the particle position.
        
        mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
@@ -43,8 +43,8 @@ extern "C" __global__ void contractPositions(mixed4* posq, mixed4* contracted) {
        
        // Forward FFT.
        
-        __syncthreads();
-        mixed2* w = w1;
+        SYNC_THREADS;
+        LOCAL_ARG mixed2* w = w1;
        FFT_Q_FORWARD
        if (NUM_CONTRACTED_COPIES > 1) {
            // Compress the data to remove high frequencies.
@@ -52,12 +52,12 @@ extern "C" __global__ void contractPositions(mixed4* posq, mixed4* contracted) {
            int start = (NUM_CONTRACTED_COPIES+1)/2;
            tempreal[indexInBlock] = qreal[indexInBlock];
            tempimag[indexInBlock] = qimag[indexInBlock];
-            __syncthreads();
+            SYNC_THREADS;
            if (indexInBlock < NUM_CONTRACTED_COPIES) {
                qreal[indexInBlock] = tempreal[indexInBlock < start ? indexInBlock : indexInBlock+(NUM_COPIES-NUM_CONTRACTED_COPIES)];
                qimag[indexInBlock] = tempimag[indexInBlock < start ? indexInBlock : indexInBlock+(NUM_COPIES-NUM_CONTRACTED_COPIES)];
            }
-            __syncthreads();
+            SYNC_THREADS;
            w = w2;
            FFT_Q_BACKWARD
        }
@@ -72,25 +72,25 @@ extern "C" __global__ void contractPositions(mixed4* posq, mixed4* contracted) {
 /**
 * Apply the contracted forces to all copies.
 */
-extern "C" __global__ void contractForces(long long* force, long long* contracted) {
-    const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
-    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
-    const int indexInBlock = threadIdx.x-blockStart;
+KERNEL void contractForces(GLOBAL mm_long* force, GLOBAL mm_long* contracted) {
+    const int numBlocks = (GLOBAL_SIZE)/NUM_COPIES;
+    const int blockStart = NUM_COPIES*(LOCAL_ID/NUM_COPIES);
+    const int indexInBlock = LOCAL_ID-blockStart;
    const mixed forceScale = 1/(mixed) 0x100000000;
-    __shared__ mixed3 f[2*THREAD_BLOCK_SIZE];
-    __shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
-    __shared__ mixed2 w1[NUM_COPIES];
-    __shared__ mixed2 w2[NUM_CONTRACTED_COPIES];
-    mixed3* freal = &f[blockStart];
-    mixed3* fimag = &f[blockStart+blockDim.x];
-    mixed3* tempreal = &temp[blockStart];
-    mixed3* tempimag = &temp[blockStart+blockDim.x];
-    if (threadIdx.x < NUM_COPIES)
+    LOCAL mixed3 f[2*THREAD_BLOCK_SIZE];
+    LOCAL mixed3 temp[2*THREAD_BLOCK_SIZE];
+    LOCAL mixed2 w1[NUM_COPIES];
+    LOCAL mixed2 w2[NUM_CONTRACTED_COPIES];
+    LOCAL_ARG mixed3* freal = &f[blockStart];
+    LOCAL_ARG mixed3* fimag = &f[blockStart+LOCAL_SIZE];
+    LOCAL_ARG mixed3* tempreal = &temp[blockStart];
+    LOCAL_ARG mixed3* tempimag = &temp[blockStart+LOCAL_SIZE];
+    if (LOCAL_ID < NUM_COPIES)
        w1[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
-    if (threadIdx.x < NUM_CONTRACTED_COPIES)
+    if (LOCAL_ID < NUM_CONTRACTED_COPIES)
        w2[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_CONTRACTED_COPIES), sin(-indexInBlock*2*M_PI/NUM_CONTRACTED_COPIES));
-    __syncthreads();
-    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+    SYNC_THREADS;
+    for (int particle = (GLOBAL_ID)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        // Load the force.
        
        int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
@@ -98,11 +98,11 @@ extern "C" __global__ void contractForces(long long* force, long long* contracte
            freal[indexInBlock] = make_mixed3(contracted[forceIndex]*forceScale, contracted[forceIndex+PADDED_NUM_ATOMS]*forceScale, contracted[forceIndex+PADDED_NUM_ATOMS*2]*forceScale);
            fimag[indexInBlock] = make_mixed3(0);
        }
-        __syncthreads();
+        SYNC_THREADS;

        // Forward FFT.
        
-        mixed2* w = w2;
+        LOCAL_ARG mixed2* w = w2;
        if (NUM_CONTRACTED_COPIES > 1) {
            FFT_F_FORWARD
        }
@@ -113,19 +113,19 @@ extern "C" __global__ void contractForces(long long* force, long long* contracte
        int end = NUM_COPIES-NUM_CONTRACTED_COPIES+start;
        tempreal[indexInBlock] = freal[indexInBlock];
        tempimag[indexInBlock] = fimag[indexInBlock];
-        __syncthreads();
+        SYNC_THREADS;
        if (indexInBlock >= start) {
            freal[indexInBlock] = (indexInBlock < end ? make_mixed3(0) : tempreal[indexInBlock-(NUM_COPIES-NUM_CONTRACTED_COPIES)]);
            fimag[indexInBlock] = (indexInBlock < end ? make_mixed3(0) : tempimag[indexInBlock-(NUM_COPIES-NUM_CONTRACTED_COPIES)]);
        }
-        __syncthreads();
+        SYNC_THREADS;
        w = w1;
        FFT_F_BACKWARD
        
        // Store results.
        
-        force[forceIndex] += (long long) (FORCE_SCALE*freal[indexInBlock].x);
-        force[forceIndex+PADDED_NUM_ATOMS] += (long long) (FORCE_SCALE*freal[indexInBlock].y);
-        force[forceIndex+PADDED_NUM_ATOMS*2] += (long long) (FORCE_SCALE*freal[indexInBlock].z);
+        force[forceIndex] += (mm_long) (FORCE_SCALE*freal[indexInBlock].x);
+        force[forceIndex+PADDED_NUM_ATOMS] += (mm_long) (FORCE_SCALE*freal[indexInBlock].y);
+        force[forceIndex+PADDED_NUM_ATOMS*2] += (mm_long) (FORCE_SCALE*freal[indexInBlock].z);
    }
 }
--- a/plugins/rpmd/platforms/cuda/CMakeLists.txt
+++ b/plugins/rpmd/platforms/cuda/CMakeLists.txt
@@ -12,7 +12,7 @@

 # The source is organized into subdirectories, but we handle them all from
 # this CMakeLists file rather than letting CMake visit them as SUBDIRS.
-SET(OPENMM_SOURCE_SUBDIRS .)
+SET(OPENMM_SOURCE_SUBDIRS . ../common)


 # Collect up information about the version of the OpenMM library we're building
@@ -59,33 +59,25 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
 ENDFOREACH(subdir)

+SET(COMMON_KERNELS_CPP ${CMAKE_CURRENT_BINARY_DIR}/../common/src/CommonRpmdKernelSources.cpp)
+SET(SOURCE_FILES ${SOURCE_FILES} ${COMMON_KERNELS_CPP})
+
 INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/../common/src)
 INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/include)
 INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/src)
 INCLUDE_DIRECTORIES(BEFORE ${CMAKE_BINARY_DIR}/platforms/cuda/src)
 INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/common/include)
-
-# Set variables needed for encoding kernel sources into a C++ class
-
-SET(KERNEL_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
-SET(KERNEL_SOURCE_CLASS CudaRpmdKernelSources)
-SET(KERNELS_CPP ${CMAKE_CURRENT_BINARY_DIR}/src/${KERNEL_SOURCE_CLASS}.cpp)
-SET(KERNELS_H ${CMAKE_CURRENT_BINARY_DIR}/src/${KERNEL_SOURCE_CLASS}.h)
-SET(SOURCE_FILES ${SOURCE_FILES} ${KERNELS_CPP} ${KERNELS_H})
-INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/src)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_BINARY_DIR}/platforms/common/src)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/../common/src)

 # Create the library

 INCLUDE_DIRECTORIES(${CUDA_TOOLKIT_INCLUDE})

-FILE(GLOB CUDA_KERNELS ${KERNEL_SOURCE_DIR}/kernels/*.cu)
-ADD_CUSTOM_COMMAND(OUTPUT ${KERNELS_CPP} ${KERNELS_H}
-    COMMAND ${CMAKE_COMMAND}
-    ARGS -D KERNEL_SOURCE_DIR=${KERNEL_SOURCE_DIR} -D KERNELS_CPP=${KERNELS_CPP} -D KERNELS_H=${KERNELS_H} -D KERNEL_SOURCE_CLASS=${KERNEL_SOURCE_CLASS} -D KERNEL_FILE_EXTENSION=cu -P ${CMAKE_SOURCE_DIR}/cmake_modules/EncodeKernelFiles.cmake
-    DEPENDS ${CUDA_KERNELS}
-)
-SET_SOURCE_FILES_PROPERTIES(${KERNELS_CPP} ${KERNELS_H} PROPERTIES GENERATED TRUE)
+SET_SOURCE_FILES_PROPERTIES(${COMMON_KERNELS_CPP} PROPERTIES GENERATED TRUE)
 ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
+ADD_DEPENDENCIES(${SHARED_TARGET} RpmdCommonKernels)

 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB})
 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME}CUDA)

--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernelFactory.cpp
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernelFactory.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -27,7 +27,8 @@
 #include <exception>

 #include "CudaRpmdKernelFactory.h"
-#include "CudaRpmdKernels.h"
+#include "CommonRpmdKernels.h"
+#include "CudaContext.h"
 #include "openmm/internal/windowsExport.h"
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/OpenMMException.h"
@@ -61,6 +62,6 @@ extern "C" OPENMM_EXPORT void registerRPMDCudaKernelFactories() {
 KernelImpl* CudaRpmdKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
    CudaContext& cl = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData())->contexts[0];
    if (name == IntegrateRPMDStepKernel::Name())
-        return new CudaIntegrateRPMDStepKernel(name, platform, cl);
+        return new CommonIntegrateRPMDStepKernel(name, platform, cl);
    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
 }
--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
-#ifndef CUDA_RPMD_KERNELS_H_
-#define CUDA_RPMD_KERNELS_H_
-
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2011-2018 Stanford University and the Authors.      *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#include "openmm/RpmdKernels.h"
-#include "CudaContext.h"
-#include "CudaArray.h"
-#include <map>
-
-namespace OpenMM {
-
-/**
- * This kernel is invoked by RPMDIntegrator to take one time step, and to get and
- * set the state of system copies.
- */
-class CudaIntegrateRPMDStepKernel : public IntegrateRPMDStepKernel {
-public:
-    CudaIntegrateRPMDStepKernel(const std::string& name, const Platform& platform, CudaContext& cu) :
-            IntegrateRPMDStepKernel(name, platform), cu(cu) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param integrator the RPMDIntegrator this kernel will be used for
-     */
-    void initialize(const System& system, const RPMDIntegrator& integrator);
-    /**
-     * Execute the kernel.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param integrator     the RPMDIntegrator this kernel is being used for
-     * @param forcesAreValid if the context has been modified since the last time step, this will be
-     *                       false to show that cached forces are invalid and must be recalculated
-     */
-    void execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid);
-    /**
-     * Compute the kinetic energy.
-     * 
-     * @param context        the context in which to execute this kernel
-     * @param integrator     the RPMDIntegrator this kernel is being used for
-     */
-    double computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator);
-    /**
-     * Get the positions of all particles in one copy of the system.
-     */
-    void setPositions(int copy, const std::vector<Vec3>& positions);
-    /**
-     * Get the velocities of all particles in one copy of the system.
-     */
-    void setVelocities(int copy, const std::vector<Vec3>& velocities);
-    /**
-     * Copy positions and velocities for one copy into the context.
-     */
-    void copyToContext(int copy, ContextImpl& context);
-private:
-    void computeForces(ContextImpl& context);
-    std::string createFFT(int size, const std::string& variable, bool forward);
-    CudaContext& cu;
-    int numCopies, numParticles, workgroupSize;
-    std::map<int, int> groupsByCopies;
-    int groupsNotContracted;
-    CudaArray forces;
-    CudaArray positions;
-    CudaArray velocities;
-    CudaArray contractedForces;
-    CudaArray contractedPositions;
-    CUfunction pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
-    std::map<int, CUfunction> positionContractionKernels;
-    std::map<int, CUfunction> forceContractionKernels;
-};
-
-} // namespace OpenMM
-
-#endif /*CUDA_RPMD_KERNELS_H_*/
--- a/plugins/rpmd/platforms/cuda/tests/CMakeLists.txt
+++ b/plugins/rpmd/platforms/cuda/tests/CMakeLists.txt
@@ -5,6 +5,8 @@
 ENABLE_TESTING()

 INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${OPENMM_DIR}/plugins/rpmd/tests)
+INCLUDE_DIRECTORIES(${OPENMM_DIR}/platforms/cuda/tests)

 # Automatically create tests using files named "Test*.cpp"
 FILE(GLOB TEST_PROGS "*Test*.cpp")