Created CUDA implementation of RPMD

05198df6 · Peter Eastman · 99bca8c1 · 05198df6 · 05198df6 · 05198df6
Commit 05198df6 authored Nov 05, 2012 by Peter Eastman
15 changed files
--- a/plugins/rpmd/CMakeLists.txt
+++ b/plugins/rpmd/CMakeLists.txt
@@ -170,6 +170,15 @@ IF(OPENMM_BUILD_RPMD_OPENCL_LIB)
    ADD_SUBDIRECTORY(platforms/opencl)
 ENDIF(OPENMM_BUILD_RPMD_OPENCL_LIB)
+IF(CUDA_FOUND)
+    SET(OPENMM_BUILD_RPMD_CUDA_LIB ON CACHE BOOL "Build RPMD implementation for CUDA")
+ELSE(CUDA_FOUND)
+    SET(OPENMM_BUILD_RPMD_CUDA_LIB OFF CACHE BOOL "Build RPMD implementation for CUDA")
+ENDIF(CUDA_FOUND)
+IF(OPENMM_BUILD_RPMD_CUDA_LIB)
+    ADD_SUBDIRECTORY(platforms/cuda)
+ENDIF(OPENMM_BUILD_RPMD_CUDA_LIB)
 INSTALL_TARGETS(/lib RUNTIME_DIRECTORY /lib ${SHARED_RPMD_TARGET})
 IF( CREATE_SERIALIZABLE_OPENMM_RPMD )
    INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_RPMD_SERIALIZABLE_TARGET})

--- a/plugins/rpmd/openmmapi/include/openmm/RPMDIntegrator.h
+++ b/plugins/rpmd/openmmapi/include/openmm/RPMDIntegrator.h
@@ -182,7 +182,7 @@ protected:
 private:
    double temperature, friction;
    int numCopies, randomNumberSeed;
-    bool forcesAreValid, hasSetPosition, hasSetVelocity;
+    bool forcesAreValid, hasSetPosition, hasSetVelocity, isFirstStep;
    ContextImpl* context;
    Context* owner;
    Kernel kernel;

--- a/plugins/rpmd/openmmapi/src/RPMDIntegrator.cpp
+++ b/plugins/rpmd/openmmapi/src/RPMDIntegrator.cpp
@@ -42,7 +42,7 @@ using std::string;
 using std::vector;
 RPMDIntegrator::RPMDIntegrator(int numCopies, double temperature, double frictionCoeff, double stepSize) :
-        owner(NULL), numCopies(numCopies), forcesAreValid(false), hasSetPosition(false), hasSetVelocity(false) {
+        owner(NULL), numCopies(numCopies), forcesAreValid(false), hasSetPosition(false), hasSetVelocity(false), isFirstStep(true) {
    setTemperature(temperature);
    setFriction(frictionCoeff);
    setStepSize(stepSize);
@@ -109,6 +109,15 @@ void RPMDIntegrator::step(int steps) {
        for (int i = 0; i < numCopies; i++)
            setVelocities(i, s.getVelocities());
    }
+    if (isFirstStep) {
+        // Call setPositions() on the Context so it doesn't think the user is trying to
+        // run a simulation without setting positions first.  These positions will
+        // immediately get overwritten by the ones stored in this integrator.
+        vector<Vec3> p(context->getSystem().getNumParticles(), Vec3());
+        context->getOwner().setPositions(p);
+        isFirstStep = false;
+    }
    for (int i = 0; i < steps; ++i) {
        kernel.getAs<IntegrateRPMDStepKernel>().execute(*context, *this, forcesAreValid);
        forcesAreValid = true;

--- a/plugins/rpmd/platforms/cuda/CMakeLists.txt
+++ b/plugins/rpmd/platforms/cuda/CMakeLists.txt
+#---------------------------------------------------
+# OpenMM CUDA RPMD Integrator
+#
+# Creates OpenMM library, base name=OpenMMRPMDCUDA.
+# Default libraries are shared & optimized. Variants
+# are created for debug (_d).
+#
+# Windows:
+#   OpenMMRPMDCUDA[_d].dll
+#   OpenMMRPMDCUDA[_d].lib
+# Unix:
+#   libOpenMMRPMDCUDA[_d].so
+#----------------------------------------------------
+# Target Mac OS X 10.6 when building on Apple platforms.
+IF (APPLE)
+    SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.6")
+    SET (CMAKE_OSX_SYSROOT "/Developer/SDKs/MacOSX10.6.sdk")
+ENDIF (APPLE)
+# The source is organized into subdirectories, but we handle them all from
+# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
+SET(OPENMM_SOURCE_SUBDIRS .)
+# Collect up information about the version of the OpenMM library we're building
+# and make it available to the code so it can be built into the binaries.
+SET(OPENMMRPMDCUDA_LIBRARY_NAME OpenMMRPMDCUDA)
+SET(SHARED_TARGET ${OPENMMRPMDCUDA_LIBRARY_NAME})
+# Ensure that debug libraries have "_d" appended to their names.
+# CMake gets this right on Windows automatically with this definition.
+IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+    SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
+ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+# But on Unix or Cygwin we have to add the suffix manually
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(SHARED_TARGET ${SHARED_TARGET}_d)
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+# These are all the places to search for header files which are
+# to be part of the API.
+SET(API_INCLUDE_DIRS) # start empty
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    # append
+    SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS}
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
+ENDFOREACH(subdir)
+# Collect the API header files.  API_ABS_INCLUDE_FILES holds the globbed paths;
+# API_REL_INCLUDE_FILES holds each header's file name prefixed by the
+# API_INCLUDE_DIRS entry it was found in.
+SET(API_REL_INCLUDE_FILES)   # start these out empty
+SET(API_ABS_INCLUDE_FILES)
+FOREACH(dir ${API_INCLUDE_DIRS})
+    FILE(GLOB fullpaths ${dir}/*.h)  # returns full pathnames
+    SET(API_ABS_INCLUDE_FILES ${API_ABS_INCLUDE_FILES} ${fullpaths})
+    FOREACH(pathname ${fullpaths})
+        GET_FILENAME_COMPONENT(filename ${pathname} NAME)
+        SET(API_REL_INCLUDE_FILES ${API_REL_INCLUDE_FILES} ${dir}/${filename})
+    ENDFOREACH(pathname)
+ENDFOREACH(dir)
+# collect up source files
+SET(SOURCE_FILES) # empty
+SET(SOURCE_INCLUDE_FILES)
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    FILE(GLOB_RECURSE src_files  ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.c)
+    FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
+    SET(SOURCE_FILES         ${SOURCE_FILES}         ${src_files})   #append
+    SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files})
+    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
+ENDFOREACH(subdir)
+# Headers of this plugin and of the main CUDA platform, including the ones
+# generated into the build tree.
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/include)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/src)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_BINARY_DIR}/platforms/cuda/src)
+# Set variables needed for encoding kernel sources into a C++ class
+SET(CUDA_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
+SET(CUDA_SOURCE_CLASS CudaRpmdKernelSources)
+SET(CUDA_KERNELS_CPP ${CMAKE_CURRENT_BINARY_DIR}/src/${CUDA_SOURCE_CLASS}.cpp)
+SET(CUDA_KERNELS_H ${CMAKE_CURRENT_BINARY_DIR}/src/${CUDA_SOURCE_CLASS}.h)
+SET(SOURCE_FILES ${SOURCE_FILES} ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H})
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/src)
+# Create the library
+INCLUDE_DIRECTORIES(${CUDA_TOOLKIT_INCLUDE})
+# Generate the kernel source class from the .cu files.  Besides the kernels
+# themselves, the rule depends on the .in templates and on the encoding script,
+# so that editing any of them regenerates the sources.  (The list of .cu files
+# is globbed at configure time, so CMake must be re-run after adding a kernel.)
+FILE(GLOB CUDA_KERNELS ${CUDA_SOURCE_DIR}/kernels/*.cu)
+SET(CUDA_ENCODE_SCRIPT ${CMAKE_SOURCE_DIR}/platforms/cuda/EncodeCUDAFiles.cmake)
+ADD_CUSTOM_COMMAND(OUTPUT ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H}
+    COMMAND ${CMAKE_COMMAND}
+    ARGS -D CUDA_SOURCE_DIR=${CUDA_SOURCE_DIR} -D CUDA_KERNELS_CPP=${CUDA_KERNELS_CPP} -D CUDA_KERNELS_H=${CUDA_KERNELS_H} -D CUDA_SOURCE_CLASS=${CUDA_SOURCE_CLASS} -P ${CUDA_ENCODE_SCRIPT}
+    DEPENDS ${CUDA_KERNELS}
+            ${CUDA_SOURCE_DIR}/${CUDA_SOURCE_CLASS}.cpp.in
+            ${CUDA_SOURCE_DIR}/${CUDA_SOURCE_CLASS}.h.in
+            ${CUDA_ENCODE_SCRIPT}
+)
+SET_SOURCE_FILES_PROPERTIES(${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H} PROPERTIES GENERATED TRUE)
+ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
+# Link against the main OpenMM library, the CUDA platform library and the
+# platform-independent RPMD library, using the "_d" variants for debug builds.
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME}_d)
+ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB}  ${CUDA_LIBRARIES} ${PTHREADS_LIB})
+TARGET_LINK_LIBRARIES(${SHARED_TARGET} debug ${OPENMM_LIBRARY_NAME}CUDA_d optimized ${OPENMM_LIBRARY_NAME}CUDA)
+TARGET_LINK_LIBRARIES(${SHARED_TARGET} debug ${SHARED_RPMD_TARGET} optimized ${SHARED_RPMD_TARGET})
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY")
+INSTALL(TARGETS ${SHARED_TARGET} DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/plugins)
+# Ensure that links to the main CUDA library will be resolved.
+IF (APPLE)
+    IF (CMAKE_BUILD_TYPE MATCHES Debug)
+        SET(CUDA_LIBRARY libOpenMMCUDA_d.dylib)
+    ELSE (CMAKE_BUILD_TYPE MATCHES Debug)
+        SET(CUDA_LIBRARY libOpenMMCUDA.dylib)
+    ENDIF (CMAKE_BUILD_TYPE MATCHES Debug)
+    INSTALL(CODE "EXECUTE_PROCESS(COMMAND install_name_tool -change ${CUDA_LIBRARY} @loader_path/${CUDA_LIBRARY} ${CMAKE_INSTALL_PREFIX}/lib/plugins/lib${SHARED_TARGET}.dylib)")
+ENDIF (APPLE)
+# Build the tests.  ADD_SUBDIRECTORY replaces the deprecated SUBDIRS command
+# and matches how the parent CMakeLists.txt adds this directory.
+ADD_SUBDIRECTORY(tests)
--- a/plugins/rpmd/platforms/cuda/include/CudaRpmdKernelFactory.h
+++ b/plugins/rpmd/platforms/cuda/include/CudaRpmdKernelFactory.h
+#ifndef OPENMM_CUDARPMDKERNELFACTORY_H_
+#define OPENMM_CUDARPMDKERNELFACTORY_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "openmm/KernelFactory.h"
+namespace OpenMM {
+/**
+ * This KernelFactory creates kernels for the CUDA implementation of RPMDIntegrator.
+ */
+class CudaRpmdKernelFactory : public KernelFactory {
+public:
+    /**
+     * Create the CUDA implementation of a kernel.
+     *
+     * @param name      the name of the kernel to create.  The only name accepted is
+     *                  IntegrateRPMDStepKernel::Name().
+     * @param platform  the Platform the kernel is being created for
+     * @param context   the context the kernel will belong to.  Its platform data is
+     *                  used to look up the CudaContext passed to the new kernel.
+     * @return a newly allocated CudaIntegrateRPMDStepKernel
+     * @throws OpenMMException if name is not a kernel this factory can create
+     */
+    KernelImpl* createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const;
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDARPMDKERNELFACTORY_H_*/
--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernelFactory.cpp
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernelFactory.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include <exception>
+#include "CudaRpmdKernelFactory.h"
+#include "CudaRpmdKernels.h"
+#include "openmm/internal/windowsExport.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/OpenMMException.h"
+using namespace OpenMM;
+/**
+ * Exported with C linkage under the name registerPlatforms.  This library
+ * registers no platforms of its own, so the body is empty; its kernels are
+ * attached to the existing "CUDA" platform by registerKernelFactories().
+ * NOTE(review): presumably looked up by name by the plugin loader - confirm.
+ */
+extern "C" void registerPlatforms() {
+}
+/**
+ * Register a CudaRpmdKernelFactory with the platform named "CUDA" as the
+ * factory for IntegrateRPMDStepKernel.
+ *
+ * Any standard exception thrown while doing so (for example because no
+ * platform named "CUDA" can be found) is deliberately swallowed, so that this
+ * function never propagates an error to its caller.
+ */
+extern "C" void registerKernelFactories() {
+    try {
+        Platform& platform = Platform::getPlatformByName("CUDA");
+        CudaRpmdKernelFactory* factory = new CudaRpmdKernelFactory();
+        platform.registerKernelFactory(IntegrateRPMDStepKernel::Name(), factory);
+    }
+    catch (const std::exception&) {
+        // Ignore.  Catching by const reference avoids slicing and copying the
+        // exception object.
+    }
+}
+/**
+ * Create the kernel with the requested name for the given context.  The only
+ * supported kernel is IntegrateRPMDStepKernel; any other name results in an
+ * OpenMMException.
+ */
+KernelImpl* CudaRpmdKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
+    // The kernel runs on the first CudaContext stored in the platform data.
+    CudaPlatform::PlatformData* data = static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
+    CudaContext& cu = *data->contexts[0];
+    if (name != IntegrateRPMDStepKernel::Name())
+        throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
+    return new CudaIntegrateRPMDStepKernel(name, platform, cu);
+}
--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernelSources.cpp.in
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernelSources.cpp.in
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaRpmdKernelSources.h"
+using namespace OpenMM;
+using namespace std;
--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernelSources.h.in
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernelSources.h.in
+#ifndef OPENMM_CUDARPMDKERNELSOURCES_H_
+#define OPENMM_CUDARPMDKERNELSOURCES_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include <string>
+namespace OpenMM {
+/**
+ * This class is a central holding place for the source code of CUDA kernels.
+ * The CMake build script inserts declarations into it based on the .cu files in the
+ * kernels subfolder.
+ */
+class CudaRpmdKernelSources {
+public:
+// Placeholder that the build script replaces with the generated declarations.
+// This file is a template (.h.in); the header that gets compiled is the
+// generated copy.
+@CUDA_FILE_DECLARATIONS@
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDARPMDKERNELSOURCES_H_*/
--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "CudaRpmdKernels.h"
+#include "CudaRpmdKernelSources.h"
+#include "openmm/internal/ContextImpl.h"
+#include "CudaIntegrationUtilities.h"
+#include "CudaExpressionUtilities.h"
+#include "CudaKernelSources.h"
+#include "CudaNonbondedUtilities.h"
+#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+using namespace OpenMM;
+using namespace std;
+/**
+ * Find the smallest integer that is greater than or equal to minimum and whose
+ * only prime factors are 2, 3, and 5.
+ *
+ * The factors tried are limited to 2 through 5 so that the result agrees with
+ * the requirement initialize() reports to the user ("a multiple of powers of
+ * 2, 3, and 5").
+ *
+ * @param minimum  the smallest acceptable size
+ * @return the selected size, or 1 if minimum is less than 1
+ */
+static int findFFTDimension(int minimum) {
+    if (minimum < 1)
+        return 1;
+    while (true) {
+        // Attempt to factor the current value.
+        int unfactored = minimum;
+        for (int factor = 2; factor < 6; factor++) {
+            while (unfactored > 1 && unfactored%factor == 0)
+                unfactored /= factor;
+        }
+        // Nothing is left over, so the value is built only from allowed factors.
+        if (unfactored == 1)
+            return minimum;
+        minimum++;
+    }
+}
+/**
+ * Release the arrays holding the forces, positions and velocities of all copies.
+ */
+CudaIntegrateRPMDStepKernel::~CudaIntegrateRPMDStepKernel() {
+    // Deleting a null pointer has no effect, so arrays that were never
+    // allocated need no special handling.
+    delete forces;
+    delete positions;
+    delete velocities;
+}
+/**
+ * Initialize the kernel: allocate the arrays that hold the state of every copy
+ * of the system and compile the CUDA kernels used for integration.
+ *
+ * @param system      the System being simulated
+ * @param integrator  the RPMDIntegrator this kernel is used for
+ * @throws OpenMMException if the number of copies is not a size supported by
+ *         the generated FFT code
+ */
+void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
+    cu.getPlatformData().initializeContexts(system);
+    numCopies = integrator.getNumCopies();
+    numParticles = system.getNumParticles();
+    // Validate the number of copies before using it.  The loop below never
+    // terminates when numCopies is 0 (and misbehaves for negative values), so
+    // this check has to come first.
+    if (numCopies != findFFTDimension(numCopies))
+        throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5.");
+    // Use the largest multiple of numCopies that does not exceed 128 as the
+    // thread block size (or numCopies itself if it is larger than that).
+    workgroupSize = numCopies;
+    while (workgroupSize <= 128-numCopies)
+        workgroupSize += numCopies;
+    int paddedParticles = cu.getPaddedNumAtoms();
+    forces = CudaArray::create<long long>(cu, numCopies*paddedParticles*3, "rpmdForces");
+    positions = CudaArray::create<float4>(cu, numCopies*paddedParticles, "rpmdPositions");
+    velocities = CudaArray::create<float4>(cu, numCopies*paddedParticles, "rpmdVelocities");
+    cu.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
+    // Fill in the positions and velocities arrays with safe values to avoid a risk of nans.
+    vector<float4> temp(positions->getSize());
+    for (int i = 0; i < positions->getSize(); i++)
+        temp[i] = make_float4(0, 0, 0, 0);
+    positions->upload(temp);
+    for (int i = 0; i < velocities->getSize(); i++)
+        temp[i] = make_float4(0, 0, 0, 1);
+    velocities->upload(temp);
+    // Create kernels.
+    map<string, string> defines;
+    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+    defines["NUM_COPIES"] = cu.intToString(numCopies);
+    defines["THREAD_BLOCK_SIZE"] = cu.intToString(workgroupSize);
+    defines["HBAR"] = cu.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12));
+    defines["SCALE"] = cu.doubleToString(1.0/sqrt((double) numCopies));
+    defines["M_PI"] = cu.doubleToString(M_PI);
+    // The FFT code depends on the number of copies, so it is generated here and
+    // substituted into the kernel source.
+    map<string, string> replacements;
+    replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
+    replacements["FFT_Q_BACKWARD"] = createFFT(numCopies, "q", false);
+    replacements["FFT_V_FORWARD"] = createFFT(numCopies, "v", true);
+    replacements["FFT_V_BACKWARD"] = createFFT(numCopies, "v", false);
+    CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaRpmdKernelSources::rpmd, replacements), defines, "");
+    pileKernel = cu.getKernel(module, "applyPileThermostat");
+    stepKernel = cu.getKernel(module, "integrateStep");
+    velocitiesKernel = cu.getKernel(module, "advanceVelocities");
+    copyToContextKernel = cu.getKernel(module, "copyToContext");
+    copyFromContextKernel = cu.getKernel(module, "copyFromContext");
+    translateKernel = cu.getKernel(module, "applyCellTranslations");
+}
+/**
+ * Advance the simulation by one time step.
+ *
+ * @param context         the context in which forces are evaluated
+ * @param integrator      the RPMDIntegrator supplying the step size, temperature and friction
+ * @param forcesAreValid  if false, the forces on all copies are recomputed before the step
+ */
+void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
+    CudaIntegrationUtilities& utilities = cu.getIntegrationUtilities();
+    // Make sure the stored forces are current before they are used.
+    if (!forcesAreValid)
+        computeForces(context);
+    // Gather the integration parameters.  The kernels receive them in single precision.
+    const double stepSize = integrator.getStepSize();
+    float stepSizeArg = (float) stepSize;
+    float kTArg = (float) (integrator.getTemperature()*BOLTZ);
+    float frictionArg = (float) integrator.getFriction();
+    // First application of the PILE-L thermostat.
+    int randomIndex = utilities.prepareRandomNumbers(numParticles*numCopies);
+    void* thermostatArgs[] = {&velocities->getDevicePointer(), &utilities.getRandom().getDevicePointer(), &randomIndex, &stepSizeArg, &kTArg, &frictionArg};
+    cu.executeKernel(pileKernel, thermostatArgs, numParticles*numCopies, workgroupSize);
+    // Advance the positions and velocities.
+    void* integrateArgs[] = {&positions->getDevicePointer(), &velocities->getDevicePointer(), &forces->getDevicePointer(), &stepSizeArg, &kTArg};
+    cu.executeKernel(stepKernel, integrateArgs, numParticles*numCopies, workgroupSize);
+    // The positions have changed, so the forces must be evaluated again.
+    computeForces(context);
+    // Advance the velocities using the new forces.
+    void* advanceArgs[] = {&velocities->getDevicePointer(), &forces->getDevicePointer(), &stepSizeArg};
+    cu.executeKernel(velocitiesKernel, advanceArgs, numParticles*numCopies, workgroupSize);
+    // Second application of the PILE-L thermostat.  thermostatArgs holds a
+    // pointer to randomIndex, so the kernel sees the value assigned here.
+    randomIndex = utilities.prepareRandomNumbers(numParticles*numCopies);
+    cu.executeKernel(pileKernel, thermostatArgs, numParticles*numCopies, workgroupSize);
+    // Record that one more step has been taken.
+    cu.setTime(cu.getTime()+stepSize);
+    cu.setStepCount(cu.getStepCount()+1);
+}
+/**
+ * Compute the forces on every copy of the system.  Each copy's positions are
+ * copied into the context in turn, the context evaluates the forces, and the
+ * result is copied back into this kernel's forces array.
+ */
+void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
+    for (int copy = 0; copy < numCopies; ++copy) {
+        // Load this copy's positions into the context.
+        void* toContextArgs[] = {&positions->getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &copy};
+        cu.executeKernel(copyToContextKernel, toContextArgs, cu.getNumAtoms());
+        context.calcForcesAndEnergy(true, false);
+        // Store the resulting forces for this copy.
+        void* fromContextArgs[] = {&cu.getForce().getDevicePointer(), &forces->getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &copy};
+        cu.executeKernel(copyFromContextKernel, fromContextArgs, cu.getNumAtoms());
+        // Nothing more to do unless atoms were reordered in a periodic system.
+        if (!cu.getAtomsWereReordered())
+            continue;
+        if (!cu.getNonbondedUtilities().getUsePeriodic())
+            continue;
+        // Atoms may have been translated into a different periodic box, so apply
+        // the same translation to all the beads.
+        void* translateArgs[] = {&positions->getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &copy};
+        cu.executeKernel(translateKernel, translateArgs, cu.getNumAtoms());
+    }
+}
+/**
+ * Compute the kinetic energy by delegating to the CudaContext's integration
+ * utilities.  Neither the context nor the integrator argument is used.
+ * NOTE(review): the meaning of the 0 argument is not visible here - confirm
+ * against CudaIntegrationUtilities::computeKineticEnergy().
+ */
+double CudaIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {
+    return cu.getIntegrationUtilities().computeKineticEnergy(0);
+}
+/**
+ * Upload the particle positions of one copy of the system.
+ *
+ * @param copy  the index of the copy to set, in the range [0, number of copies)
+ * @param pos   the position of every particle
+ * @throws OpenMMException if the kernel has not been initialized, if copy is out
+ *         of range, if pos has the wrong length, or if the upload fails
+ */
+void CudaIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos) {
+    if (positions == NULL)
+        throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context");
+    // An out of range index would write outside the positions array on the device.
+    if (copy < 0 || copy >= numCopies)
+        throw OpenMMException("RPMDIntegrator: invalid copy index passed to setPositions()");
+    if (pos.size() != numParticles)
+        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()");
+    // Start from the context's current posq array so that the w component of
+    // each element is carried over unchanged.
+    vector<float4> posq(cu.getPaddedNumAtoms());
+    cu.getPosq().download(posq);
+    for (int i = 0; i < numParticles; i++)
+        posq[i] = make_float4((float) pos[i][0], (float) pos[i][1], (float) pos[i][2], posq[i].w);
+    // Each copy occupies getPaddedNumAtoms() consecutive elements.  The byte
+    // offset is computed in size_t so that it cannot overflow an int.
+    size_t offset = ((size_t) copy)*cu.getPaddedNumAtoms()*sizeof(float4);
+    CUresult result = cuMemcpyHtoD(positions->getDevicePointer()+offset, &posq[0], numParticles*sizeof(float4));
+    if (result != CUDA_SUCCESS) {
+        std::stringstream str;
+        str<<"Error uploading array "<<positions->getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
+        throw OpenMMException(str.str());
+    }
+}
+/**
+ * Upload the particle velocities of one copy of the system.
+ *
+ * @param copy  the index of the copy to set, in the range [0, number of copies)
+ * @param vel   the velocity of every particle
+ * @throws OpenMMException if the kernel has not been initialized, if copy is out
+ *         of range, if vel has the wrong length, or if the upload fails
+ */
+void CudaIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) {
+    if (velocities == NULL)
+        throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context");
+    // An out of range index would write outside the velocities array on the device.
+    if (copy < 0 || copy >= numCopies)
+        throw OpenMMException("RPMDIntegrator: invalid copy index passed to setVelocities()");
+    if (vel.size() != numParticles)
+        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()");
+    // Start from the context's current velm array so that the w component of
+    // each element is carried over unchanged.
+    vector<float4> velm(cu.getPaddedNumAtoms());
+    cu.getVelm().download(velm);
+    for (int i = 0; i < numParticles; i++)
+        velm[i] = make_float4((float) vel[i][0], (float) vel[i][1], (float) vel[i][2], velm[i].w);
+    // Each copy occupies getPaddedNumAtoms() consecutive elements.  The byte
+    // offset is computed in size_t so that it cannot overflow an int.
+    size_t offset = ((size_t) copy)*cu.getPaddedNumAtoms()*sizeof(float4);
+    CUresult result = cuMemcpyHtoD(velocities->getDevicePointer()+offset, &velm[0], numParticles*sizeof(float4));
+    if (result != CUDA_SUCCESS) {
+        std::stringstream str;
+        str<<"Error uploading array "<<velocities->getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
+        throw OpenMMException(str.str());
+    }
+}
+/**
+ * Copy the positions and velocities of one copy of the system into the
+ * context's posq and velm arrays.
+ *
+ * @param copy     the index of the copy to transfer
+ * @param context  the context being updated
+ */
+void CudaIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
+    // Positions first...
+    void* positionArgs[] = {&positions->getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &copy};
+    cu.executeKernel(copyToContextKernel, positionArgs, cu.getNumAtoms());
+    // ...then velocities, using the same kernel with different source and destination arrays.
+    void* velocityArgs[] = {&velocities->getDevicePointer(), &cu.getVelm().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &copy};
+    cu.executeKernel(copyToContextKernel, velocityArgs, cu.getNumAtoms());
+}
+/**
+ * Generate CUDA source code for a complex FFT of a fixed length, to be pasted into a kernel.
+ *
+ * The generated code transforms the arrays "<variable>real" and "<variable>imag" and leaves the
+ * result in those same arrays.  It relies on the following identifiers being in scope where it
+ * is inserted: temp (scratch storage), w (table of twiddle factors), blockStart, indexInBlock,
+ * and the multiplyComplex* device functions.  Every pass ends with __syncthreads().
+ *
+ * @param size      the length of the transform; its only prime factors may be 2, 3 and 5
+ * @param variable  prefix of the names of the arrays to transform
+ * @param forward   true to generate a forward transform, false for the inverse
+ * @return the generated source code
+ * @throws OpenMMException if size has a factor that none of the supported radices covers
+ */
+string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) {
+    stringstream source;
+    int unfactored = size;
+    int stage = 0;
+    int L = size;
+    int m = 1;
+    // The forward and inverse transforms differ only in a sign and in conjugating the twiddle factors.
+    string sign = (forward ? "1.0f" : "-1.0f");
+    string multReal = (forward ? "multiplyComplexRealPart" : "multiplyComplexRealPartConj");
+    string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj");
+    source<<"{\n";
+    // Buffer 0 is the caller's data, buffer 1 is the scratch space.  Passes alternate between them.
+    source<<"float3* real0 = "<<variable<<"real;\n";
+    source<<"float3* imag0 = "<<variable<<"imag;\n";
+    source<<"float3* real1 = &temp[blockStart];\n";
+    source<<"float3* imag1 = &temp[blockStart+blockDim.x];\n";
+    // Factor size, generating an appropriate block of code for each factor.
+    while (unfactored > 1) {
+        int input = stage%2;
+        int output = 1-input;
+        source<<"{\n";
+        if (unfactored%5 == 0) {
+            L = L/5;
+            source<<"// Pass "<<(stage+1)<<" (radix 5)\n";
+            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
+            source<<"int i = indexInBlock;\n";
+            source<<"int j = i/"<<m<<";\n";
+            source<<"float3 c0r = real"<<input<<"[i];\n";
+            source<<"float3 c0i = imag"<<input<<"[i];\n";
+            source<<"float3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"float3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"float3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"float3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"float3 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
+            source<<"float3 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
+            source<<"float3 c4r = real"<<input<<"[i+"<<(4*L*m)<<"];\n";
+            source<<"float3 c4i = imag"<<input<<"[i+"<<(4*L*m)<<"];\n";
+            source<<"float3 d0r = c1r+c4r;\n";
+            source<<"float3 d0i = c1i+c4i;\n";
+            source<<"float3 d1r = c2r+c3r;\n";
+            source<<"float3 d1i = c2i+c3i;\n";
+            source<<"float3 d2r = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n";
+            source<<"float3 d2i = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n";
+            source<<"float3 d3r = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n";
+            source<<"float3 d3i = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n";
+            source<<"float3 d4r = d0r+d1r;\n";
+            source<<"float3 d4i = d0i+d1i;\n";
+            source<<"float3 d5r = "<<cu.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n";
+            source<<"float3 d5i = "<<cu.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n";
+            source<<"float3 d6r = c0r-0.25f*d4r;\n";
+            source<<"float3 d6i = c0i-0.25f*d4i;\n";
+            source<<"float3 d7r = d6r+d5r;\n";
+            source<<"float3 d7i = d6i+d5i;\n";
+            source<<"float3 d8r = d6r-d5r;\n";
+            source<<"float3 d8i = d6i-d5i;\n";
+            string coeff = cu.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
+            source<<"float3 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n";
+            source<<"float3 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n";
+            source<<"float3 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n";
+            source<<"float3 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n";
+            source<<"real"<<output<<"[i+4*j*"<<m<<"] = c0r+d4r;\n";
+            source<<"imag"<<output<<"[i+4*j*"<<m<<"] = c0i+d4i;\n";
+            source<<"real"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n";
+            source<<"imag"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n";
+            source<<"real"<<output<<"[i+(4*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(5*L)<<"], d8r+d10r, d8i+d10i);\n";
+            source<<"imag"<<output<<"[i+(4*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(5*L)<<"], d8r+d10r, d8i+d10i);\n";
+            source<<"real"<<output<<"[i+(4*j+3)*"<<m<<"] = "<<multReal<<"(w[j*"<<(3*size)<<"/"<<(5*L)<<"], d8r-d10r, d8i-d10i);\n";
+            source<<"imag"<<output<<"[i+(4*j+3)*"<<m<<"] = "<<multImag<<"(w[j*"<<(3*size)<<"/"<<(5*L)<<"], d8r-d10r, d8i-d10i);\n";
+            source<<"real"<<output<<"[i+(4*j+4)*"<<m<<"] = "<<multReal<<"(w[j*"<<(4*size)<<"/"<<(5*L)<<"], d7r-d9r, d7i-d9i);\n";
+            source<<"imag"<<output<<"[i+(4*j+4)*"<<m<<"] = "<<multImag<<"(w[j*"<<(4*size)<<"/"<<(5*L)<<"], d7r-d9r, d7i-d9i);\n";
+            source<<"}\n";
+            m = m*5;
+            unfactored /= 5;
+        }
+        else if (unfactored%4 == 0) {
+            L = L/4;
+            source<<"// Pass "<<(stage+1)<<" (radix 4)\n";
+            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
+            source<<"int i = indexInBlock;\n";
+            source<<"int j = i/"<<m<<";\n";
+            source<<"float3 c0r = real"<<input<<"[i];\n";
+            source<<"float3 c0i = imag"<<input<<"[i];\n";
+            source<<"float3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"float3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"float3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"float3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"float3 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
+            source<<"float3 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
+            source<<"float3 d0r = c0r+c2r;\n";
+            source<<"float3 d0i = c0i+c2i;\n";
+            source<<"float3 d1r = c0r-c2r;\n";
+            source<<"float3 d1i = c0i-c2i;\n";
+            source<<"float3 d2r = c1r+c3r;\n";
+            source<<"float3 d2i = c1i+c3i;\n";
+            source<<"float3 d3r = "<<sign<<"*(c1i-c3i);\n";
+            source<<"float3 d3i = "<<sign<<"*(c3r-c1r);\n";
+            source<<"real"<<output<<"[i+3*j*"<<m<<"] = d0r+d2r;\n";
+            source<<"imag"<<output<<"[i+3*j*"<<m<<"] = d0i+d2i;\n";
+            source<<"real"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n";
+            source<<"imag"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n";
+            source<<"real"<<output<<"[i+(3*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(4*L)<<"], d0r-d2r, d0i-d2i);\n";
+            source<<"imag"<<output<<"[i+(3*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(4*L)<<"], d0r-d2r, d0i-d2i);\n";
+            source<<"real"<<output<<"[i+(3*j+3)*"<<m<<"] = "<<multReal<<"(w[j*"<<(3*size)<<"/"<<(4*L)<<"], d1r-d3r, d1i-d3i);\n";
+            source<<"imag"<<output<<"[i+(3*j+3)*"<<m<<"] = "<<multImag<<"(w[j*"<<(3*size)<<"/"<<(4*L)<<"], d1r-d3r, d1i-d3i);\n";
+            source<<"}\n";
+            m = m*4;
+            unfactored /= 4;
+        }
+        else if (unfactored%3 == 0) {
+            L = L/3;
+            source<<"// Pass "<<(stage+1)<<" (radix 3)\n";
+            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
+            source<<"int i = indexInBlock;\n";
+            source<<"int j = i/"<<m<<";\n";
+            source<<"float3 c0r = real"<<input<<"[i];\n";
+            source<<"float3 c0i = imag"<<input<<"[i];\n";
+            source<<"float3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"float3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"float3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"float3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"float3 d0r = c1r+c2r;\n";
+            source<<"float3 d0i = c1i+c2i;\n";
+            source<<"float3 d1r = c0r-0.5f*d0r;\n";
+            source<<"float3 d1i = c0i-0.5f*d0i;\n";
+            source<<"float3 d2r = "<<sign<<"*"<<cu.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n";
+            source<<"float3 d2i = "<<sign<<"*"<<cu.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n";
+            source<<"real"<<output<<"[i+2*j*"<<m<<"] = c0r+d0r;\n";
+            source<<"imag"<<output<<"[i+2*j*"<<m<<"] = c0i+d0i;\n";
+            source<<"real"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
+            source<<"imag"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
+            source<<"real"<<output<<"[i+(2*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(3*L)<<"], d1r-d2r, d1i-d2i);\n";
+            source<<"imag"<<output<<"[i+(2*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(3*L)<<"], d1r-d2r, d1i-d2i);\n";
+            source<<"}\n";
+            m = m*3;
+            unfactored /= 3;
+        }
+        else if (unfactored%2 == 0) {
+            L = L/2;
+            source<<"// Pass "<<(stage+1)<<" (radix 2)\n";
+            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
+            source<<"int i = indexInBlock;\n";
+            source<<"int j = i/"<<m<<";\n";
+            source<<"float3 c0r = real"<<input<<"[i];\n";
+            source<<"float3 c0i = imag"<<input<<"[i];\n";
+            source<<"float3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"float3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"real"<<output<<"[i+j*"<<m<<"] = c0r+c1r;\n";
+            source<<"imag"<<output<<"[i+j*"<<m<<"] = c0i+c1i;\n";
+            source<<"real"<<output<<"[i+(j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n";
+            source<<"imag"<<output<<"[i+(j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n";
+            source<<"}\n";
+            m = m*2;
+            unfactored /= 2;
+        }
+        else
+            throw OpenMMException("Illegal size for FFT: "+cu.intToString(size));
+        // The next pass reads what other threads wrote in this one.
+        source<<"__syncthreads();\n";
+        source<<"}\n";
+        ++stage;
+    }
+    // After an odd number of passes the result is in the scratch buffer, so copy it back.
+    if (stage%2 == 1) {
+        source<<"real0[indexInBlock] = real1[indexInBlock];\n";
+        source<<"imag0[indexInBlock] = imag1[indexInBlock];\n";
+        // The scratch buffer is shared by every generated FFT.  Without a barrier here, the first
+        // pass of an FFT that immediately follows this one could overwrite scratch entries that
+        // other threads have not copied back yet.
+        source<<"__syncthreads();\n";
+    }
+    source<<"}\n";
+    return source.str();
+}
--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
+#ifndef CUDA_RPMD_KERNELS_H_
+#define CUDA_RPMD_KERNELS_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "openmm/RpmdKernels.h"
+#include "CudaContext.h"
+#include "CudaArray.h"
+namespace OpenMM {
+/**
+ * This kernel is invoked by RPMDIntegrator to take one time step, and to get and
+ * set the state of system copies.
+ */
+class CudaIntegrateRPMDStepKernel : public IntegrateRPMDStepKernel {
+public:
+    /**
+     * Create the kernel.  All sizes, device arrays and kernel handles start out zeroed;
+     * they are not usable until initialize() has been called.
+     *
+     * @param name      the name of the kernel
+     * @param platform  the Platform this kernel belongs to
+     * @param cu        the CudaContext the kernel will run in
+     */
+    CudaIntegrateRPMDStepKernel(std::string name, const Platform& platform, CudaContext& cu) :
+            IntegrateRPMDStepKernel(name, platform), cu(cu), numCopies(0), numParticles(0), workgroupSize(0),
+            forces(NULL), positions(NULL), velocities(NULL), pileKernel(NULL), stepKernel(NULL), velocitiesKernel(NULL),
+            copyToContextKernel(NULL), copyFromContextKernel(NULL), translateKernel(NULL) {
+    }
+    ~CudaIntegrateRPMDStepKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param integrator the RPMDIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const RPMDIntegrator& integrator);
+    /**
+     * Execute the kernel.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param integrator     the RPMDIntegrator this kernel is being used for
+     * @param forcesAreValid if the context has been modified since the last time step, this will be
+     *                       false to show that cached forces are invalid and must be recalculated
+     */
+    void execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid);
+    /**
+     * Compute the kinetic energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param integrator     the RPMDIntegrator this kernel is being used for
+     */
+    double computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator);
+    /**
+     * Set the positions of all particles in one copy of the system.
+     *
+     * @param copy      the index of the copy whose positions are being set
+     * @param positions the new position of each particle
+     */
+    void setPositions(int copy, const std::vector<Vec3>& positions);
+    /**
+     * Set the velocities of all particles in one copy of the system.
+     *
+     * @param copy       the index of the copy whose velocities are being set
+     * @param velocities the new velocity of each particle
+     */
+    void setVelocities(int copy, const std::vector<Vec3>& velocities);
+    /**
+     * Copy positions and velocities for one copy into the context.
+     *
+     * @param copy    the index of the copy to transfer
+     * @param context the context to copy the values into
+     */
+    void copyToContext(int copy, ContextImpl& context);
+private:
+    void computeForces(ContextImpl& context);
+    /**
+     * Generate the source code for an FFT of a given length acting on the arrays
+     * whose names start with the given prefix.
+     */
+    std::string createFFT(int size, const std::string& variable, bool forward);
+    CudaContext& cu;
+    int numCopies, numParticles, workgroupSize;
+    // Device arrays holding the forces, positions and velocities of the copies.
+    CudaArray* forces;
+    CudaArray* positions;
+    CudaArray* velocities;
+    CUfunction pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
+};
+} // namespace OpenMM
+#endif /*CUDA_RPMD_KERNELS_H_*/
--- a/plugins/rpmd/platforms/cuda/src/kernels/rpmd.cu
+++ b/plugins/rpmd/platforms/cuda/src/kernels/rpmd.cu
+/**
+ * Real part of the product factor*(re + i*im), where factor.x and factor.y are the
+ * real and imaginary parts of the first operand.
+ */
+__device__ float3 multiplyComplexRealPart(float2 factor, float3 re, float3 im) {
+    float3 product = factor.x*re-factor.y*im;
+    return product;
+}
+/**
+ * Imaginary part of the product factor*(re + i*im), where factor.x and factor.y are the
+ * real and imaginary parts of the first operand.
+ */
+__device__ float3 multiplyComplexImagPart(float2 factor, float3 re, float3 im) {
+    float3 product = factor.x*im+factor.y*re;
+    return product;
+}
+/**
+ * Real part of the product conj(factor)*(re + i*im), where factor.x and factor.y are the
+ * real and imaginary parts of the first operand before conjugation.
+ */
+__device__ float3 multiplyComplexRealPartConj(float2 factor, float3 re, float3 im) {
+    float3 product = factor.x*re+factor.y*im;
+    return product;
+}
+/**
+ * Imaginary part of the product conj(factor)*(re + i*im), where factor.x and factor.y are the
+ * real and imaginary parts of the first operand before conjugation.
+ */
+__device__ float3 multiplyComplexImagPartConj(float2 factor, float3 re, float3 im) {
+    float3 product = factor.x*im-factor.y*re;
+    return product;
+}
+/**
+ * Apply the PILE-L thermostat.
+ *
+ * Threads are organized in groups of NUM_COPIES consecutive threads.  Each group handles one
+ * particle at a time, with one thread per copy.  The velocities of the copies are transformed
+ * with an FFT, each mode is damped and given a random kick, and the result is transformed back.
+ *
+ * @param velm         velocities of all copies; copy c of a particle is at particle+c*PADDED_NUM_ATOMS,
+ *                     and the w component is used as the inverse mass
+ * @param random       array of random values
+ * @param randomIndex  index of the first element of random to use
+ * @param dt           the step size
+ * @param kT           the thermal energy at the simulation temperature
+ * @param friction     the friction coefficient applied to the centroid mode
+ *
+ * NOTE(review): the loop body contains __syncthreads(), so presumably the launch configuration
+ * guarantees that all threads of a thread block execute the same number of iterations - confirm.
+ */
+extern "C" __global__ void applyPileThermostat(float4* velm, float4* random, unsigned int randomIndex,
+        float dt, float kT, float friction) {
+    // Number of groups in the whole grid, which is also the stride of the particle loop.
+    const int numBlocks = blockDim.x*gridDim.x/NUM_COPIES;
+    // First thread of this thread's group, and this thread's position (its copy index) within the group.
+    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
+    const int indexInBlock = threadIdx.x-blockStart;
+    const float nkT = NUM_COPIES*kT;
+    const float twown = 2.0f*nkT/HBAR;
+    // Damping and noise coefficients for the centroid mode over half a step.
+    const float c1_0 = EXP(-0.5f*dt*friction);
+    const float c2_0 = SQRT(1.0f-c1_0*c1_0);
+    // Real parts are stored in the first blockDim.x elements of v and temp, imaginary parts after them.
+    __shared__ float3 v[2*THREAD_BLOCK_SIZE];
+    __shared__ float3 temp[2*THREAD_BLOCK_SIZE];
+    __shared__ float2 w[NUM_COPIES];
+    float3* vreal = &v[blockStart];
+    float3* vimag = &v[blockStart+blockDim.x];
+    // The first group of the thread block fills in the table of twiddle factors used by the FFTs.
+    if (threadIdx.x < NUM_COPIES)
+        w[indexInBlock] = make_float2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+    __syncthreads();
+    // Each group consumes its own run of NUM_COPIES random values.
+    randomIndex += NUM_COPIES*((blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES);
+    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+        float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
+        float invMass = particleVelm.w;
+        float c3_0 = c2_0*SQRT(nkT*invMass);
+        // Forward FFT.
+        // SCALE is applied both here and when storing the result after the inverse FFT.
+        vreal[indexInBlock] = SCALE*make_float3(particleVelm.x, particleVelm.y, particleVelm.z);
+        vimag[indexInBlock] = make_float3(0);
+        __syncthreads();
+        FFT_V_FORWARD
+        // Apply the thermostat.
+        if (indexInBlock == 0) {
+            // Apply a local Langevin thermostat to the centroid mode.
+            float4 rand = random[randomIndex];
+            vreal[0] = vreal[0]*c1_0 + c3_0*make_float3(rand.x, rand.y, rand.z);
+        }
+        else {
+            // Use critical damping white noise for the remaining modes.
+            // Modes indexInBlock and NUM_COPIES-indexInBlock share the same k, so they use the same
+            // pair of random values, with the imaginary part negated for the upper half.
+            int k = (indexInBlock <= NUM_COPIES/2 ? indexInBlock : NUM_COPIES-indexInBlock);
+            // For an even number of copies, the middle mode gets no imaginary noise and a larger real noise.
+            const bool isCenter = (NUM_COPIES%2 == 0 && k == NUM_COPIES/2);
+            const float wk = twown*sin(k*M_PI/NUM_COPIES);
+            const float c1 = EXP(-wk*dt);
+            const float c2 = SQRT((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f);
+            const float c3 = c2*SQRT(nkT*invMass);
+            float4 rand1 = c3*random[randomIndex+k];
+            float4 rand2 = (isCenter ? make_float4(0) : c3*random[randomIndex+NUM_COPIES-k]);
+            vreal[indexInBlock] = c1*vreal[indexInBlock] + make_float3(rand1.x, rand1.y, rand1.z);
+            vimag[indexInBlock] = c1*vimag[indexInBlock] + (indexInBlock < NUM_COPIES/2 ? make_float3(rand2.x, rand2.y, rand2.z) : make_float3(-rand2.x, -rand2.y, -rand2.z));
+        }
+        __syncthreads();
+        // Inverse FFT.
+        FFT_V_BACKWARD
+        // Store the new velocity, keeping the original w component.
+        velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_float4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w);
+        randomIndex += blockDim.x*gridDim.x;
+    }
+}
+/**
+ * Advance the positions and velocities.
+ *
+ * The velocities are first advanced by half a step using the forces.  Then the positions and
+ * velocities of the copies of each particle are transformed with an FFT, every mode is
+ * propagated through one step, and the result is transformed back.
+ *
+ * Threads are organized in groups of NUM_COPIES consecutive threads.  Each group handles one
+ * particle at a time, with one thread per copy.
+ *
+ * @param posq   positions of all copies; copy c of a particle is at particle+c*PADDED_NUM_ATOMS
+ * @param velm   velocities of all copies, laid out like posq; the w component multiplies the force
+ *               in the velocity update
+ * @param force  forces on all copies as fixed point values; each copy occupies 3*PADDED_NUM_ATOMS
+ *               elements, with the x, y and z components stored PADDED_NUM_ATOMS elements apart
+ * @param dt     the step size
+ * @param kT     the thermal energy at the simulation temperature
+ *
+ * NOTE(review): the second loop contains __syncthreads(), so presumably the launch configuration
+ * guarantees that all threads of a thread block execute the same number of iterations - confirm.
+ */
+extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long* force, float dt, float kT) {
+    // Number of groups in the whole grid, which is also the stride of the particle loops.
+    const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
+    // First thread of this thread's group, and this thread's position (its copy index) within the group.
+    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
+    const int indexInBlock = threadIdx.x-blockStart;
+    const float nkT = NUM_COPIES*kT;
+    const float twown = 2.0f*nkT/HBAR;
+    // Converts the fixed point force values to floating point.
+    const float forceScale = 1/(float) 0xFFFFFFFF;
+    // Real parts are stored in the first blockDim.x elements of q, v and temp, imaginary parts after them.
+    __shared__ float3 q[2*THREAD_BLOCK_SIZE];
+    __shared__ float3 v[2*THREAD_BLOCK_SIZE];
+    __shared__ float3 temp[2*THREAD_BLOCK_SIZE];
+    __shared__ float2 w[NUM_COPIES];
+    // Update velocities.
+    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+        int index = particle+indexInBlock*PADDED_NUM_ATOMS;
+        int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
+        float4 particleVelm = velm[index];
+        particleVelm.x += forceScale*force[forceIndex]*(0.5f*dt*particleVelm.w);
+        particleVelm.y += forceScale*force[forceIndex+PADDED_NUM_ATOMS]*(0.5f*dt*particleVelm.w);
+        particleVelm.z += forceScale*force[forceIndex+PADDED_NUM_ATOMS*2]*(0.5f*dt*particleVelm.w);
+        velm[index] = particleVelm;
+    }
+    // Evolve the free ring polymer by transforming to the frequency domain.
+    float3* qreal = &q[blockStart];
+    float3* qimag = &q[blockStart+blockDim.x];
+    float3* vreal = &v[blockStart];
+    float3* vimag = &v[blockStart+blockDim.x];
+    // The first group of the thread block fills in the table of twiddle factors used by the FFTs.
+    if (threadIdx.x < NUM_COPIES)
+        w[indexInBlock] = make_float2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+    __syncthreads();
+    // Each thread reads back exactly the velm element it wrote in the loop above.
+    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+        float4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
+        float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
+        // Forward FFT.
+        // SCALE is applied both here and when storing the results after the inverse FFTs.
+        qreal[indexInBlock] = SCALE*make_float3(particlePosq.x, particlePosq.y, particlePosq.z);
+        qimag[indexInBlock] = make_float3(0);
+        vreal[indexInBlock] = SCALE*make_float3(particleVelm.x, particleVelm.y, particleVelm.z);
+        vimag[indexInBlock] = make_float3(0);
+        __syncthreads();
+        FFT_Q_FORWARD
+        FFT_V_FORWARD
+        // Propagate every mode of the free ring polymer through one step.
+        if (indexInBlock == 0) {
+            // Mode 0 just moves with constant velocity.
+            qreal[0] += vreal[0]*dt;
+            qimag[0] += vimag[0]*dt;
+        }
+        else {
+            // Every other mode is rotated through the angle wk*dt in its (position, velocity) plane.
+            const float wk = twown*sin(indexInBlock*M_PI/NUM_COPIES);
+            const float wt = wk*dt;
+            const float coswt = cos(wt);
+            const float sinwt = sin(wt);
+            const float3 vprimereal = vreal[indexInBlock]*coswt - qreal[indexInBlock]*(wk*sinwt); // Advance velocity from t to t+dt
+            const float3 vprimeimag = vimag[indexInBlock]*coswt - qimag[indexInBlock]*(wk*sinwt);
+            qreal[indexInBlock] = vreal[indexInBlock]*(sinwt/wk) + qreal[indexInBlock]*coswt; // Advance position from t to t+dt
+            qimag[indexInBlock] = vimag[indexInBlock]*(sinwt/wk) + qimag[indexInBlock]*coswt;
+            vreal[indexInBlock] = vprimereal;
+            vimag[indexInBlock] = vprimeimag;
+        }
+        __syncthreads();
+        // Inverse FFT.
+        FFT_Q_BACKWARD
+        FFT_V_BACKWARD
+        // Store the new position and velocity, keeping the original w components.
+        posq[particle+indexInBlock*PADDED_NUM_ATOMS] = make_float4(SCALE*qreal[indexInBlock].x, SCALE*qreal[indexInBlock].y, SCALE*qreal[indexInBlock].z, particlePosq.w);
+        velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_float4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w);
+    }
+}
+/**
+ * Advance the velocities of every copy by half a step using the current forces.
+ *
+ * Threads are organized in groups of NUM_COPIES consecutive threads; each group handles one
+ * particle at a time, with one thread per copy.  The forces are fixed point values, and the
+ * w component of each velocity multiplies the force.
+ */
+extern "C" __global__ void advanceVelocities(float4* velm, long long* force, float dt) {
+    const float forceScale = 1/(float) 0xFFFFFFFF;
+    const int groupsInGrid = (blockDim.x*gridDim.x)/NUM_COPIES;
+    const int copy = threadIdx.x%NUM_COPIES;
+    int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES;
+    while (particle < NUM_ATOMS) {
+        const int velIndex = particle+copy*PADDED_NUM_ATOMS;
+        const int fx = particle+copy*PADDED_NUM_ATOMS*3;
+        const int fy = fx+PADDED_NUM_ATOMS;
+        const int fz = fx+PADDED_NUM_ATOMS*2;
+        float4 vel = velm[velIndex];
+        // The same factor applies to all three components.
+        const float halfStep = 0.5f*dt*vel.w;
+        vel.x += forceScale*force[fx]*halfStep;
+        vel.y += forceScale*force[fy]*halfStep;
+        vel.z += forceScale*force[fz]*halfStep;
+        velm[velIndex] = vel;
+        particle += groupsInGrid;
+    }
+}
+/**
+ * Copy the per-atom values of one copy from the integrator's arrays to the context.
+ * Element i of dst receives element order[i] of the selected copy.
+ */
+extern "C" __global__ void copyToContext(float4* src, float4* dst, int* order, int copy) {
+    const int copyStart = copy*PADDED_NUM_ATOMS;
+    const unsigned int numThreads = blockDim.x*gridDim.x;
+    int particle = blockIdx.x*blockDim.x+threadIdx.x;
+    while (particle < NUM_ATOMS) {
+        const int source = copyStart+order[particle];
+        dst[particle] = src[source];
+        particle += numThreads;
+    }
+}
+/**
+ * Copy the per-atom forces from the context to one copy in the integrator's arrays.
+ * The three components of each force are stored PADDED_NUM_ATOMS elements apart, and
+ * element i of src is written to position order[i] of the selected copy.
+ */
+extern "C" __global__ void copyFromContext(long long* src, long long* dst, int* order, int copy) {
+    const int copyStart = copy*PADDED_NUM_ATOMS*3;
+    const unsigned int numThreads = blockDim.x*gridDim.x;
+    int particle = blockIdx.x*blockDim.x+threadIdx.x;
+    while (particle < NUM_ATOMS) {
+        const int target = copyStart+order[particle];
+        for (int component = 0; component < 3; component++)
+            dst[target+PADDED_NUM_ATOMS*component] = src[particle+PADDED_NUM_ATOMS*component];
+        particle += numThreads;
+    }
+}
+/**
+ * Shift every copy of each atom by the displacement between its entry in movedPos and the
+ * position currently stored for copy movedCopy, so all copies are offset by the same number
+ * of periodic box widths.
+ */
+extern "C" __global__ void applyCellTranslations(float4* posq, float4* movedPos, int* order, int movedCopy) {
+    const unsigned int numThreads = blockDim.x*gridDim.x;
+    int particle = blockIdx.x*blockDim.x+threadIdx.x;
+    while (particle < NUM_ATOMS) {
+        const int atom = order[particle];
+        // Compute the displacement once, before any copy (including movedCopy) is modified.
+        float4 shift = movedPos[particle]-posq[movedCopy*PADDED_NUM_ATOMS+atom];
+        int copy = 0;
+        while (copy < NUM_COPIES) {
+            posq[copy*PADDED_NUM_ATOMS+atom] += shift;
+            copy++;
+        }
+        particle += numThreads;
+    }
+}
--- a/plugins/rpmd/platforms/cuda/tests/CMakeLists.txt
+++ b/plugins/rpmd/platforms/cuda/tests/CMakeLists.txt
+#
+# Testing
+#
+ENABLE_TESTING()
+# Add the CUDA include directory to the header search path for the tests.
+INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIR})
+# Automatically create tests using files in this directory whose names match "*Test*.cpp"
+FILE(GLOB TEST_PROGS "*Test*.cpp")
+FOREACH(TEST_PROG ${TEST_PROGS})
+    # The test is named after the source file, without directory or extension.
+    GET_FILENAME_COMPONENT(TEST_ROOT ${TEST_PROG} NAME_WE)
+    # Link with shared library
+    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
+    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_RPMD_TARGET})
+    # Register the executable so it is run as part of the test suite.
+    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
+ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
+++ b/plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the CUDA implementation of RPMDIntegrator.
+ */
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "openmm/CustomNonbondedForce.h"
+#include "openmm/HarmonicBondForce.h"
+#include "openmm/Platform.h"
+#include "openmm/System.h"
+#include "openmm/RPMDIntegrator.h"
+#include "SimTKUtilities/SimTKOpenMMUtilities.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+/**
+ * Simulate a set of particles with no forces acting on them, and check that the mean kinetic
+ * energy of a copy and the mean radius of gyration of the copies of a particle agree with
+ * the values expected for the given temperature.
+ */
+void testFreeParticles() {
+    const int numParticles = 100;
+    const int numCopies = 30;
+    const double temperature = 300.0;
+    const double mass = 1.0;
+    System system;
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(mass);
+    RPMDIntegrator integ(numCopies, temperature, 10.0, 0.001);
+    Platform& platform = Platform::getPlatformByName("CUDA");
+    Context context(system, integ, platform);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numCopies; i++)
+    {
+        for (int j = 0; j < numParticles; j++) {
+            // Draw the coordinates in separate statements.  The order in which the arguments of
+            // a call are evaluated is unspecified, so generating them inside the constructor
+            // call would give different starting positions with different compilers.
+            const double x = 0.02*genrand_real2(sfmt);
+            const double y = 0.02*genrand_real2(sfmt);
+            const double z = 0.02*genrand_real2(sfmt);
+            positions[j] = Vec3(x, y, z);
+        }
+        integ.setPositions(i, positions);
+    }
+    const int numSteps = 1000;
+    // Let the system equilibrate before collecting any data.
+    integ.step(1000);
+    vector<double> ke(numCopies, 0.0);
+    vector<double> rg(numParticles, 0.0);
+    const RealOpenMM hbar = 1.054571628e-34*AVOGADRO/(1000*1e-12);
+    // Every element is overwritten on each step, so the vector can be reused.
+    vector<State> state(numCopies);
+    for (int i = 0; i < numSteps; i++) {
+        integ.step(1);
+        for (int j = 0; j < numCopies; j++)
+            state[j] = integ.getState(j, State::Positions | State::Velocities);
+        for (int j = 0; j < numParticles; j++) {
+            // Accumulate the kinetic energy of each copy, and the squared distance between
+            // every pair of copies of this particle.
+            double rg2 = 0.0;
+            for (int k = 0; k < numCopies; k++) {
+                Vec3 v = state[k].getVelocities()[j];
+                ke[k] += 0.5*mass*v.dot(v);
+                for (int m = 0; m < numCopies; m++) {
+                    Vec3 delta = state[k].getPositions()[j]-state[m].getPositions()[j];
+                    rg2 += delta.dot(delta);
+                }
+            }
+            rg[j] += rg2/(2*numCopies*numCopies);
+        }
+    }
+    // Average the kinetic energy over copies and steps.
+    double meanKE = 0.0;
+    for (int i = 0; i < numCopies; i++)
+        meanKE += ke[i];
+    meanKE /= numSteps*numCopies;
+    double expectedKE = 0.5*numCopies*numParticles*3*BOLTZ*temperature;
+    ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2);
+    // Average the squared radius of gyration over particles and steps.
+    double meanRg2 = 0.0;
+    for (int i = 0; i < numParticles; i++)
+        meanRg2 += rg[i];
+    meanRg2 /= numSteps*numParticles;
+    double expectedRg = hbar/(2*sqrt(mass*BOLTZ*temperature));
+    ASSERT_USUALLY_EQUAL_TOL(expectedRg, sqrt(meanRg2), 1e-3);
+}
+/**
+ * Simulate a periodic box of para-hydrogen with RPMD on the CUDA platform and compare
+ * two sampled quantities against expected values:
+ *  - the radial distribution function of copy 0, histogrammed with bin width
+ *    boxSize/numBins and checked for the first numBins/2 bins against a reference table;
+ *  - the kinetic energy estimate 1.5*temperature plus the quantum contribution computed
+ *    from the forces and the displacement of each copy from the centroid, divided by BOLTZ.
+ */
+void testParaHydrogen() {
+    const int numParticles = 32;
+    const int numCopies = 12;
+    const double temperature = 25.0;
+    const double mass = 2.0;
+    const double boxSize = 1.1896;
+    const int numSteps = 1000;
+    const int numBins = 200;
+    // Reference radial distribution function, one value for each of the first numBins/2 bins.
+    const double reference[] = {
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.0, 0.0, 0.0, 4.932814042206152e-5, 1.244331241336431e-4, 4.052316284060125e-4,
+        1.544810863683946e-3, 4.376197806690222e-3, 1.025847561714293e-2, 2.286702037465422e-2,
+        4.371052180263602e-2, 7.518538770734748e-2, 0.122351534531647, 0.185758975626622,
+        0.266399984652322, 0.363380262153250, 0.473696401293219, 0.595312098494172,
+        0.726049519422861, 0.862264551954547, 0.991102029379444, 1.1147503922535,
+        1.23587006992066, 1.33495411932817, 1.42208208736987, 1.49273884004107,
+        1.54633319690403, 1.58714702233941, 1.60439217751355, 1.61804190608902,
+        1.60680198476058, 1.58892222973695, 1.56387607986781, 1.52629494593350,
+        1.48421439018970, 1.43656176771959, 1.38752775598872, 1.33310695719931,
+        1.28363477223121, 1.23465642750248, 1.18874848666326, 1.14350496170519,
+        1.10292486009936, 1.06107270157688, 1.02348927970441, 0.989729345271297,
+        0.959273446941802, 0.932264875865758, 0.908818658748942, 0.890946420768315,
+        0.869332737718165, 0.856401736350349, 0.842370069917020, 0.834386614237393,
+        0.826268072171045, 0.821547250199453, 0.818786865315836, 0.819441757028076,
+        0.819156933383128, 0.822275325148621, 0.828919078023881, 0.837233720599450,
+        0.846961908186718, 0.855656955481099, 0.864520333201247, 0.876082425547566,
+        0.886950044046000, 0.900275658318995
+    };
+    // Create a box of para-hydrogen.
+    System system;
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(mass);
+    system.setDefaultPeriodicBoxVectors(Vec3(boxSize,0,0), Vec3(0,boxSize,0), Vec3(0,0,boxSize));
+    CustomNonbondedForce* nb = new CustomNonbondedForce("2625.49963*(exp(1.713-1.5671*p-0.00993*p*p)-(12.14/p^6+215.2/p^8-143.1/p^9+4813.9/p^10)*(step(rc-p)*exp(-(rc/p-1)^2)+1-step(rc-p))); p=r/0.05291772108; rc=8.32");
+    nb->setNonbondedMethod(CustomNonbondedForce::CutoffPeriodic);
+    nb->setCutoffDistance(boxSize/2);
+    // The parameter list stays empty: no per-particle values are passed to the force.
+    vector<double> params;
+    for (int i = 0; i < numParticles; i++)
+        nb->addParticle(params);
+    system.addForce(nb);
+    // Integrator arguments: number of copies, temperature, friction coefficient, step size.
+    RPMDIntegrator integ(numCopies, temperature, 10.0, 0.0005);
+    Platform& platform = Platform::getPlatformByName("CUDA");
+    Context context(system, integ, platform);
+    // Give every copy the same random starting configuration (the generator is seeded with 0),
+    // then run 1000 steps before any data is collected.
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numParticles; i++)
+        positions[i] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
+    for (int i = 0; i < numCopies; i++)
+        integ.setPositions(i, positions);
+    integ.step(1000);
+    // Simulate it, taking one sample every 20 steps.
+    vector<int> counts(numBins, 0);
+    const double invBoxSize = 1.0/boxSize;
+    double meanKE = 0.0;
+    for (int step = 0; step < numSteps; step++) {
+        integ.step(20);
+        vector<State> states(numCopies);
+        for (int i = 0; i < numCopies; i++)
+            states[i] = integ.getState(i, State::Positions | State::Forces);
+        // Record the radial distribution function from copy 0 only.
+        const vector<Vec3>& pos = states[0].getPositions();
+        for (int j = 0; j < numParticles; j++)
+            for (int k = 0; k < j; k++) {
+                // Shift each component by a whole number of box lengths so it lies within half a box of zero.
+                Vec3 delta = pos[j]-pos[k];
+                delta[0] -= floor(delta[0]*invBoxSize+0.5)*boxSize;
+                delta[1] -= floor(delta[1]*invBoxSize+0.5)*boxSize;
+                delta[2] -= floor(delta[2]*invBoxSize+0.5)*boxSize;
+                double dist = sqrt(delta.dot(delta));
+                // dist is at most sqrt(3)/2*boxSize after the shift, so bin stays below numBins.
+                int bin = (int) (numBins*(dist/boxSize));
+                counts[bin]++;
+            }
+        // Calculate the quantum contribution to the kinetic energy.
+        // First find the centroid (mean position over all copies) of every particle.
+        vector<Vec3> centroids(numParticles, Vec3());
+        for (int i = 0; i < numCopies; i++) {
+            const vector<Vec3>& copyPos = states[i].getPositions();
+            for (int j = 0; j < numParticles; j++)
+                centroids[j] += copyPos[j];
+        }
+        for (int j = 0; j < numParticles; j++)
+            centroids[j] *= 1.0/numCopies;
+        // Then accumulate (centroid - position) . force over all copies and particles.
+        double ke = 0.0;
+        for (int i = 0; i < numCopies; i++) {
+            const vector<Vec3>& copyPos = states[i].getPositions();
+            const vector<Vec3>& f = states[i].getForces();
+            for (int j = 0; j < numParticles; j++) {
+                Vec3 delta = centroids[j]-copyPos[j];
+                ke += delta.dot(f[j]);
+            }
+        }
+        meanKE += ke/(2*numCopies*numParticles);
+    }
+    // Check against expected values.
+    // scale/volume normalizes each bin count by the number of samples, the number of particles,
+    // the box volume and the volume of the spherical shell the bin covers.
+    double scale = (boxSize*boxSize*boxSize)/(numSteps*0.5*numParticles*numParticles);
+    for (int i = 0; i < numBins/2; i++) {
+        double r1 = i*boxSize/numBins;
+        double r2 = (i+1)*boxSize/numBins;
+        double volume = (4.0/3.0)*M_PI*(r2*r2*r2-r1*r1*r1);
+        ASSERT_USUALLY_EQUAL_TOL(reference[i], scale*counts[i]/volume, 0.1);
+    }
+    // Average over samples and divide by BOLTZ so the result is comparable to 1.5*temperature.
+    meanKE /= numSteps*BOLTZ;
+    ASSERT_USUALLY_EQUAL_TOL(60.0, 1.5*temperature+meanKE, 0.02);
+}
+/**
+ * Test driver: load the platform plugins from the default plugins directory and run the
+ * RPMD tests. Returns 1 if a std::exception escapes from any of them, 0 otherwise.
+ */
+int main() {
+    try {
+        const std::string& pluginDir = Platform::getDefaultPluginsDirectory();
+        Platform::loadPluginsFromDirectory(pluginDir);
+        testFreeParticles();
+        testParaHydrogen();
+    }
+    catch (const std::exception& error) {
+        // Report the cause first, then the failure banner.
+        std::cout << "exception: " << error.what() << std::endl
+                  << "FAIL - ERROR.  Test failed." << std::endl;
+        return 1;
+    }
+    std::cout << "Done" << std::endl;
+    return 0;
+}
--- a/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.cpp
@@ -98,10 +98,7 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
 }
 void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
-    const System& system = context.getSystem();
-    const int paddedParticles = cl.getPaddedNumAtoms();
    OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
    if (!hasInitializedKernel) {
        hasInitializedKernel = true;
        pileKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());

--- a/plugins/rpmd/platforms/opencl/src/kernels/rpmd.cl
+++ b/plugins/rpmd/platforms/opencl/src/kernels/rpmd.cl
@@ -107,7 +107,6 @@ __kernel void integrateStep(__global float4* posq, __global float4* velm, __glob
    for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        float4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
        float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
-        float invMass = particleVelm.w;
        // Forward FFT.
@@ -130,7 +129,6 @@ __kernel void integrateStep(__global float4* posq, __global float4* velm, __glob
            const float wt = wk*dt;
            const float coswt = cos(wt);
            const float sinwt = sin(wt);
-            const float wm = wk/particleVelm.w;
            const float4 vprimereal = vreal[indexInBlock]*coswt - qreal[indexInBlock]*(wk*sinwt); // Advance velocity from t to t+dt
            const float4 vprimeimag = vimag[indexInBlock]*coswt - qimag[indexInBlock]*(wk*sinwt);
            qreal[indexInBlock] = vreal[indexInBlock]*(sinwt/wk) + qreal[indexInBlock]*coswt; // Advance position from t to t+dt