Very early beginnings of new CUDA platform

abb8cb4b · Peter Eastman · 18501459 · abb8cb4b · abb8cb4b · abb8cb4b
Commit abb8cb4b authored Jun 04, 2012 by Peter Eastman
20 changed files
--- a/platforms/cuda2/CMakeLists.txt
+++ b/platforms/cuda2/CMakeLists.txt
+#---------------------------------------------------
+# OpenMM CUDA Platform
+#
+# Creates OpenMM library, base name=OpenMMCUDA.
+# Default libraries are shared & optimized. Variants
+# are created for static (_static) and debug (_d).
+#
+# Windows:
+#   OpenMMCUDA[_d].dll
+#   OpenMMCUDA[_d].lib
+#   OpenMMCUDA_static[_d].lib
+# Unix:
+#   libOpenMMCUDA[_d].so
+#   libOpenMMCUDA_static[_d].a
+#----------------------------------------------------
+IF (APPLE)
+    SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.6")
+    SET (CMAKE_OSX_SYSROOT "/Developer/SDKs/MacOSX10.6.sdk")
+ENDIF (APPLE)
+set(OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL "Whether to build CUDA test cases")
+if(OPENMM_BUILD_CUDA_TESTS)
+    SUBDIRS (tests)
+endif(OPENMM_BUILD_CUDA_TESTS)
+# The source is organized into subdirectories, but we handle them all from
+# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
+SET(OPENMM_SOURCE_SUBDIRS .)
+# Collect up information about the version of the OpenMM library we're building
+# and make it available to the code so it can be built into the binaries.
+SET(OPENMMCUDA_LIBRARY_NAME OpenMMCUDA)
+SET(SHARED_TARGET ${OPENMMCUDA_LIBRARY_NAME})
+SET(STATIC_TARGET ${OPENMMCUDA_LIBRARY_NAME}_static)
+# Ensure that debug libraries have "_d" appended to their names.
+# CMake gets this right on Windows automatically with this definition.
+IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+    SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
+ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+# But on Unix or Cygwin we have to add the suffix manually
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(SHARED_TARGET ${SHARED_TARGET}_d)
+    SET(STATIC_TARGET ${STATIC_TARGET}_d)
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+# These are all the places to search for header files which are
+# to be part of the API.
+SET(API_INCLUDE_DIRS) # start empty
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    # append
+    SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS}
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
+ENDFOREACH(subdir)
+# We'll need both *relative* path names, starting with their API_INCLUDE_DIRS,
+# and absolute pathnames.
+SET(API_REL_INCLUDE_FILES)   # start these out empty
+SET(API_ABS_INCLUDE_FILES)
+FOREACH(dir ${API_INCLUDE_DIRS})
+    FILE(GLOB fullpaths ${dir}/*.h)	# returns full pathnames
+    SET(API_ABS_INCLUDE_FILES ${API_ABS_INCLUDE_FILES} ${fullpaths})
+    FOREACH(pathname ${fullpaths})
+        GET_FILENAME_COMPONENT(filename ${pathname} NAME)
+        SET(API_REL_INCLUDE_FILES ${API_REL_INCLUDE_FILES} ${dir}/${filename})
+    ENDFOREACH(pathname)
+ENDFOREACH(dir)
+# collect up source files
+SET(SOURCE_FILES) # empty
+SET(SOURCE_INCLUDE_FILES)
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    FILE(GLOB_RECURSE src_files  ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.c)
+    FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
+    SET(SOURCE_FILES         ${SOURCE_FILES}         ${src_files})   #append
+    SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files})
+    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
+ENDFOREACH(subdir)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+# Set variables needed for encoding kernel sources into a C++ class
+SET(CUDA_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
+SET(CUDA_SOURCE_CLASS CudaKernelSources)
+SET(CUDA_KERNELS_CPP ${CMAKE_CURRENT_BINARY_DIR}/src/${CUDA_SOURCE_CLASS}.cpp)
+SET(CUDA_KERNELS_H ${CMAKE_CURRENT_BINARY_DIR}/src/${CUDA_SOURCE_CLASS}.h)
+SET(SOURCE_FILES ${SOURCE_FILES} ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H})
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/src)
+SUBDIRS (sharedTarget)
--- a/platforms/cuda2/EncodeCUDAFiles.cmake
+++ b/platforms/cuda2/EncodeCUDAFiles.cmake
+FILE(GLOB CUDA_KERNELS ${CUDA_SOURCE_DIR}/kernels/*.cu)
+SET(CUDA_FILE_DECLARATIONS)
+SET(CUDA_FILE_DEFINITIONS)
+CONFIGURE_FILE(${CUDA_SOURCE_DIR}/${CUDA_SOURCE_CLASS}.cpp.in ${CUDA_KERNELS_CPP})
+FOREACH(file ${CUDA_KERNELS})
+    # Load the file contents and process it.
+    FILE(STRINGS ${file} file_content NEWLINE_CONSUME)
+    # Replace all backslashes by double backslashes as they are being put in a C string.
+    # Be careful not to replace the backslash before a semicolon as that is the CMAKE
+    # internal escaping of a semicolon to prevent it from acting as a list seperator.
+    STRING(REGEX REPLACE "\\\\([^;])" "\\\\\\\\\\1" file_content "${file_content}")
+    # Escape double quotes as being put in a C string.
+    STRING(REPLACE "\"" "\\\"" file_content "${file_content}")
+    # Split in separate C strings for each line.
+    STRING(REPLACE "\n" "\\n\"\n\"" file_content "${file_content}")
+    # Determine a name for the variable that will contain this file's contents
+    FILE(RELATIVE_PATH filename ${CUDA_SOURCE_DIR}/kernels ${file})
+    STRING(LENGTH ${filename} filename_length)
+    MATH(EXPR filename_length ${filename_length}-3)
+    STRING(SUBSTRING ${filename} 0 ${filename_length} variable_name)
+    # Record the variable declaration and definition.
+    SET(CUDA_FILE_DECLARATIONS ${CUDA_FILE_DECLARATIONS}static\ const\ std::string\ ${variable_name};\n)
+    FILE(APPEND ${CUDA_KERNELS_CPP} const\ string\ ${CUDA_SOURCE_CLASS}::${variable_name}\ =\ \"${file_content}\"\;\n)
+ENDFOREACH(file)
+CONFIGURE_FILE(${CUDA_SOURCE_DIR}/${CUDA_SOURCE_CLASS}.h.in ${CUDA_KERNELS_H})
--- a/platforms/cuda2/include/CudaKernelFactory.h
+++ b/platforms/cuda2/include/CudaKernelFactory.h
+#ifndef OPENMM_CUDAKERNELFACTORY_H_
+#define OPENMM_CUDAKERNELFACTORY_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/KernelFactory.h"
+namespace OpenMM {
+/**
+ * This KernelFactory creates all kernels for CudaPlatform.
+ */
+class CudaKernelFactory : public KernelFactory {
+public:
+    KernelImpl* createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const;
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAKERNELFACTORY_H_*/
--- a/platforms/cuda2/include/CudaPlatform.h
+++ b/platforms/cuda2/include/CudaPlatform.h
+#ifndef OPENMM_CUDAPLATFORM_H_
+#define OPENMM_CUDAPLATFORM_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/Platform.h"
+#include "openmm/System.h"
+namespace OpenMM {
+class CudaContext;
+/**
+ * This Platform subclass uses CUDA implementations of the OpenMM kernels.
+ */
+class OPENMM_EXPORT CudaPlatform : public Platform {
+public:
+    class PlatformData;
+    CudaPlatform();
+    const std::string& getName() const {
+        static const std::string name = "CUDA";
+        return name;
+    }
+    double getSpeed() const {
+        return 100;
+    }
+    bool supportsDoublePrecision() const;
+    const std::string& getPropertyValue(const Context& context, const std::string& property) const;
+    void setPropertyValue(Context& context, const std::string& property, const std::string& value) const;
+    void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
+    void contextDestroyed(ContextImpl& context) const;
+    /**
+     * This is the name of the parameter for selecting which CUDA device or devices to use.
+     */
+    static const std::string& CudaDeviceIndex() {
+        static const std::string key = "CudaDeviceIndex";
+        return key;
+    }
+    /**
+     * This is the name of the parameter for selecting whether CUDA should sync or spin loop while waiting for results.
+     */
+    static const std::string& CudaUseBlockingSync() {
+        static const std::string key = "CudaUseBlockingSync";
+        return key;
+    }
+    /**
+     * This is the name of the parameter for selecting what numerical precision to use.
+     */
+    static const std::string& CudaPrecision() {
+        static const std::string key = "CudaPrecision";
+        return key;
+    }
+    /**
+     * This is the name of the parameter for specifying the path to the CUDA compiler.
+     */
+    static const std::string& CudaCompiler() {
+        static const std::string key = "CudaCompiler";
+        return key;
+    }
+    /**
+     * This is the name of the parameter for specifying the path to the directory for creating temporary files.
+     */
+    static const std::string& CudaTempDirectory() {
+        static const std::string key = "CudaTempDirectory";
+        return key;
+    }
+};
+class OPENMM_EXPORT CudaPlatform::PlatformData {
+public:
+    PlatformData(const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
+            const std::string& compilerProperty, const std::string& tempProperty);
+    ~PlatformData();
+    void initializeContexts(const System& system);
+    void syncContexts();
+    std::vector<CudaContext*> contexts;
+    std::vector<double> contextEnergy;
+    bool removeCM;
+    int cmMotionFrequency;
+    int stepCount, computeForceCount;
+    double time;
+    std::map<std::string, std::string> propertyValues;
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAPLATFORM_H_*/
--- a/platforms/cuda2/sharedTarget/CMakeLists.txt
+++ b/platforms/cuda2/sharedTarget/CMakeLists.txt
+#
+# Include CUDA related files.
+#
+INCLUDE(FindCUDA)
+INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
+FILE(GLOB CUDA_KERNELS ${CUDA_SOURCE_DIR}/kernels/*.cu)
+ADD_CUSTOM_COMMAND(OUTPUT ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H}
+    COMMAND ${CMAKE_COMMAND}
+    ARGS -D CUDA_SOURCE_DIR=${CUDA_SOURCE_DIR} -D CUDA_KERNELS_CPP=${CUDA_KERNELS_CPP} -D CUDA_KERNELS_H=${CUDA_KERNELS_H} -D CUDA_SOURCE_CLASS=${CUDA_SOURCE_CLASS} -P ${CMAKE_CURRENT_SOURCE_DIR}/../EncodeCUDAFiles.cmake
+    DEPENDS ${CUDA_KERNELS}
+)
+SET_SOURCE_FILES_PROPERTIES(${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H} PROPERTIES GENERATED TRUE)
+ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME}_d)
+ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUDA_LIBRARIES} ${PTHREADS_LIB})
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY")
+INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/platforms/cuda2/src/CudaArray.cpp
+++ b/platforms/cuda2/src/CudaArray.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaArray.h"
+#include <iostream>
+#include <sstream>
+#include <vector>
+using namespace OpenMM;
+CudaArray::CudaArray(int size, int elementSize, const std::string& name) :
+        size(size), elementSize(elementSize), name(name), ownsMemory(true) {
+    CUresult result = cuMemAlloc(&pointer, size*elementSize);
+    if (result != CUDA_SUCCESS) {
+        std::stringstream str;
+        str<<"Error creating array "<<name<<": "<<result;
+        throw OpenMMException(str.str());
+    }
+}
+CudaArray::~CudaArray() {
+    if (ownsMemory) {
+        CUresult result = cuMemFree(pointer);
+        if (result != CUDA_SUCCESS) {
+            std::stringstream str;
+            str<<"Error deleting array "<<name<<": "<<result;
+            throw OpenMMException(str.str());
+        }
+    }
+}
+void CudaArray::upload(void* data, bool blocking) {
+    CUresult result;
+    if (blocking)
+        result = cuMemcpyHtoD(pointer, data, size*elementSize);
+    else
+        result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, 0);
+    if (result != CUDA_SUCCESS) {
+        std::stringstream str;
+        str<<"Error uploading array "<<name<<": "<<result;
+        throw OpenMMException(str.str());
+    }
+}
+void CudaArray::download(void* data, bool blocking) const {
+    CUresult result;
+    if (blocking)
+        result = cuMemcpyDtoH(data, pointer, size*elementSize);
+    else
+        result = cuMemcpyDtoHAsync(data, pointer, size*elementSize, 0);
+    if (result != CUDA_SUCCESS) {
+        std::stringstream str;
+        str<<"Error downloading array "<<name<<": "<<result;
+        throw OpenMMException(str.str());
+    }
+}
--- a/platforms/cuda2/src/CudaArray.h
+++ b/platforms/cuda2/src/CudaArray.h
+#ifndef OPENMM_CUDAARRAY_H_
+#define OPENMM_CUDAARRAY_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/OpenMMException.h"
+#include <cuda.h>
+#include <iostream>
+#include <sstream>
+#include <vector>
+namespace OpenMM {
+/**
+ * This class encapsulates a block of CUDA device memory.  It provides a simplified API
+ * for working with it and for copying data to and from device memory.
+ */
+class CudaArray {
+public:
+    /**
+     * Create a CudaArray object.  The object is allocated on the heap with the "new" operator.
+     * The template argument is the data type of each array element.
+     *
+     * @param size              the number of elements in the array
+     * @param name              the name of the array
+     */
+    template <class T>
+    static CudaArray* create(int size, const std::string& name) {
+        return new CudaArray(size, sizeof(T), name);
+    }
+    /**
+     * Create a CudaArray object.
+     *
+     * @param size              the number of elements in the array
+     * @param elementSize       the size of each element in bytes
+     * @param name              the name of the array
+     */
+    CudaArray(int size, int elementSize, const std::string& name);
+    ~CudaArray();
+    /**
+     * Get the number of elements in the array.
+     */
+    int getSize() const {
+        return size;
+    }
+    /**
+     * Get the size of each element in bytes.
+     */
+    int getElementSize() const {
+        return elementSize;
+    }
+    /**
+     * Get the name of the array.
+     */
+    const std::string& getName() const {
+        return name;
+    }
+    /**
+     * Get a pointer to the device memory.
+     */
+    CUdeviceptr getDevicePointer() {
+        return pointer;
+    }
+    /**
+     * Copy the values in a vector to the device memory.
+     */
+    template <class T>
+    void upload(std::vector<T>& data) {
+        if (sizeof(T) != elementSize || data.size() != size)
+            throw OpenMMException("Error uploading array "+name+": The specified vector does not match the size of the array");
+        upload(&data[0], true);
+    }
+    /**
+     * Copy the values in the Buffer to a vector.
+     */
+    template <class T>
+    void download(std::vector<T>& data) const {
+        if (sizeof(T) != elementSize)
+            throw OpenMMException("Error downloading array "+name+": The specified vector has the wrong element size");
+        if (data.size() != size)
+            data.resize(size);
+        download(&data[0], true);
+    }
+    /**
+     * Copy the values in an array to the device memory.
+     * 
+     * @param data     the data to copy
+     * @param blocking if true, this call will block until the transfer is complete.  If false,
+     *                 the source array  must be in page-locked memory.
+     */
+    void upload(void* data, bool blocking = true);
+    /**
+     * Copy the values in the device memory to an array.
+     * 
+     * @param data     the array to copy the memory to
+     * @param blocking if true, this call will block until the transfer is complete.  If false,
+     *                 the destination array must be in page-locked memory.
+     */
+    void download(void* data, bool blocking = true) const;
+private:
+    CUdeviceptr pointer;
+    int size, elementSize;
+    bool ownsMemory;
+    std::string name;
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAARRAY_H_*/
--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#ifdef WIN32
+  #define _USE_MATH_DEFINES // Needed to get M_PI
+#endif
+#include <cmath>
+#include "CudaContext.h"
+#include "CudaArray.h"
+//#include "CudaBondedUtilities.h"
+#include "CudaExpressionUtilities.h"
+#include "CudaForceInfo.h"
+//#include "CudaIntegrationUtilities.h"
+#include "CudaKernelSources.h"
+//#include "CudaNonbondedUtilities.h"
+#include "hilbert.h"
+#include "openmm/OpenMMException.h"
+#include "openmm/Platform.h"
+#include "openmm/System.h"
+#include "openmm/VirtualSite.h"
+#include <algorithm>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <typeinfo>
+#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
+#define CHECK_RESULT2(result, prefix) \
+    if (result != CUDA_SUCCESS) { \
+        std::stringstream m; \
+        m<<prefix<<": "<<result<<" ("<<__FILE__<<": "<<__LINE__<<")"; \
+        throw OpenMMException(m.str());\
+    }
+using namespace OpenMM;
+using namespace std;
+const int CudaContext::ThreadBlockSize = 64;
+const int CudaContext::TileSize = 32;
+bool CudaContext::hasInitializedCuda = false;
+CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
+        const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
+        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), posq(NULL),
+        velm(NULL), /*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
+        bonded(NULL), nonbonded(NULL),*/ thread(NULL) {
+    if (!hasInitializedCuda) {
+        CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
+        hasInitializedCuda = true;
+    }
+    if (precision == "single") {
+        useDoublePrecision = false;
+        accumulateInDouble = false;
+    }
+    else if (precision == "mixed") {
+        useDoublePrecision = false;
+        accumulateInDouble = true;
+    }
+    else if (precision == "double") {
+        useDoublePrecision = true;
+        accumulateInDouble = true;
+    }
+    else
+        throw OpenMMException("Illegal value for CudaPrecision: "+precision);
+#ifdef WIN32
+    this->tempDir = tempDir+"\";
+#else
+    this->tempDir = tempDir+"/";
+#endif
+    contextIndex = platformData.contexts.size();
+    int numDevices;
+    string errorMessage = "Error initializing Context";
+    CHECK_RESULT(cuDeviceGetCount(&numDevices));
+    if (deviceIndex < 0 || deviceIndex >= numDevices) {
+        // Try to figure out which device is the fastest.
+        int bestSpeed = -1;
+        int bestCompute = -1;
+        for (int i = 0; i < numDevices; i++) {
+            CHECK_RESULT(cuDeviceGet(&device, i));
+            int major, minor, clock, multiprocessors;
+            CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
+            if (major == 1 && minor < 2)
+                continue; // 1.0 and 1.1 are not supported
+            CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
+            CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
+            int speed = clock*multiprocessors;
+            if (major > bestCompute || (major == bestCompute && speed > bestSpeed)) {
+                deviceIndex = i;
+                bestSpeed = speed;
+                bestCompute = major;
+            }
+        }
+    }
+    if (deviceIndex == -1)
+        throw OpenMMException("No compatible CUDA device is available");
+    CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
+    this->deviceIndex = deviceIndex;
+    int major, minor;
+    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
+    gpuArchitecture = CudaExpressionUtilities::intToString(major)+CudaExpressionUtilities::intToString(minor);
+    compilationDefines["WORK_GROUP_SIZE"] = CudaExpressionUtilities::intToString(ThreadBlockSize);
+    defaultOptimizationOptions = "--use_fast_math";
+    int numThreadBlocksPerComputeUnit = 6;
+    CHECK_RESULT(cuCtxCreate(&context, 0, device));
+    contextIsValid = true;
+    numAtoms = system.getNumParticles();
+    paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
+    numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
+    int multiprocessors;
+    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
+    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
+//    bonded = new CudaBondedUtilities(*this);
+//    nonbonded = new CudaNonbondedUtilities(*this);
+    posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
+    velm = CudaArray::create<float4>(paddedNumAtoms, "velm");
+    posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));
+    // Create utility kernels that are used in multiple places.
+    CUmodule utilities = createModule(CudaKernelSources::vectorOps+CudaKernelSources::utilities);
+    cuModuleGetFunction(&clearBufferKernel, utilities, "clearBuffer");
+    cuModuleGetFunction(&clearTwoBuffersKernel, utilities, "clearTwoBuffers");
+    cuModuleGetFunction(&clearThreeBuffersKernel, utilities, "clearThreeBuffers");
+    cuModuleGetFunction(&clearFourBuffersKernel, utilities, "clearFourBuffers");
+    cuModuleGetFunction(&clearFiveBuffersKernel, utilities, "clearFiveBuffers");
+    cuModuleGetFunction(&clearSixBuffersKernel, utilities, "clearSixBuffers");
+    cuModuleGetFunction(&reduceFloat4Kernel, utilities, "reduceFloat4Buffer");
+    cuModuleGetFunction(&reduceForcesKernel, utilities, "reduceForces");
+    // Set defines based on the requested precision.
+    compilationDefines["SQRT"] = useDoublePrecision ? "sqrt" : "sqrtf";
+    compilationDefines["RSQRT"] = useDoublePrecision ? "rsqrt" : "rsqrtf";
+    compilationDefines["RECIP"] = useDoublePrecision ? "1.0/" : "1.0f/";
+    compilationDefines["EXP"] = useDoublePrecision ? "exp" : "expf";
+    compilationDefines["LOG"] = useDoublePrecision ? "log" : "logf";
+    // Create the work thread used for parallelization when running on multiple devices.
+    thread = new WorkThread();
+//    
+//    // Create the integration utilities object.
+//    
+//    integration = new CudaIntegrationUtilities(*this, system);
+}
+CudaContext::~CudaContext() {
+    for (int i = 0; i < (int) forces.size(); i++)
+        delete forces[i];
+    for (int i = 0; i < (int) reorderListeners.size(); i++)
+        delete reorderListeners[i];
+    if (posq != NULL)
+        delete posq;
+    if (velm != NULL)
+        delete velm;
+//    if (force != NULL)
+//        delete force;
+//    if (forceBuffers != NULL)
+//        delete forceBuffers;
+//    if (longForceBuffer != NULL)
+//        delete longForceBuffer;
+//    if (energyBuffer != NULL)
+//        delete energyBuffer;
+//    if (atomIndex != NULL)
+//        delete atomIndex;
+//    if (integration != NULL)
+//        delete integration;
+//    if (bonded != NULL)
+//        delete bonded;
+//    if (nonbonded != NULL)
+//        delete nonbonded;
+    if (thread != NULL)
+        delete thread;
+    string errorMessage = "Error deleting Context";
+    if (contextIsValid)
+        CHECK_RESULT(cuCtxDestroy(context));
+}
+//void CudaContext::initialize() {
+//    for (int i = 0; i < numAtoms; i++) {
+//        double mass = system.getParticleMass(i);
+//        (*velm)[i].w = (float) (mass == 0.0 ? 0.0 : 1.0/mass);
+//    }
+//    velm->upload();
+//    bonded->initialize(system);
+//    numForceBuffers = platformData.contexts.size();
+//    numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
+//    for (int i = 0; i < (int) forces.size(); i++)
+//        numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
+//    forceBuffers = new CudaArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers", false);
+//    if (supports64BitGlobalAtomics) {
+//        longForceBuffer = new CudaArray<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer", false);
+//        reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
+//        reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
+//        reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
+//        reduceForcesKernel.setArg<cl_int>(3, numForceBuffers);
+//        addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
+//    }
+//    addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
+//    force = new CudaArray<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force", true);
+//    energyBuffer = new CudaArray<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer", true);
+//    addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
+//    atomIndex = new CudaArray<cl_int>(*this, paddedNumAtoms, "atomIndex", true);
+//    for (int i = 0; i < paddedNumAtoms; ++i)
+//        (*atomIndex)[i] = i;
+//    atomIndex->upload();
+//    findMoleculeGroups();
+//    moleculesInvalid = false;
+//    nonbonded->initialize(system);
+//}
+void CudaContext::addForce(CudaForceInfo* force) {
+    forces.push_back(force);
+}
+string CudaContext::replaceStrings(const string& input, const std::map<std::string, std::string>& replacements) const {
+    string result = input;
+    for (map<string, string>::const_iterator iter = replacements.begin(); iter != replacements.end(); iter++) {
+        int index = -1;
+        do {
+            index = result.find(iter->first);
+            if (index != result.npos)
+                result.replace(index, iter->first.size(), iter->second);
+        } while (index != result.npos);
+    }
+    return result;
+}
+CUmodule CudaContext::createModule(const string source, const char* optimizationFlags) {
+    return createModule(source, map<string, string>(), optimizationFlags);
+}
+CUmodule CudaContext::createModule(const string source, const map<string, string>& defines, const char* optimizationFlags) {
+    string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
+    stringstream src;
+    if (!options.empty())
+        src << "// Compilation Options: " << options << endl << endl;
+    for (map<string, string>::const_iterator iter = compilationDefines.begin(); iter != compilationDefines.end(); ++iter) {
+        src << "#define " << iter->first;
+        if (!iter->second.empty())
+            src << " " << iter->second;
+        src << endl;
+    }
+    if (!compilationDefines.empty())
+        src << endl;
+    for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
+        src << "#define " << iter->first;
+        if (!iter->second.empty())
+            src << " " << iter->second;
+        src << endl;
+    }
+    if (!defines.empty())
+        src << endl;
+    src << source << endl;
+    // Write out the source to a temporary file.
+    stringstream tempFileName;
+    tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
+    string inputFile = (tempDir+tempFileName.str()+".cu");
+    string outputFile = (tempDir+tempFileName.str()+".ptx");
+    string logFile = (tempDir+tempFileName.str()+".log");
+    ofstream out(inputFile.c_str());
+    out << src.str();
+    out.close();
+#ifdef WIN32
+#else
+    string command = "\""+compiler+"\" --ptx -arch=compute_"+gpuArchitecture+" -o \""+outputFile+"\" "+options+" \""+inputFile+"\" 2> \""+logFile+"\"";
+    int res = std::system(command.c_str());
+#endif
+    try {
+        if (res != 0) {
+            // Load the error log.
+            stringstream error;
+            error << "Error launching CUDA compiler: " << res;
+            ifstream log(logFile.c_str());
+            if (log.is_open()) {
+                string line;
+                while (!log.eof()) {
+                    getline(log, line);
+                    error << '\n' << line;
+                }
+                log.close();
+            }
+            throw OpenMMException(error.str());
+        }
+        CUmodule module;
+        CUresult result = cuModuleLoad(&module, outputFile.c_str());
+        if (result != CUDA_SUCCESS) {
+            std::stringstream m;
+            m<<"Error loading CUDA module: "<<result;
+            throw OpenMMException(m.str());
+        }
+        remove(inputFile.c_str());
+        remove(outputFile.c_str());
+        remove(logFile.c_str());
+        return module;
+    }
+    catch (...) {
+        remove(inputFile.c_str());
+        remove(outputFile.c_str());
+        remove(logFile.c_str());
+        throw;
+    }
+//    
+//    // Get length before using c_str() to avoid length() call invalidating the c_str() value.
+//    string src_string = src.str();
+//    ::size_t src_length = src_string.length();
+//    cl::Program::Sources sources(1, make_pair(src_string.c_str(), src_length));
+//    cl::Program program(context, sources);
+//    try {
+//        program.build(vector<cl::Device>(1, device), options.c_str());
+//    } catch (cl::Error err) {
+//        throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
+//    }
+}
+//
+//void CudaContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) {
+//    if (blockSize == -1)
+//        blockSize = ThreadBlockSize;
+//    int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize;
+//    try {
+//        queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
+//    }
+//    catch (cl::Error err) {
+//        stringstream str;
+//        str<<"Error invoking kernel "<<kernel.getInfo<CL_KERNEL_FUNCTION_NAME>()<<": "<<err.what()<<" ("<<err.err()<<")";
+//        throw OpenMMException(str.str());
+//    }
+//}
+//
+//void CudaContext::clearBuffer(CudaArray<float>& array) {
+//    clearBuffer(array.getDeviceBuffer(), array.getSize());
+//}
+//
+//void CudaContext::clearBuffer(CudaArray<mm_float4>& array) {
+//    clearBuffer(array.getDeviceBuffer(), array.getSize()*4);
+//}
+//
+//void CudaContext::clearBuffer(cl::Memory& memory, int size) {
+//    clearBufferKernel.setArg<cl::Memory>(0, memory);
+//    clearBufferKernel.setArg<cl_int>(1, size);
+//    executeKernel(clearBufferKernel, size, 128);
+//}
+//
+//void CudaContext::addAutoclearBuffer(cl::Memory& memory, int size) {
+//    autoclearBuffers.push_back(&memory);
+//    autoclearBufferSizes.push_back(size);
+//}
+//
+//void CudaContext::clearAutoclearBuffers() {
+//    int base = 0;
+//    int total = autoclearBufferSizes.size();
+//    while (total-base >= 6) {
+//        clearSixBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
+//        clearSixBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
+//        clearSixBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
+//        clearSixBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
+//        clearSixBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
+//        clearSixBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
+//        clearSixBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
+//        clearSixBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
+//        clearSixBuffersKernel.setArg<cl::Memory>(8, *autoclearBuffers[base+4]);
+//        clearSixBuffersKernel.setArg<cl_int>(9, autoclearBufferSizes[base+4]);
+//        clearSixBuffersKernel.setArg<cl::Memory>(10, *autoclearBuffers[base+5]);
+//        clearSixBuffersKernel.setArg<cl_int>(11, autoclearBufferSizes[base+5]);
+//        executeKernel(clearSixBuffersKernel, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128);
+//        base += 6;
+//    }
+//    if (total-base == 5) {
+//        clearFiveBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
+//        clearFiveBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
+//        clearFiveBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
+//        clearFiveBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
+//        clearFiveBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
+//        clearFiveBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
+//        clearFiveBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
+//        clearFiveBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
+//        clearFiveBuffersKernel.setArg<cl::Memory>(8, *autoclearBuffers[base+4]);
+//        clearFiveBuffersKernel.setArg<cl_int>(9, autoclearBufferSizes[base+4]);
+//        executeKernel(clearFiveBuffersKernel, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128);
+//    }
+//    else if (total-base == 4) {
+//        clearFourBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
+//        clearFourBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
+//        clearFourBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
+//        clearFourBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
+//        clearFourBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
+//        clearFourBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
+//        clearFourBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
+//        clearFourBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
+//        executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
+//    }
+//    else if (total-base == 3) {
+//        clearThreeBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
+//        clearThreeBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
+//        clearThreeBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
+//        clearThreeBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
+//        clearThreeBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
+//        clearThreeBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
+//        executeKernel(clearThreeBuffersKernel, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
+//    }
+//    else if (total-base == 2) {
+//        clearTwoBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
+//        clearTwoBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
+//        clearTwoBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
+//        clearTwoBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
+//        executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
+//    }
+//    else if (total-base == 1) {
+//        clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
+//    }
+//}
+//
+//void CudaContext::reduceForces() {
+//    if (supports64BitGlobalAtomics)
+//        executeKernel(reduceForcesKernel, paddedNumAtoms, 128);
+//    else
+//        reduceBuffer(*forceBuffers, numForceBuffers);
+//}
+//
+//void CudaContext::reduceBuffer(CudaArray<mm_float4>& array, int numBuffers) {
+//    int bufferSize = array.getSize()/numBuffers;
+//    reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
+//    reduceFloat4Kernel.setArg<cl_int>(1, bufferSize);
+//    reduceFloat4Kernel.setArg<cl_int>(2, numBuffers);
+//    executeKernel(reduceFloat4Kernel, bufferSize, 128);
+//}
+//
+//void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
+//    // Recursively tag atoms as belonging to a particular molecule.
+//
+//    atomMolecule[atom] = molecule;
+//    for (int i = 0; i < (int) atomBonds[atom].size(); i++)
+//        if (atomMolecule[atomBonds[atom][i]] == -1)
+//            tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
+//}
+//
+///**
+// * This class ensures that atom reordering doesn't break virtual sites.
+// */
+//class CudaContext::VirtualSiteInfo : public CudaForceInfo {
+//public:
+//    VirtualSiteInfo(const System& system) : CudaForceInfo(0) {
+//        for (int i = 0; i < system.getNumParticles(); i++) {
+//            if (system.isVirtualSite(i)) {
+//                siteTypes.push_back(&typeid(system.getVirtualSite(i)));
+//                vector<int> particles;
+//                particles.push_back(i);
+//                for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
+//                    particles.push_back(system.getVirtualSite(i).getParticle(j));
+//                siteParticles.push_back(particles);
+//                vector<double> weights;
+//                if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
+//                    // A two particle average.
+//
+//                    const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
+//                    weights.push_back(site.getWeight(0));
+//                    weights.push_back(site.getWeight(1));
+//                }
+//                else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
+//                    // A three particle average.
+//
+//                    const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
+//                    weights.push_back(site.getWeight(0));
+//                    weights.push_back(site.getWeight(1));
+//                    weights.push_back(site.getWeight(2));
+//                }
+//                else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
+//                    // An out of plane site.
+//
+//                    const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
+//                    weights.push_back(site.getWeight12());
+//                    weights.push_back(site.getWeight13());
+//                    weights.push_back(site.getWeightCross());
+//                }
+//                siteWeights.push_back(weights);
+//            }
+//        }
+//    }
+//    int getNumParticleGroups() {
+//        return siteTypes.size();
+//    }
+//    void getParticlesInGroup(int index, std::vector<int>& particles) {
+//        particles = siteParticles[index];
+//    }
+//    bool areGroupsIdentical(int group1, int group2) {
+//        if (siteTypes[group1] != siteTypes[group2])
+//            return false;
+//        int numParticles = siteWeights[group1].size();
+//        if (siteWeights[group2].size() != numParticles)
+//            return false;
+//        for (int i = 0; i < numParticles; i++)
+//            if (siteWeights[group1][i] != siteWeights[group2][i])
+//                return false;
+//        return true;
+//    }
+//private:
+//    vector<const type_info*> siteTypes;
+//    vector<vector<int> > siteParticles;
+//    vector<vector<double> > siteWeights;
+//};
+//
+//
+//void CudaContext::findMoleculeGroups() {
+//    // The first time this is called, we need to identify all the molecules in the system.
+//    
+//    if (moleculeGroups.size() == 0) {
+//        // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
+//
+//        addForce(new VirtualSiteInfo(system));
+//
+//        // First make a list of every other atom to which each atom is connect by a constraint or force group.
+//
+//        vector<vector<int> > atomBonds(system.getNumParticles());
+//        for (int i = 0; i < system.getNumConstraints(); i++) {
+//            int particle1, particle2;
+//            double distance;
+//            system.getConstraintParameters(i, particle1, particle2, distance);
+//            atomBonds[particle1].push_back(particle2);
+//            atomBonds[particle2].push_back(particle1);
+//        }
+//        for (int i = 0; i < (int) forces.size(); i++) {
+//            for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
+//                vector<int> particles;
+//                forces[i]->getParticlesInGroup(j, particles);
+//                for (int k = 0; k < (int) particles.size(); k++)
+//                    for (int m = 0; m < (int) particles.size(); m++)
+//                        if (k != m)
+//                            atomBonds[particles[k]].push_back(particles[m]);
+//            }
+//        }
+//
+//        // Now tag atoms by which molecule they belong to.
+//
+//        vector<int> atomMolecule(numAtoms, -1);
+//        int numMolecules = 0;
+//        for (int i = 0; i < numAtoms; i++)
+//            if (atomMolecule[i] == -1)
+//                tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
+//        vector<vector<int> > atomIndices(numMolecules);
+//        for (int i = 0; i < numAtoms; i++)
+//            atomIndices[atomMolecule[i]].push_back(i);
+//
+//        // Construct a description of each molecule.
+//
+//        molecules.resize(numMolecules);
+//        for (int i = 0; i < numMolecules; i++) {
+//            molecules[i].atoms = atomIndices[i];
+//            molecules[i].groups.resize(forces.size());
+//        }
+//        for (int i = 0; i < system.getNumConstraints(); i++) {
+//            int particle1, particle2;
+//            double distance;
+//            system.getConstraintParameters(i, particle1, particle2, distance);
+//            molecules[atomMolecule[particle1]].constraints.push_back(i);
+//        }
+//        for (int i = 0; i < (int) forces.size(); i++)
+//            for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
+//                vector<int> particles;
+//                forces[i]->getParticlesInGroup(j, particles);
+//                molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
+//            }
+//    }
+//
+//    // Sort them into groups of identical molecules.
+//
+//    vector<Molecule> uniqueMolecules;
+//    vector<vector<int> > moleculeInstances;
+//    vector<vector<int> > moleculeOffsets;
+//    for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
+//        Molecule& mol = molecules[molIndex];
+//
+//        // See if it is identical to another molecule.
+//
+//        bool isNew = true;
+//        for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
+//            Molecule& mol2 = uniqueMolecules[j];
+//            bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
+//
+//            // See if the atoms are identical.
+//
+//            int atomOffset = mol2.atoms[0]-mol.atoms[0];
+//            for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
+//                if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
+//                    identical = false;
+//                for (int k = 0; k < (int) forces.size(); k++)
+//                    if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
+//                        identical = false;
+//            }
+//            
+//            // See if the constraints are identical.
+//
+//            for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
+//                int c1particle1, c1particle2, c2particle1, c2particle2;
+//                double distance1, distance2;
+//                system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
+//                system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
+//                if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
+//                    identical = false;
+//            }
+//
+//            // See if the force groups are identical.
+//
+//            for (int i = 0; i < (int) forces.size() && identical; i++) {
+//                if (mol.groups[i].size() != mol2.groups[i].size())
+//                    identical = false;
+//                for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
+//                    if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
+//                        identical = false;
+//            }
+//            if (identical) {
+//                moleculeInstances[j].push_back(molIndex);
+//                moleculeOffsets[j].push_back(mol.atoms[0]);
+//                isNew = false;
+//            }
+//        }
+//        if (isNew) {
+//            uniqueMolecules.push_back(mol);
+//            moleculeInstances.push_back(vector<int>());
+//            moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
+//            moleculeOffsets.push_back(vector<int>());
+//            moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
+//        }
+//    }
+//    moleculeGroups.resize(moleculeInstances.size());
+//    for (int i = 0; i < (int) moleculeInstances.size(); i++)
+//    {
+//        moleculeGroups[i].instances = moleculeInstances[i];
+//        moleculeGroups[i].offsets = moleculeOffsets[i];
+//        vector<int>& atoms = uniqueMolecules[i].atoms;
+//        moleculeGroups[i].atoms.resize(atoms.size());
+//        for (int j = 0; j < (int) atoms.size(); j++)
+//            moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
+//    }
+//}
+//
+//void CudaContext::invalidateMolecules() {
+//    moleculesInvalid = true;
+//}
+//
+//
+//void OpenCLContext::validateMolecules() {
+//    moleculesInvalid = false;
+//    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
+//        return;
+//    bool valid = true;
+//    for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
+//        MoleculeGroup& mol = moleculeGroups[group];
+//        vector<int>& instances = mol.instances;
+//        vector<int>& offsets = mol.offsets;
+//        vector<int>& atoms = mol.atoms;
+//        int numMolecules = instances.size();
+//        Molecule& m1 = molecules[instances[0]];
+//        int offset1 = offsets[0];
+//        for (int j = 1; valid && j < numMolecules; j++) {
+//            // See if the atoms are identical.
+//
+//            Molecule& m2 = molecules[instances[j]];
+//            int offset2 = offsets[j];
+//            for (int i = 0; i < (int) atoms.size() && valid; i++) {
+//                for (int k = 0; k < (int) forces.size(); k++)
+//                    if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
+//                        valid = false;
+//            }
+//
+//            // See if the force groups are identical.
+//
+//            for (int i = 0; i < (int) forces.size() && valid; i++) {
+//                for (int k = 0; k < (int) m1.groups[i].size() && valid; k++)
+//                    if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k]))
+//                        valid = false;
+//            }
+//        }
+//    }
+//    if (valid)
+//        return;
+//    
+//    // The list of which molecules are identical is no longer valid.  We need to restore the
+//    // atoms to their original order, rebuild the list of identical molecules, and sort them
+//    // again.
+//    
+//    vector<mm_float4> newPosq(numAtoms);
+//    vector<mm_float4> newVelm(numAtoms);
+//    vector<mm_int4> newCellOffsets(numAtoms);
+//    posq->download();
+//    velm->download();
+//    for (int i = 0; i < numAtoms; i++) {
+//        int index = atomIndex->get(i);
+//        newPosq[index] = posq->get(i);
+//        newVelm[index] = velm->get(i);
+//        newCellOffsets[index] = posCellOffsets[i];
+//    }
+//    for (int i = 0; i < numAtoms; i++) {
+//        posq->set(i, newPosq[i]);
+//        velm->set(i, newVelm[i]);
+//        atomIndex->set(i, i);
+//        posCellOffsets[i] = newCellOffsets[i];
+//    }
+//    posq->upload();
+//    velm->upload();
+//    atomIndex->upload();
+//    findMoleculeGroups();
+//    for (int i = 0; i < (int) reorderListeners.size(); i++)
+//        reorderListeners[i]->execute();
+//}
+//
+//void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
+//    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
+//        return;
+//    if (moleculesInvalid)
+//        validateMolecules();
+//    atomsWereReordered = true;
+//
+//    // Find the range of positions and the number of bins along each axis.
+//
+//    posq->download();
+//    velm->download();
+//    float minx = posq->get(0).x, maxx = posq->get(0).x;
+//    float miny = posq->get(0).y, maxy = posq->get(0).y;
+//    float minz = posq->get(0).z, maxz = posq->get(0).z;
+//    if (nonbonded->getUsePeriodic()) {
+//        minx = miny = minz = 0.0;
+//        maxx = periodicBoxSize.x;
+//        maxy = periodicBoxSize.y;
+//        maxz = periodicBoxSize.z;
+//    }
+//    else {
+//        for (int i = 1; i < numAtoms; i++) {
+//            const mm_float4& pos = posq->get(i);
+//            minx = min(minx, pos.x);
+//            maxx = max(maxx, pos.x);
+//            miny = min(miny, pos.y);
+//            maxy = max(maxy, pos.y);
+//            minz = min(minz, pos.z);
+//            maxz = max(maxz, pos.z);
+//        }
+//    }
+//
+//    // Loop over each group of identical molecules and reorder them.
+//
+//    vector<int> originalIndex(numAtoms);
+//    vector<mm_float4> newPosq(numAtoms);
+//    vector<mm_float4> newVelm(numAtoms);
+//    vector<mm_int4> newCellOffsets(numAtoms);
+//    for (int group = 0; group < (int) moleculeGroups.size(); group++) {
+//        // Find the center of each molecule.
+//
+//        MoleculeGroup& mol = moleculeGroups[group];
+//        int numMolecules = mol.offsets.size();
+//        vector<int>& atoms = mol.atoms;
+//        vector<mm_float4> molPos(numMolecules);
+//        float invNumAtoms = 1.0f/atoms.size();
+//        for (int i = 0; i < numMolecules; i++) {
+//            molPos[i].x = 0.0f;
+//            molPos[i].y = 0.0f;
+//            molPos[i].z = 0.0f;
+//            for (int j = 0; j < (int)atoms.size(); j++) {
+//                int atom = atoms[j]+mol.offsets[i];
+//                const mm_float4& pos = posq->get(atom);
+//                molPos[i].x += pos.x;
+//                molPos[i].y += pos.y;
+//                molPos[i].z += pos.z;
+//            }
+//            molPos[i].x *= invNumAtoms;
+//            molPos[i].y *= invNumAtoms;
+//            molPos[i].z *= invNumAtoms;
+//        }
+//        if (nonbonded->getUsePeriodic()) {
+//            // Move each molecule position into the same box.
+//
+//            for (int i = 0; i < numMolecules; i++) {
+//                int xcell = (int) floor(molPos[i].x*invPeriodicBoxSize.x);
+//                int ycell = (int) floor(molPos[i].y*invPeriodicBoxSize.y);
+//                int zcell = (int) floor(molPos[i].z*invPeriodicBoxSize.z);
+//                float dx = xcell*periodicBoxSize.x;
+//                float dy = ycell*periodicBoxSize.y;
+//                float dz = zcell*periodicBoxSize.z;
+//                if (dx != 0.0f || dy != 0.0f || dz != 0.0f) {
+//                    molPos[i].x -= dx;
+//                    molPos[i].y -= dy;
+//                    molPos[i].z -= dz;
+//                    if (enforcePeriodic) {
+//                        for (int j = 0; j < (int) atoms.size(); j++) {
+//                            int atom = atoms[j]+mol.offsets[i];
+//                            mm_float4 p = posq->get(atom);
+//                            p.x -= dx;
+//                            p.y -= dy;
+//                            p.z -= dz;
+//                            posq->set(atom, p);
+//                            posCellOffsets[atom].x -= xcell;
+//                            posCellOffsets[atom].y -= ycell;
+//                            posCellOffsets[atom].z -= zcell;
+//                        }
+//                    }
+//                }
+//            }
+//        }
+//
+//        // Select a bin for each molecule, then sort them by bin.
+//
+//        bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
+//        float binWidth;
+//        if (useHilbert)
+//            binWidth = (float)(max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
+//        else
+//            binWidth = (float)(0.2*nonbonded->getCutoffDistance());
+//        float invBinWidth = 1.0f/binWidth;
+//        int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
+//        int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
+//        vector<pair<int, int> > molBins(numMolecules);
+//        bitmask_t coords[3];
+//        for (int i = 0; i < numMolecules; i++) {
+//            int x = (int) ((molPos[i].x-minx)*invBinWidth);
+//            int y = (int) ((molPos[i].y-miny)*invBinWidth);
+//            int z = (int) ((molPos[i].z-minz)*invBinWidth);
+//            int bin;
+//            if (useHilbert) {
+//                coords[0] = x;
+//                coords[1] = y;
+//                coords[2] = z;
+//                bin = (int) hilbert_c2i(3, 8, coords);
+//            }
+//            else {
+//                int yodd = y&1;
+//                int zodd = z&1;
+//                bin = z*xbins*ybins;
+//                bin += (zodd ? ybins-y : y)*xbins;
+//                bin += (yodd ? xbins-x : x);
+//            }
+//            molBins[i] = pair<int, int>(bin, i);
+//        }
+//        sort(molBins.begin(), molBins.end());
+//
+//        // Reorder the atoms.
+//
+//        for (int i = 0; i < numMolecules; i++) {
+//            for (int j = 0; j < (int)atoms.size(); j++) {
+//                int oldIndex = mol.offsets[molBins[i].second]+atoms[j];
+//                int newIndex = mol.offsets[i]+atoms[j];
+//                originalIndex[newIndex] = atomIndex->get(oldIndex);
+//                newPosq[newIndex] = posq->get(oldIndex);
+//                newVelm[newIndex] = velm->get(oldIndex);
+//                newCellOffsets[newIndex] = posCellOffsets[oldIndex];
+//            }
+//        }
+//    }
+//
+//    // Update the streams.
+//
+//    for (int i = 0; i < numAtoms; i++) {
+//        posq->set(i, newPosq[i]);
+//        velm->set(i, newVelm[i]);
+//        atomIndex->set(i, originalIndex[i]);
+//        posCellOffsets[i] = newCellOffsets[i];
+//    }
+//    posq->upload();
+//    velm->upload();
+//    atomIndex->upload();
+//    for (int i = 0; i < (int) reorderListeners.size(); i++)
+//        reorderListeners[i]->execute();
+//}
+struct CudaContext::WorkThread::ThreadData {
+    ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting,  bool& finished,
+            pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :
+        tasks(tasks), waiting(waiting), finished(finished), queueLock(queueLock),
+        waitForTaskCondition(waitForTaskCondition), queueEmptyCondition(queueEmptyCondition) {
+    }
+    std::queue<CudaContext::WorkTask*>& tasks;
+    bool& waiting;
+    bool& finished;
+    pthread_mutex_t& queueLock;
+    pthread_cond_t& waitForTaskCondition;
+    pthread_cond_t& queueEmptyCondition;
+};
+static void* threadBody(void* args) {
+    CudaContext::WorkThread::ThreadData& data = *reinterpret_cast<CudaContext::WorkThread::ThreadData*>(args);
+    while (!data.finished || data.tasks.size() > 0) {
+        pthread_mutex_lock(&data.queueLock);
+        while (data.tasks.empty() && !data.finished) {
+            data.waiting = true;
+            pthread_cond_signal(&data.queueEmptyCondition);
+            pthread_cond_wait(&data.waitForTaskCondition, &data.queueLock);
+        }
+        CudaContext::WorkTask* task = NULL;
+        if (!data.tasks.empty()) {
+            data.waiting = false;
+            task = data.tasks.front();
+            data.tasks.pop();
+        }
+        pthread_mutex_unlock(&data.queueLock);
+        if (task != NULL) {
+            task->execute();
+            delete task;
+        }
+    }
+    data.waiting = true;
+    pthread_cond_signal(&data.queueEmptyCondition);
+    delete &data;
+    return 0;
+}
+CudaContext::WorkThread::WorkThread() : waiting(true), finished(false) {
+    pthread_mutex_init(&queueLock, NULL);
+    pthread_cond_init(&waitForTaskCondition, NULL);
+    pthread_cond_init(&queueEmptyCondition, NULL);
+    ThreadData* data = new ThreadData(tasks, waiting, finished, queueLock, waitForTaskCondition, queueEmptyCondition);
+    pthread_create(&thread, NULL, threadBody, data);
+}
+CudaContext::WorkThread::~WorkThread() {
+    pthread_mutex_lock(&queueLock);
+    finished = true;
+    pthread_cond_broadcast(&waitForTaskCondition);
+    pthread_mutex_unlock(&queueLock);
+    pthread_join(thread, NULL);
+    pthread_mutex_destroy(&queueLock);
+    pthread_cond_destroy(&waitForTaskCondition);
+    pthread_cond_destroy(&queueEmptyCondition);
+}
+void CudaContext::WorkThread::addTask(CudaContext::WorkTask* task) {
+    pthread_mutex_lock(&queueLock);
+    tasks.push(task);
+    waiting = false;
+    pthread_cond_signal(&waitForTaskCondition);
+    pthread_mutex_unlock(&queueLock);
+}
+bool CudaContext::WorkThread::isWaiting() {
+    return waiting;
+}
+bool CudaContext::WorkThread::isFinished() {
+    return finished;
+}
+void CudaContext::WorkThread::flush() {
+    pthread_mutex_lock(&queueLock);
+    while (!waiting)
+       pthread_cond_wait(&queueEmptyCondition, &queueLock);
+    pthread_mutex_unlock(&queueLock);
+}
--- a/platforms/cuda2/src/CudaContext.h
+++ b/platforms/cuda2/src/CudaContext.h
+#ifndef OPENMM_CUDACONTEXT_H_
+#define OPENMM_CUDACONTEXT_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include <map>
+#include <queue>
+#include <string>
+#include <pthread.h>
+#define __CL_ENABLE_EXCEPTIONS
+#ifdef _MSC_VER
+    // Prevent Windows from defining macros that interfere with other code.
+    #define NOMINMAX
+#endif
+#include <cuda.h>
+#include <builtin_types.h>
+#include <vector_functions.h>
+#include "openmm/internal/windowsExport.h"
+#include "CudaPlatform.h"
+namespace OpenMM {
+class CudaArray;
+class CudaForceInfo;
+class CudaIntegrationUtilities;
+class CudaBondedUtilities;
+class CudaNonbondedUtilities;
+class System;
+/**
+ * This class contains the information associated with a Context by the CUDA Platform.  Each CudaContext is
+ * specific to a particular device, and manages data structures and kernels for that device.  When running a simulation
+ * in parallel on multiple devices, there is a separate CudaContext for each one.  The list of all contexts is
+ * stored in the CudaPlatform::PlatformData.
+ * <p>
+ * In addition, a worker thread is created for each CudaContext.  This is used for parallel computations, so that
+ * blocking calls to one device will not block other devices.  When only a single device is being used, the worker
+ * thread is not used and calculations are performed on the main application thread.
+ */
+class OPENMM_EXPORT CudaContext {
+public:
+    class WorkTask;
+    class WorkThread;
+    class ReorderListener;
+    static const int ThreadBlockSize;
+    static const int TileSize;
+    CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
+            const std::string& compiler, const std::string& tempDir, CudaPlatform::PlatformData& platformData);
+    ~CudaContext();
+//    /**
+//     * This is called to initialize internal data structures after all Forces in the system
+//     * have been initialized.
+//     */
+//    void initialize();
+    /**
+     * Add a CudaForce to this context.
+     */
+    void addForce(CudaForceInfo* force);
+    /**
+     * Get the CUcontext associated with this object.
+     */
+    CUcontext getContext() {
+        return context;
+    }
+    /**
+     * Get the CUdevice associated with this object.
+     */
+    CUdevice getDevice() {
+        return device;
+    }
+    /**
+     * Get the index of the CUdevice associated with this object.
+     */
+    int getDeviceIndex() {
+        return deviceIndex;
+    }
+    /**
+     * Get the PlatformData object this context is part of.
+     */
+    CudaPlatform::PlatformData& getPlatformData() {
+        return platformData;
+    }
+    /**
+     * Get the index of this context in the list stored in the PlatformData.
+     */
+    int getContextIndex() const {
+        return contextIndex;
+    }
+    /**
+     * Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
+     */
+    CudaArray& getPosq() {
+        return *posq;
+    }
+    /**
+     * Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
+     */
+    CudaArray& getVelm() {
+        return *velm;
+    }
+//    /**
+//     * Get the array which contains the force on each atom.
+//     */
+//    CudaArray<mm_float4>& getForce() {
+//        return *force;
+//    }
+//    /**
+//     * Get the array which contains the buffers in which forces are computed.
+//     */
+//    CudaArray<mm_float4>& getForceBuffers() {
+//        return *forceBuffers;
+//    }
+//    /**
+//     * Get the array which contains a contribution to each force represented as 64 bit fixed point.
+//     */
+//    CudaArray<cl_long>& getLongForceBuffer() {
+//        return *longForceBuffer;
+//    }
+//    /**
+//     * Get the array which contains the buffer in which energy is computed.
+//     */
+//    CudaArray<cl_float>& getEnergyBuffer() {
+//        return *energyBuffer;
+//    }
+//    /**
+//     * Get the array which contains the index of each atom.
+//     */
+//    CudaArray<cl_int>& getAtomIndex() {
+//        return *atomIndex;
+//    }
+//    /**
+//     * Get the number of cells by which the positions are offset.
+//     */
+//    std::vector<mm_int4>& getPosCellOffsets() {
+//        return posCellOffsets;
+//    }
+    /**
+     * Replace all occurrences of a list of substrings.
+     *
+     * @param input   a string to process
+     * @param replacements a set of strings that should be replaced with new strings wherever they appear in the input string
+     * @return a new string produced by performing the replacements
+     */
+    std::string replaceStrings(const std::string& input, const std::map<std::string, std::string>& replacements) const;
+    /**
+     * Create a CUDA module from source code.
+     *
+     * @param source             the source code of the module
+     * @param optimizationFlags  the optimization flags to pass to the CUDA compiler.  If this is
+     *                           omitted, a default set of options will be used
+     */
+    CUmodule createModule(const std::string source, const char* optimizationFlags = NULL);
+    /**
+     * Create a CUDA module from source code.
+     *
+     * @param source             the source code of the module
+     * @param defines            a set of preprocessor definitions (name, value) to define when compiling the program
+     * @param optimizationFlags  the optimization flags to pass to the CUDA compiler.  If this is
+     *                           omitted, a default set of options will be used
+     */
+    CUmodule createModule(const std::string source, const std::map<std::string, std::string>& defines, const char* optimizationFlags = NULL);
+//    /**
+//     * Execute a kernel.
+//     *
+//     * @param kernel       the kernel to execute
+//     * @param workUnits    the maximum number of work units that should be used
+//     * @param blockSize    the size of each thread block to use
+//     */
+//    void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
+//    /**
+//     * Set all elements of an array to 0.
+//     */
+//    void clearBuffer(CudaArray<float>& array);
+//    /**
+//     * Set all elements of an array to 0.
+//     */
+//    void clearBuffer(CudaArray<mm_float4>& array);
+//    /**
+//     * Set all elements of an array to 0.
+//     *
+//     * @param memory     the Memory to clear
+//     * @param size       the number of float elements in the buffer
+//     */
+//    void clearBuffer(cl::Memory& memory, int size);
+//    /**
+//     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
+//     *
+//     * @param memory     the Memory to clear
+//     * @param size       the number of float elements in the buffer
+//     */
+//    void addAutoclearBuffer(cl::Memory& memory, int size);
+//    /**
+//     * Clear all buffers that have been registered with addAutoclearBuffer().
+//     */
+//    void clearAutoclearBuffers();
+//    /**
+//     * Given a collection of buffers packed into an array, sum them and store
+//     * the sum in the first buffer.
+//     *
+//     * @param array       the array containing the buffers to reduce
+//     * @param numBuffers  the number of buffers packed into the array
+//     */
+//    void reduceBuffer(CudaArray<mm_float4>& array, int numBuffers);
+//    /**
+//     * Sum the buffesr containing forces.
+//     */
+//    void reduceForces();
+//    /**
+//     * Get the current simulation time.
+//     */
+//    double getTime() {
+//        return time;
+//    }
+//    /**
+//     * Set the current simulation time.
+//     */
+//    void setTime(double t) {
+//        time = t;
+//    }
+//    /**
+//     * Get the number of integration steps that have been taken.
+//     */
+//    int getStepCount() {
+//        return stepCount;
+//    }
+//    /**
+//     * Set the number of integration steps that have been taken.
+//     */
+//    void setStepCount(int steps) {
+//        stepCount = steps;
+//    }
+//    /**
+//     * Get the number of times forces or energy has been computed.
+//     */
+//    int getComputeForceCount() {
+//        return computeForceCount;
+//    }
+//    /**
+//     * Set the number of times forces or energy has been computed.
+//     */
+//    void setComputeForceCount(int count) {
+//        computeForceCount = count;
+//    }
+//    /**
+//     * Get the number of atoms.
+//     */
+//    int getNumAtoms() const {
+//        return numAtoms;
+//    }
+//    /**
+//     * Get the number of atoms, rounded up to a multiple of TileSize.  This is the actual size of
+//     * most arrays with one element per atom.
+//     */
+//    int getPaddedNumAtoms() const {
+//        return paddedNumAtoms;
+//    }
+//    /**
+//     * Get the number of blocks of TileSize atoms.
+//     */
+//    int getNumAtomBlocks() const {
+//        return numAtomBlocks;
+//    }
+//    /**
+//     * Get the standard number of thread blocks to use when executing kernels.
+//     */
+//    int getNumThreadBlocks() const {
+//        return numThreadBlocks;
+//    }
+//    /**
+//     * Get the number of force buffers.
+//     */
+//    int getNumForceBuffers() const {
+//        return numForceBuffers;
+//    }
+//    /**
+//     * Get the SIMD width of the device being used.
+//     */
+//    int getSIMDWidth() const {
+//        return simdWidth;
+//    }
+//    /**
+//     * Get whether the device being used supports 64 bit atomic operations on global memory.
+//     */
+//    bool getSupports64BitGlobalAtomics() {
+//        return supports64BitGlobalAtomics;
+//    }
+//    /**
+//     * Get whether the device being used supports double precision math.
+//     */
+//    bool getSupportsDoublePrecision() {
+//        return supportsDoublePrecision;
+//    }
+//    /**
+//     * Get the size of the periodic box.
+//     */
+//    mm_float4 getPeriodicBoxSize() const {
+//        return periodicBoxSize;
+//    }
+//    /**
+//     * Set the size of the periodic box.
+//     */
+//    void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
+//        periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0);
+//        invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
+//    }
+//    /**
+//     * Get the inverse of the size of the periodic box.
+//     */
+//    mm_float4 getInvPeriodicBoxSize() const {
+//        return invPeriodicBoxSize;
+//    }
+//    /**
+//     * Get the CudaIntegrationUtilities for this context.
+//     */
+//    CudaIntegrationUtilities& getIntegrationUtilities() {
+//        return *integration;
+//    }
+//    /**
+//     * Get the CudaBondedUtilities for this context.
+//     */
+//    CudaBondedUtilities& getBondedUtilities() {
+//        return *bonded;
+//    }
+//    /**
+//     * Get the CudaNonbondedUtilities for this context.
+//     */
+//    CudaNonbondedUtilities& getNonbondedUtilities() {
+//        return *nonbonded;
+//    }
+//    /**
+//     * Get the thread used by this context for executing parallel computations.
+//     */
+//    WorkThread& getWorkThread() {
+//        return *thread;
+//    }
+//    /**
+//     * Get whether atoms were reordered during the most recent force/energy computation.
+//     */
+//    bool getAtomsWereReordered() const {
+//        return atomsWereReordered;
+//    }
+//    /**
+//     * Set whether atoms were reordered during the most recent force/energy computation.
+//     */
+//    void setAtomsWereReordered(bool wereReordered) {
+//        atomsWereReordered = wereReordered;
+//    }
+//    /**
+//     * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
+//     * together in the arrays.
+//     * 
+//     * @param enforcePeriodic    if true, the atom positions may be altered to enforce periodic boundary conditions
+//     */
+//    void reorderAtoms(bool enforcePeriodic);
+//    /**
+//     * Add a listener that should be called whenever atoms get reordered.  The CudaContext
+//     * assumes ownership of the object, and deletes it when the context itself is deleted.
+//     */
+//    void addReorderListener(ReorderListener* listener);
+//    /**
+//     * Get the list of ReorderListeners.
+//     */
+//    std::vector<ReorderListener*>& getReorderListeners() {
+//        return reorderListeners;
+//    }
+//    /**
+//     * Mark that the current molecule definitions (and hence the atom order) may be invalid.
+//     * This should be called whenever force field parameters change.  It will cause the definitions
+//     * and order to be revalidated the next to reorderAtoms() is called.
+//     */
+//    void invalidateMolecules();
+//    /**
+//     * Get whether the current molecule definitions are valid.
+//     */
+//    bool getMoleculesAreInvalid() {
+//        return moleculesInvalid;
+//    }
+private:
+    struct Molecule;
+    struct MoleculeGroup;
+    class VirtualSiteInfo;
+//    void findMoleculeGroups();
+//    static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
+//    /**
+//     * Ensure that all molecules marked as "identical" really are identical.  This should be
+//     * called whenever force field parameters change.  If necessary, it will rebuild the list
+//     * of molecules and resort the atoms.
+//     */
+//    void validateMolecules();
+    static bool hasInitializedCuda;
+    const System& system;
+    double time;
+    CudaPlatform::PlatformData& platformData;
+    int deviceIndex;
+    int contextIndex;
+    int stepCount;
+    int computeForceCount;
+    int numAtoms;
+    int paddedNumAtoms;
+    int numAtomBlocks;
+    int numThreadBlocks;
+//    int numForceBuffers;
+//    int simdWidth;
+    bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
+    std::string compiler, tempDir, gpuArchitecture;
+    float4 periodicBoxSize;
+    float4 invPeriodicBoxSize;
+    std::string defaultOptimizationOptions;
+    std::map<std::string, std::string> compilationDefines;
+    CUcontext context;
+    CUdevice device;
+    CUfunction clearBufferKernel;
+    CUfunction clearTwoBuffersKernel;
+    CUfunction clearThreeBuffersKernel;
+    CUfunction clearFourBuffersKernel;
+    CUfunction clearFiveBuffersKernel;
+    CUfunction clearSixBuffersKernel;
+    CUfunction reduceFloat4Kernel;
+    CUfunction reduceForcesKernel;
+    std::vector<CudaForceInfo*> forces;
+    std::vector<Molecule> molecules;
+    std::vector<MoleculeGroup> moleculeGroups;
+    std::vector<int4> posCellOffsets;
+    CudaArray* posq;
+    CudaArray* velm;
+//    CudaArray<mm_float4>* force;
+//    CudaArray<mm_float4>* forceBuffers;
+//    CudaArray<cl_long>* longForceBuffer;
+//    CudaArray<cl_float>* energyBuffer;
+//    CudaArray<cl_int>* atomIndex;
+//    std::vector<cl::Memory*> autoclearBuffers;
+//    std::vector<int> autoclearBufferSizes;
+    std::vector<ReorderListener*> reorderListeners;
+//    CudaIntegrationUtilities* integration;
+//    CudaBondedUtilities* bonded;
+//    CudaNonbondedUtilities* nonbonded;
+    WorkThread* thread;
+};
+struct CudaContext::Molecule {
+    std::vector<int> atoms;
+    std::vector<int> constraints;
+    std::vector<std::vector<int> > groups;
+};
+struct CudaContext::MoleculeGroup {
+    std::vector<int> atoms;
+    std::vector<int> instances;
+    std::vector<int> offsets;
+};
+/**
+ * This abstract class defines a task to be executed on the worker thread.
+ */
+class CudaContext::WorkTask {
+public:
+    virtual void execute() = 0;
+    virtual ~WorkTask() {
+    }
+};
+class CudaContext::WorkThread {
+public:
+    struct ThreadData;
+    WorkThread();
+    ~WorkThread();
+    /**
+     * Request that a task be executed on the worker thread.  The argument should have been allocated on the
+     * heap with the "new" operator.  After its execute() method finishes, the object will be deleted automatically.
+     */
+    void addTask(CudaContext::WorkTask* task);
+    /**
+     * Get whether the worker thread is idle, waiting for a task to be added.
+     */
+    bool isWaiting();
+    /**
+     * Get whether the worker thread has exited.
+     */
+    bool isFinished();
+    /**
+     * Block until all tasks have finished executing and the worker thread is idle.
+     */
+    void flush();
+private:
+    std::queue<CudaContext::WorkTask*> tasks;
+    bool waiting, finished;
+    pthread_mutex_t queueLock;
+    pthread_cond_t waitForTaskCondition, queueEmptyCondition;
+    pthread_t thread;
+};
+/**
+ * This abstract class defines a function to be executed whenever atoms get reordered.
+ * Objects that need to know when reordering happens should create a reorderListener
+ * and register it by calling addReorderListener().
+ */
+class CudaContext::ReorderListener {
+public:
+    virtual void execute() = 0;
+    virtual ~ReorderListener() {
+    }
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDACONTEXT_H_*/
--- a/platforms/cuda2/src/CudaExpressionUtilities.cpp
+++ b/platforms/cuda2/src/CudaExpressionUtilities.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaExpressionUtilities.h"
+#include "openmm/OpenMMException.h"
+#include "openmm/internal/SplineFitter.h"
+#include "lepton/Operation.h"
+using namespace OpenMM;
+using namespace Lepton;
+using namespace std;
+string CudaExpressionUtilities::doubleToString(double value) {
+    stringstream s;
+    s.precision(8);
+    s << scientific << value << "f";
+    return s.str();
+}
+string CudaExpressionUtilities::intToString(int value) {
+    stringstream s;
+    s << value;
+    return s.str();
+}
+string CudaExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const map<string, string>& variables,
+        const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const string& tempType) {
+    vector<pair<ExpressionTreeNode, string> > variableNodes;
+    for (map<string, string>::const_iterator iter = variables.begin(); iter != variables.end(); ++iter)
+        variableNodes.push_back(make_pair(ExpressionTreeNode(new Operation::Variable(iter->first)), iter->second));
+    return createExpressions(expressions, variableNodes, functions, prefix, functionParams, tempType);
+}
+string CudaExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const vector<pair<ExpressionTreeNode, string> >& variables,
+        const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const string& tempType) {
+    stringstream out;
+    vector<ParsedExpression> allExpressions;
+    for (map<string, ParsedExpression>::const_iterator iter = expressions.begin(); iter != expressions.end(); ++iter)
+        allExpressions.push_back(iter->second);
+    vector<pair<ExpressionTreeNode, string> > temps = variables;
+    for (map<string, ParsedExpression>::const_iterator iter = expressions.begin(); iter != expressions.end(); ++iter) {
+        processExpression(out, iter->second.getRootNode(), temps, functions, prefix, functionParams, allExpressions, tempType);
+        out << iter->first << getTempName(iter->second.getRootNode(), temps) << ";\n";
+    }
+    return out.str();
+}
+void CudaExpressionUtilities::processExpression(stringstream& out, const ExpressionTreeNode& node, vector<pair<ExpressionTreeNode, string> >& temps,
+        const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const vector<ParsedExpression>& allExpressions, const string& tempType) {
+    for (int i = 0; i < (int) temps.size(); i++)
+        if (temps[i].first == node)
+            return;
+    for (int i = 0; i < (int) node.getChildren().size(); i++)
+        processExpression(out, node.getChildren()[i], temps, functions, prefix, functionParams, allExpressions, tempType);
+    string name = prefix+intToString(temps.size());
+    bool hasRecordedNode = false;
+    out << tempType << " " << name << " = ";
+    switch (node.getOperation().getId()) {
+        case Operation::CONSTANT:
+            out << doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
+            break;
+        case Operation::VARIABLE:
+            throw OpenMMException("Unknown variable in expression: "+node.getOperation().getName());
+        case Operation::CUSTOM:
+        {
+            int i;
+            for (i = 0; i < (int) functions.size() && functions[i].first != node.getOperation().getName(); i++)
+                ;
+            if (i == functions.size())
+                throw OpenMMException("Unknown function in expression: "+node.getOperation().getName());
+            bool isDeriv = (dynamic_cast<const Operation::Custom*>(&node.getOperation())->getDerivOrder()[0] == 1);
+            out << "0.0f;\n";
+            temps.push_back(make_pair(node, name));
+            hasRecordedNode = true;
+            // If both the value and derivative of the function are needed, it's faster to calculate them both
+            // at once, so check to see if both are needed.
+            const ExpressionTreeNode* valueNode = NULL;
+            const ExpressionTreeNode* derivNode = NULL;
+            for (int j = 0; j < (int) allExpressions.size(); j++)
+                findRelatedTabulatedFunctions(node, allExpressions[j].getRootNode(), valueNode, derivNode);
+            string valueName = name;
+            string derivName = name;
+            if (valueNode != NULL && derivNode != NULL) {
+                string name2 = prefix+intToString(temps.size());
+                out << tempType << " " << name2 << " = 0.0f;\n";
+                if (isDeriv) {
+                    valueName = name2;
+                    temps.push_back(make_pair(*valueNode, name2));
+                }
+                else {
+                    derivName = name2;
+                    temps.push_back(make_pair(*derivNode, name2));
+                }
+            }
+            out << "{\n";
+            out << "float4 params = " << functionParams << "[" << i << "];\n";
+            out << "float x = " << getTempName(node.getChildren()[0], temps) << ";\n";
+            out << "if (x >= params.x && x <= params.y) {\n";
+            out << "x = (x-params.x)*params.z;\n";
+            out << "int index = (int) (floor(x));\n";
+            out << "index = min(index, (int) params.w);\n";
+            out << "float4 coeff = " << functions[i].second << "[index];\n";
+            out << "float b = x-index;\n";
+            out << "float a = 1.0f-b;\n";
+            if (valueNode != NULL)
+                out << valueName << " = a*coeff.x+b*coeff.y+((a*a*a-a)*coeff.z+(b*b*b-b)*coeff.w)/(params.z*params.z);\n";
+            if (derivNode != NULL)
+                out << derivName << " = (coeff.y-coeff.x)*params.z+((1.0f-3.0f*a*a)*coeff.z+(3.0f*b*b-1.0f)*coeff.w)/params.z;\n";
+            out << "}\n";
+            out << "}";
+            break;
+        }
+        case Operation::ADD:
+            out << getTempName(node.getChildren()[0], temps) << "+" << getTempName(node.getChildren()[1], temps);
+            break;
+        case Operation::SUBTRACT:
+            out << getTempName(node.getChildren()[0], temps) << "-" << getTempName(node.getChildren()[1], temps);
+            break;
+        case Operation::MULTIPLY:
+            out << getTempName(node.getChildren()[0], temps) << "*" << getTempName(node.getChildren()[1], temps);
+            break;
+        case Operation::DIVIDE:
+        {
+            bool haveReciprocal = false;
+            for (int i = 0; i < (int) temps.size(); i++)
+                if (temps[i].first.getOperation().getId() == Operation::RECIPROCAL && temps[i].first.getChildren()[0] == node.getChildren()[1]) {
+                    haveReciprocal = true;
+                    out << getTempName(node.getChildren()[0], temps) << "*" << temps[i].second;
+                }
+            if (!haveReciprocal)
+                out << getTempName(node.getChildren()[0], temps) << "/" << getTempName(node.getChildren()[1], temps);
+            break;
+        }
+        case Operation::POWER:
+            out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << getTempName(node.getChildren()[1], temps) << ")";
+            break;
+        case Operation::NEGATE:
+            out << "-" << getTempName(node.getChildren()[0], temps);
+            break;
+        case Operation::SQRT:
+            out << "sqrt(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::EXP:
+            out << "EXP(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::LOG:
+            out << "LOG(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::SIN:
+            out << "sin(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::COS:
+            out << "cos(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::SEC:
+            out << "1.0f/cos(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::CSC:
+            out << "1.0f/sin(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::TAN:
+            out << "tan(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::COT:
+            out << "1.0f/tan(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::ASIN:
+            out << "asin(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::ACOS:
+            out << "acos(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::ATAN:
+            out << "atan(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::SINH:
+            out << "sinh(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::COSH:
+            out << "cosh(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::TANH:
+            out << "tanh(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::ERF:
+            out << "erf(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::ERFC:
+            out << "erfc(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::STEP:
+            out << getTempName(node.getChildren()[0], temps) << " >= 0.0f ? 1.0f : 0.0f";
+            break;
+        case Operation::DELTA:
+            out << getTempName(node.getChildren()[0], temps) << " == 0.0f ? 1.0f : 0.0f";
+            break;
+        case Operation::SQUARE:
+        {
+            string arg = getTempName(node.getChildren()[0], temps);
+            out << arg << "*" << arg;
+            break;
+        }
+        case Operation::CUBE:
+        {
+            string arg = getTempName(node.getChildren()[0], temps);
+            out << arg << "*" << arg << "*" << arg;
+            break;
+        }
+        case Operation::RECIPROCAL:
+            out << "RECIP(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::ADD_CONSTANT:
+            out << doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps);
+            break;
+        case Operation::MULTIPLY_CONSTANT:
+            out << doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps);
+            break;
+        case Operation::POWER_CONSTANT:
+        {
+            double exponent = dynamic_cast<const Operation::PowerConstant*>(&node.getOperation())->getValue();
+            if (exponent == 0.0)
+                out << "1.0f";
+            else if (exponent == (int) exponent) {
+                out << "0.0f;\n";
+                temps.push_back(make_pair(node, name));
+                hasRecordedNode = true;
+                // If multiple integral powers of the same base are needed, it's faster to calculate all of them
+                // at once, so check to see if others are also needed.
+                map<int, const ExpressionTreeNode*> powers;
+                powers[(int) exponent] = &node;
+                for (int j = 0; j < (int) allExpressions.size(); j++)
+                    findRelatedPowers(node, allExpressions[j].getRootNode(), powers);
+                vector<int> exponents;
+                vector<string> names;
+                vector<bool> hasAssigned(powers.size(), false);
+                exponents.push_back((int) fabs(exponent));
+                names.push_back(name);
+                for (map<int, const ExpressionTreeNode*>::const_iterator iter = powers.begin(); iter != powers.end(); ++iter) {
+                    if (iter->first != exponent) {
+                        exponents.push_back(iter->first >= 0 ? iter->first : -iter->first);
+                        string name2 = prefix+intToString(temps.size());
+                        names.push_back(name2);
+                        temps.push_back(make_pair(*iter->second, name2));
+                        out << tempType << " " << name2 << " = 0.0f;\n";
+                    }
+                }
+                out << "{\n";
+                out << "float multiplier = " << (exponent < 0.0 ? "1.0f/" : "") << getTempName(node.getChildren()[0], temps) << ";\n";
+                bool done = false;
+                while (!done) {
+                    done = true;
+                    for (int i = 0; i < (int) exponents.size(); i++) {
+                        if (exponents[i]%2 == 1) {
+                            if (!hasAssigned[i])
+                                out << names[i] << " = multiplier;\n";
+                            else
+                                out << names[i] << " *= multiplier;\n";
+                            hasAssigned[i] = true;
+                        }
+                        exponents[i] >>= 1;
+                        if (exponents[i] != 0)
+                            done = false;
+                    }
+                    if (!done)
+                        out << "multiplier *= multiplier;\n";
+                }
+                out << "}";
+            }
+            else
+                out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << doubleToString(exponent) << ")";
+            break;
+        }
+        case Operation::MIN:
+            out << "min(" << getTempName(node.getChildren()[0], temps) << ", " << getTempName(node.getChildren()[1], temps) << ")";
+            break;
+        case Operation::MAX:
+            out << "max(" << getTempName(node.getChildren()[0], temps) << ", " << getTempName(node.getChildren()[1], temps) << ")";
+            break;
+        case Operation::ABS:
+            out << "fabs(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        default:
+            throw OpenMMException("Internal error: Unknown operation in user-defined expression: "+node.getOperation().getName());
+    }
+    out << ";\n";
+    if (!hasRecordedNode)
+        temps.push_back(make_pair(node, name));
+}
+string CudaExpressionUtilities::getTempName(const ExpressionTreeNode& node, const vector<pair<ExpressionTreeNode, string> >& temps) {
+    for (int i = 0; i < (int) temps.size(); i++)
+        if (temps[i].first == node)
+            return temps[i].second;
+    stringstream out;
+    out << "Internal error: No temporary variable for expression node: " << node;
+    throw OpenMMException(out.str());
+}
+void CudaExpressionUtilities::findRelatedTabulatedFunctions(const ExpressionTreeNode& node, const ExpressionTreeNode& searchNode,
+            const ExpressionTreeNode*& valueNode, const ExpressionTreeNode*& derivNode) {
+    if (searchNode.getOperation().getId() == Operation::CUSTOM && node.getChildren()[0] == searchNode.getChildren()[0]) {
+        if (dynamic_cast<const Operation::Custom*>(&searchNode.getOperation())->getDerivOrder()[0] == 0)
+            valueNode = &searchNode;
+        else
+            derivNode = &searchNode;
+    }
+    else
+        for (int i = 0; i < (int) searchNode.getChildren().size(); i++)
+            findRelatedTabulatedFunctions(node, searchNode.getChildren()[i], valueNode, derivNode);
+}
+void CudaExpressionUtilities::findRelatedPowers(const ExpressionTreeNode& node, const ExpressionTreeNode& searchNode, map<int, const ExpressionTreeNode*>& powers) {
+    if (searchNode.getOperation().getId() == Operation::POWER_CONSTANT && node.getChildren()[0] == searchNode.getChildren()[0]) {
+        double realPower = dynamic_cast<const Operation::PowerConstant*>(&searchNode.getOperation())->getValue();
+        int power = (int) realPower;
+        if (power != realPower)
+            return; // We are only interested in integer powers.
+        if (powers.find(power) != powers.end())
+            return; // This power is already in the map.
+        if (powers.begin()->first*power < 0)
+            return; // All powers must have the same sign.
+        powers[power] = &searchNode;
+    }
+    else
+        for (int i = 0; i < (int) searchNode.getChildren().size(); i++)
+            findRelatedPowers(node, searchNode.getChildren()[i], powers);
+}
+vector<float4> CudaExpressionUtilities::computeFunctionCoefficients(const vector<double>& values, double min, double max) {
+    // Compute the spline coefficients.
+    int numValues = values.size();
+    vector<double> x(numValues), derivs;
+    for (int i = 0; i < numValues; i++)
+        x[i] = min+i*(max-min)/(numValues-1);
+    SplineFitter::createNaturalSpline(x, values, derivs);
+    vector<float4> f(numValues-1);
+    for (int i = 0; i < (int) values.size()-1; i++)
+        f[i] = make_float4((float) values[i], (float) values[i+1], (float) (derivs[i]/6.0), (float) (derivs[i+1]/6.0));
+    return f;
+}
--- a/platforms/cuda2/src/CudaExpressionUtilities.h
+++ b/platforms/cuda2/src/CudaExpressionUtilities.h
+#ifndef OPENMM_CUDAEXPRESSIONUTILITIES_H_
+#define OPENMM_CUDAEXPRESSIONUTILITIES_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaContext.h"
+#include "lepton/CustomFunction.h"
+#include "lepton/ExpressionTreeNode.h"
+#include "lepton/ParsedExpression.h"
+#include <map>
+#include <sstream>
+#include <string>
+#include <utility>
+namespace OpenMM {
+/**
+ * This class is used by various classes to generate CUDA source code implementing
+ * user defined mathematical expressions.
+ */
+class OPENMM_EXPORT CudaExpressionUtilities {
+public:
+    /**
+     * Generate the source code for calculating a set of expressions.
+     *
+     * @param expressions    the expressions to generate code for (keys are the variables to store the output values in)
+     * @param variables      defines the source code to generate for each variable that may appear in the expressions.  Keys are
+     *                       variable names, and the values are the code to generate for them.
+     * @param functions      defines the variable name for each tabulated function that may appear in the expressions
+     * @param prefix         a prefix to put in front of temporary variables
+     * @param functionParams the variable name containing the parameters for each tabulated function
+     * @param tempType       the type of value to use for temporary variables (defaults to "float")
+     */
+    static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables,
+            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
+    /**
+     * Generate the source code for calculating a set of expressions.
+     *
+     * @param expressions    the expressions to generate code for (keys are the variables to store the output values in)
+     * @param variables      defines the source code to generate for each variable or precomputed sub-expression that may appear in the expressions.
+     *                       Each entry is an ExpressionTreeNode, and the code to generate wherever an identical node appears.
+     * @param functions      defines the variable name for each tabulated function that may appear in the expressions
+     * @param prefix         a prefix to put in front of temporary variables
+     * @param functionParams the variable name containing the parameters for each tabulated function
+     * @param tempType       the type of value to use for temporary variables (defaults to "float")
+     */
+    static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
+            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
+    /**
+     * Calculate the spline coefficients for a tabulated function that appears in expressions.
+     *
+     * @param values         the tabulated values of the function
+     * @param min            the value of the independent variable corresponding to the first element of values
+     * @param max            the value of the independent variable corresponding to the last element of values
+     * @return the spline coefficients
+     */
+    static std::vector<float4> computeFunctionCoefficients(const std::vector<double>& values, double min, double max);
+    /**
+     * Convert a number to a string in a format suitable for including in a kernel.
+     */
+    static std::string doubleToString(double value);
+    /**
+     * Convert a number to a string in a format suitable for including in a kernel.
+     */
+    static std::string intToString(int value);
+    class FunctionPlaceholder;
+private:
+    static void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node,
+            std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps,
+            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams,
+            const std::vector<Lepton::ParsedExpression>& allExpressions, const std::string& tempType);
+    static std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
+    static void findRelatedTabulatedFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
+            const Lepton::ExpressionTreeNode*& valueNode, const Lepton::ExpressionTreeNode*& derivNode);
+    static void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
+            std::map<int, const Lepton::ExpressionTreeNode*>& powers);
+};
+/**
+ * This class serves as a placeholder for custom functions in expressions.
+ */
+class CudaExpressionUtilities::FunctionPlaceholder : public Lepton::CustomFunction {
+public:
+    int getNumArguments() const {
+        return 1;
+    }
+    double evaluate(const double* arguments) const {
+        return 0.0;
+    }
+    double evaluateDerivative(const double* arguments, const int* derivOrder) const {
+        return 0.0;
+    }
+    CustomFunction* clone() const {
+        return new FunctionPlaceholder();
+    }
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAEXPRESSIONUTILITIES_H_*/
--- a/platforms/cuda2/src/CudaForceInfo.cpp
+++ b/platforms/cuda2/src/CudaForceInfo.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaForceInfo.h"
+using namespace OpenMM;
+using namespace std;
+bool CudaForceInfo::areParticlesIdentical(int particle1, int particle2) {
+    return true;
+}
+int CudaForceInfo::getNumParticleGroups() {
+    return 0;
+}
+void CudaForceInfo::getParticlesInGroup(int index, vector<int>& particles) {
+    return;
+}
+bool CudaForceInfo::areGroupsIdentical(int group1, int group2) {
+    return true;
+}
--- a/platforms/cuda2/src/CudaForceInfo.h
+++ b/platforms/cuda2/src/CudaForceInfo.h
+#ifndef OPENMM_CUDAFORCEINFO_H_
+#define OPENMM_CUDAFORCEINFO_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/internal/windowsExport.h"
+#include <vector>
+namespace OpenMM {
+/**
+ * This class is used by the Cuda implementation of a Force class to convey information
+ * about the behavior and requirements of that force.
+ */
+class OPENMM_EXPORT CudaForceInfo {
+public:
+    CudaForceInfo(int requiredForceBuffers) : requiredForceBuffers(requiredForceBuffers) {
+    }
+    /**
+     * Get the number of force buffers this force requires.
+     */
+    int getRequiredForceBuffers() {
+        return requiredForceBuffers;
+    }
+    /**
+     * Get whether or not two particles have identical force field parameters.
+     */
+    virtual bool areParticlesIdentical(int particle1, int particle2);
+    /**
+     * Get the number of particle groups defined by this force.
+     */
+    virtual int getNumParticleGroups();
+    /**
+     * Get the list of particles in a particular group.
+     */
+    virtual void getParticlesInGroup(int index, std::vector<int>& particles);
+    /**
+     * Get whether two particle groups are identical.
+     */
+    virtual bool areGroupsIdentical(int group1, int group2);
+private:
+    int requiredForceBuffers;
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAFORCEINFO_H_*/
--- a/platforms/cuda2/src/CudaKernelFactory.cpp
+++ b/platforms/cuda2/src/CudaKernelFactory.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaKernelFactory.h"
+//#include "CudaParallelKernels.h"
+#include "CudaPlatform.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/OpenMMException.h"
+using namespace OpenMM;
+KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
+    CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
+//    if (data.contexts.size() > 1) {
+//        // We are running in parallel on multiple devices, so we may want to create a parallel kernel.
+//        
+//        if (name == CalcForcesAndEnergyKernel::Name())
+//            return new CudaParallelCalcForcesAndEnergyKernel(name, platform, data);
+//        if (name == CalcHarmonicBondForceKernel::Name())
+//            return new CudaParallelCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCustomBondForceKernel::Name())
+//            return new CudaParallelCalcCustomBondForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcHarmonicAngleForceKernel::Name())
+//            return new CudaParallelCalcHarmonicAngleForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCustomAngleForceKernel::Name())
+//            return new CudaParallelCalcCustomAngleForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcPeriodicTorsionForceKernel::Name())
+//            return new CudaParallelCalcPeriodicTorsionForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcRBTorsionForceKernel::Name())
+//            return new CudaParallelCalcRBTorsionForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCMAPTorsionForceKernel::Name())
+//            return new CudaParallelCalcCMAPTorsionForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCustomTorsionForceKernel::Name())
+//            return new CudaParallelCalcCustomTorsionForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcNonbondedForceKernel::Name())
+//            return new CudaParallelCalcNonbondedForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCustomNonbondedForceKernel::Name())
+//            return new CudaParallelCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCustomExternalForceKernel::Name())
+//            return new CudaParallelCalcCustomExternalForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCustomHbondForceKernel::Name())
+//            return new CudaParallelCalcCustomHbondForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCustomCompoundBondForceKernel::Name())
+//            return new CudaParallelCalcCustomCompoundBondForceKernel(name, platform, data, context.getSystem());
+//    }
+//    CudaContext& cl = *data.contexts[0];
+//    if (name == CalcForcesAndEnergyKernel::Name())
+//        return new CudaCalcForcesAndEnergyKernel(name, platform, cl);
+//    if (name == UpdateStateDataKernel::Name())
+//        return new CudaUpdateStateDataKernel(name, platform, cl);
+//    if (name == ApplyConstraintsKernel::Name())
+//        return new CudaApplyConstraintsKernel(name, platform, cl);
+//    if (name == VirtualSitesKernel::Name())
+//        return new CudaVirtualSitesKernel(name, platform, cl);
+//    if (name == CalcHarmonicBondForceKernel::Name())
+//        return new CudaCalcHarmonicBondForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCustomBondForceKernel::Name())
+//        return new CudaCalcCustomBondForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcHarmonicAngleForceKernel::Name())
+//        return new CudaCalcHarmonicAngleForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCustomAngleForceKernel::Name())
+//        return new CudaCalcCustomAngleForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcPeriodicTorsionForceKernel::Name())
+//        return new CudaCalcPeriodicTorsionForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcRBTorsionForceKernel::Name())
+//        return new CudaCalcRBTorsionForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCMAPTorsionForceKernel::Name())
+//        return new CudaCalcCMAPTorsionForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCustomTorsionForceKernel::Name())
+//        return new CudaCalcCustomTorsionForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcNonbondedForceKernel::Name())
+//        return new CudaCalcNonbondedForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCustomNonbondedForceKernel::Name())
+//        return new CudaCalcCustomNonbondedForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcGBSAOBCForceKernel::Name())
+//        return new CudaCalcGBSAOBCForceKernel(name, platform, cl);
+//    if (name == CalcCustomGBForceKernel::Name())
+//        return new CudaCalcCustomGBForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCustomExternalForceKernel::Name())
+//        return new CudaCalcCustomExternalForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCustomHbondForceKernel::Name())
+//        return new CudaCalcCustomHbondForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCustomCompoundBondForceKernel::Name())
+//        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cl, context.getSystem());
+//    if (name == IntegrateVerletStepKernel::Name())
+//        return new CudaIntegrateVerletStepKernel(name, platform, cl);
+//    if (name == IntegrateLangevinStepKernel::Name())
+//        return new CudaIntegrateLangevinStepKernel(name, platform, cl);
+//    if (name == IntegrateBrownianStepKernel::Name())
+//        return new CudaIntegrateBrownianStepKernel(name, platform, cl);
+//    if (name == IntegrateVariableVerletStepKernel::Name())
+//        return new CudaIntegrateVariableVerletStepKernel(name, platform, cl);
+//    if (name == IntegrateVariableLangevinStepKernel::Name())
+//        return new CudaIntegrateVariableLangevinStepKernel(name, platform, cl);
+//    if (name == IntegrateCustomStepKernel::Name())
+//        return new CudaIntegrateCustomStepKernel(name, platform, cl);
+//    if (name == ApplyAndersenThermostatKernel::Name())
+//        return new CudaApplyAndersenThermostatKernel(name, platform, cl);
+//    if (name == ApplyMonteCarloBarostatKernel::Name())
+//        return new CudaApplyMonteCarloBarostatKernel(name, platform, cl);
+//    if (name == CalcKineticEnergyKernel::Name())
+//        return new CudaCalcKineticEnergyKernel(name, platform, cl);
+//    if (name == RemoveCMMotionKernel::Name())
+//        return new CudaRemoveCMMotionKernel(name, platform, cl);
+    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
+}
--- a/platforms/cuda2/src/CudaKernelSources.cpp.in
+++ b/platforms/cuda2/src/CudaKernelSources.cpp.in
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaKernelSources.h"
+using namespace OpenMM;
+using namespace std;
--- a/platforms/cuda2/src/CudaKernelSources.h.in
+++ b/platforms/cuda2/src/CudaKernelSources.h.in
+#ifndef OPENMM_CUDAKERNELSOURCES_H_
+#define OPENMM_CUDAKERNELSOURCES_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/internal/windowsExport.h"
+#include <string>
+namespace OpenMM {
+/**
+ * This class is a central holding place for the source code of CUDA kernels.
+ * The CMake build script inserts declarations into it based on the .cu files in the
+ * kernels subfolder.
+ */
+class OPENMM_EXPORT CudaKernelSources {
+public:
+@CUDA_FILE_DECLARATIONS@
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAKERNELSOURCES_H_*/
--- a/platforms/cuda2/src/CudaKernels.h
+++ b/platforms/cuda2/src/CudaKernels.h
+#ifndef OPENMM_CUDAKERNELS_H_
+#define OPENMM_CUDAKERNELS_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaPlatform.h"
+//#include "CudaArray.h"
+#include "CudaContext.h"
+//#include "CudaFFT3D.h"
+//#include "CudaParameterSet.h"
+//#include "CudaSort.h"
+#include "openmm/kernels.h"
+#include "openmm/System.h"
+namespace OpenMM {
+///**
+// * This kernel is invoked at the beginning and end of force and energy computations.  It gives the
+// * Platform a chance to clear buffers and do other initialization at the beginning, and to do any
+// * necessary work at the end to determine the final results.
+// */
+//class CudaCalcForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
+//public:
+//    CudaCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CudaContext& cl) : CalcForcesAndEnergyKernel(name, platform), cl(cl) {
+//    }
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     */
+//    void initialize(const System& system);
+//    /**
+//     * This is called at the beginning of each force/energy computation, before calcForcesAndEnergy() has been called on
+//     * any ForceImpl.
+//     *
+//     * @param context       the context in which to execute this kernel
+//     * @param includeForce  true if forces should be computed
+//     * @param includeEnergy true if potential energy should be computed
+//     * @param groups        a set of bit flags for which force groups to include
+//     */
+//    void beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
+//    /**
+//     * This is called at the end of each force/energy computation, after calcForcesAndEnergy() has been called on
+//     * every ForceImpl.
+//     *
+//     * @param context       the context in which to execute this kernel
+//     * @param includeForce  true if forces should be computed
+//     * @param includeEnergy true if potential energy should be computed
+//     * @param groups        a set of bit flags for which force groups to include
+//     * @return the potential energy of the system.  This value is added to all values returned by ForceImpls'
+//     * calcForcesAndEnergy() methods.  That is, each force kernel may <i>either</i> return its contribution to the
+//     * energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
+//     */
+//    double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
+//private:
+//   CudaContext& cl;
+//};
+//
+///**
+// * This kernel provides methods for setting and retrieving various state data: time, positions,
+// * velocities, and forces.
+// */
+//class CudaUpdateStateDataKernel : public UpdateStateDataKernel {
+//public:
+//    CudaUpdateStateDataKernel(std::string name, const Platform& platform, CudaContext& cl) : UpdateStateDataKernel(name, platform), cl(cl) {
+//    }
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     */
+//    void initialize(const System& system);
+//    /**
+//     * Get the current time (in picoseconds).
+//     *
+//     * @param context    the context in which to execute this kernel
+//     */
+//    double getTime(const ContextImpl& context) const;
+//    /**
+//     * Set the current time (in picoseconds).
+//     *
+//     * @param context    the context in which to execute this kernel
+//     */
+//    void setTime(ContextImpl& context, double time);
+//    /**
+//     * Get the positions of all particles.
+//     *
+//     * @param positions  on exit, this contains the particle positions
+//     */
+//    void getPositions(ContextImpl& context, std::vector<Vec3>& positions);
+//    /**
+//     * Set the positions of all particles.
+//     *
+//     * @param positions  a vector containg the particle positions
+//     */
+//    void setPositions(ContextImpl& context, const std::vector<Vec3>& positions);
+//    /**
+//     * Get the velocities of all particles.
+//     *
+//     * @param velocities  on exit, this contains the particle velocities
+//     */
+//    void getVelocities(ContextImpl& context, std::vector<Vec3>& velocities);
+//    /**
+//     * Set the velocities of all particles.
+//     *
+//     * @param velocities  a vector containg the particle velocities
+//     */
+//    void setVelocities(ContextImpl& context, const std::vector<Vec3>& velocities);
+//    /**
+//     * Get the current forces on all particles.
+//     *
+//     * @param forces  on exit, this contains the forces
+//     */
+//    void getForces(ContextImpl& context, std::vector<Vec3>& forces);
+//    /**
+//     * Get the current periodic box vectors.
+//     *
+//     * @param a      on exit, this contains the vector defining the first edge of the periodic box
+//     * @param b      on exit, this contains the vector defining the second edge of the periodic box
+//     * @param c      on exit, this contains the vector defining the third edge of the periodic box
+//     */
+//    void getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const;
+//    /**
+//     * Set the current periodic box vectors.
+//     *
+//     * @param a      the vector defining the first edge of the periodic box
+//     * @param b      the vector defining the second edge of the periodic box
+//     * @param c      the vector defining the third edge of the periodic box
+//     */
+//    void setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) const;
+//    /**
+//     * Create a checkpoint recording the current state of the Context.
+//     * 
+//     * @param stream    an output stream the checkpoint data should be written to
+//     */
+//    void createCheckpoint(ContextImpl& context, std::ostream& stream);
+//    /**
+//     * Load a checkpoint that was written by createCheckpoint().
+//     * 
+//     * @param stream    an input stream the checkpoint data should be read from
+//     */
+//    void loadCheckpoint(ContextImpl& context, std::istream& stream);
+//private:
+//    CudaContext& cl;
+//};
+//
+///**
+// * This kernel modifies the positions of particles to enforce distance constraints.
+// */
+//class CudaApplyConstraintsKernel : public ApplyConstraintsKernel {
+//public:
+//    CudaApplyConstraintsKernel(std::string name, const Platform& platform, CudaContext& cl) : ApplyConstraintsKernel(name, platform),
+//            cl(cl), hasInitializedKernel(false) {
+//    }
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     */
+//    void initialize(const System& system);
+//    /**
+//     * Update particle positions to enforce constraints.
+//     *
+//     * @param context    the context in which to execute this kernel
+//     * @param tol        the distance tolerance within which constraints must be satisfied.
+//     */
+//    void apply(ContextImpl& context, double tol);
+//private:
+//    CudaContext& cl;
+//    bool hasInitializedKernel;
+//    cl::Kernel applyDeltasKernel;
+//};
+//
+///**
+// * This kernel recomputes the positions of virtual sites.
+// */
+//class CudaVirtualSitesKernel : public VirtualSitesKernel {
+//public:
+//    CudaVirtualSitesKernel(std::string name, const Platform& platform, CudaContext& cl) : VirtualSitesKernel(name, platform), cl(cl) {
+//    }
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     */
+//    void initialize(const System& system);
+//    /**
+//     * Compute the virtual site locations.
+//     *
+//     * @param context    the context in which to execute this kernel
+//     */
+//    void computePositions(ContextImpl& context);
+//private:
+//    CudaContext& cl;
+//};
+//
+///**
+// * This kernel is invoked by HarmonicBondForce to calculate the forces acting on the system and the energy of the system.
+// */
+//class CudaCalcHarmonicBondForceKernel : public CalcHarmonicBondForceKernel {
+//public:
+//    CudaCalcHarmonicBondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcHarmonicBondForceKernel(name, platform),
+//            hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
+//    }
+//    ~CudaCalcHarmonicBondForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the HarmonicBondForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const HarmonicBondForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//private:
+//    int numBonds;
+//    bool hasInitializedKernel;
+//    CudaContext& cl;
+//    System& system;
+//    CudaArray<mm_float2>* params;
+//};
+//
+///**
+// * This kernel is invoked by CustomBondForce to calculate the forces acting on the system and the energy of the system.
+// */
+//class CudaCalcCustomBondForceKernel : public CalcCustomBondForceKernel {
+//public:
+//    CudaCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomBondForceKernel(name, platform),
+//            hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
+//    }
+//    ~CudaCalcCustomBondForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the CustomBondForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const CustomBondForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//private:
+//    int numBonds;
+//    bool hasInitializedKernel;
+//    CudaContext& cl;
+//    System& system;
+//    CudaParameterSet* params;
+//    CudaArray<cl_float>* globals;
+//    std::vector<std::string> globalParamNames;
+//    std::vector<cl_float> globalParamValues;
+//};
+//
+///**
+// * This kernel is invoked by HarmonicAngleForce to calculate the forces acting on the system and the energy of the system.
+// */
+//class CudaCalcHarmonicAngleForceKernel : public CalcHarmonicAngleForceKernel {
+//public:
+//    CudaCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcHarmonicAngleForceKernel(name, platform),
+//            hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
+//    }
+//    ~CudaCalcHarmonicAngleForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the HarmonicAngleForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const HarmonicAngleForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//private:
+//    int numAngles;
+//    bool hasInitializedKernel;
+//    CudaContext& cl;
+//    System& system;
+//    CudaArray<mm_float2>* params;
+//};
+//
+///**
+// * This kernel is invoked by CustomAngleForce to calculate the forces acting on the system and the energy of the system.
+// */
+//class CudaCalcCustomAngleForceKernel : public CalcCustomAngleForceKernel {
+//public:
+//    CudaCalcCustomAngleForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomAngleForceKernel(name, platform),
+//            hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
+//    }
+//    ~CudaCalcCustomAngleForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the CustomAngleForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const CustomAngleForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//private:
+//    int numAngles;
+//    bool hasInitializedKernel;
+//    CudaContext& cl;
+//    System& system;
+//    CudaParameterSet* params;
+//    CudaArray<cl_float>* globals;
+//    std::vector<std::string> globalParamNames;
+//    std::vector<cl_float> globalParamValues;
+//};
+//
+///**
+// * This kernel is invoked by PeriodicTorsionForce to calculate the forces acting on the system and the energy of the system.
+// */
+//class CudaCalcPeriodicTorsionForceKernel : public CalcPeriodicTorsionForceKernel {
+//public:
+//    CudaCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcPeriodicTorsionForceKernel(name, platform),
+//            hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
+//    }
+//    ~CudaCalcPeriodicTorsionForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the PeriodicTorsionForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const PeriodicTorsionForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//private:
+//    int numTorsions;
+//    bool hasInitializedKernel;
+//    CudaContext& cl;
+//    System& system;
+//    CudaArray<mm_float4>* params;
+//};
+//
+///**
+// * This kernel is invoked by RBTorsionForce to calculate the forces acting on the system and the energy of the system.
+// */
+//class CudaCalcRBTorsionForceKernel : public CalcRBTorsionForceKernel {
+//public:
+//    CudaCalcRBTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcRBTorsionForceKernel(name, platform),
+//            hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
+//    }
+//    ~CudaCalcRBTorsionForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the RBTorsionForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const RBTorsionForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//private:
+//    int numTorsions;
+//    bool hasInitializedKernel;
+//    CudaContext& cl;
+//    System& system;
+//    CudaArray<mm_float8>* params;
+//};
+//
+///**
+// * This kernel is invoked by CMAPTorsionForce to calculate the forces acting on the system and the energy of the system.
+// */
+//class CudaCalcCMAPTorsionForceKernel : public CalcCMAPTorsionForceKernel {
+//public:
+//    CudaCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCMAPTorsionForceKernel(name, platform),
+//            hasInitializedKernel(false), cl(cl), system(system), coefficients(NULL), mapPositions(NULL), torsionMaps(NULL) {
+//    }
+//    ~CudaCalcCMAPTorsionForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the CMAPTorsionForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const CMAPTorsionForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//private:
+//    int numTorsions;
+//    bool hasInitializedKernel;
+//    CudaContext& cl;
+//    System& system;
+//    CudaArray<mm_float4>* coefficients;
+//    CudaArray<mm_int2>* mapPositions;
+//    CudaArray<cl_int>* torsionMaps;
+//};
+//
+///**
+// * This kernel is invoked by CustomTorsionForce to calculate the forces acting on the system and the energy of the system.
+// */
+//class CudaCalcCustomTorsionForceKernel : public CalcCustomTorsionForceKernel {
+//public:
+//    CudaCalcCustomTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomTorsionForceKernel(name, platform),
+//            hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
+//    }
+//    ~CudaCalcCustomTorsionForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the CustomTorsionForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const CustomTorsionForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//private:
+//    int numTorsions;
+//    bool hasInitializedKernel;
+//    CudaContext& cl;
+//    System& system;
+//    CudaParameterSet* params;
+//    CudaArray<cl_float>* globals;
+//    std::vector<std::string> globalParamNames;
+//    std::vector<cl_float> globalParamValues;
+//};
+//
+///**
+// * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
+// */
+//class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
+//public:
+//    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcNonbondedForceKernel(name, platform),
+//            hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
+//            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
+//            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL) {
+//    }
+//    ~CudaCalcNonbondedForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the NonbondedForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const NonbondedForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @param includeDirect  true if direct space interactions should be included
+//     * @param includeReciprocal  true if reciprocal space interactions should be included
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
+//    /**
+//     * Copy changed parameters over to a context.
+//     *
+//     * @param context    the context to copy parameters to
+//     * @param force      the NonbondedForce to copy the parameters from
+//     */
+//    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
+//private:
+//    struct SortTrait {
+//        typedef mm_int2 DataType;
+//        typedef cl_int KeyType;
+//        static const char* clDataType() {return "int2";}
+//        static const char* clKeyType() {return "int";}
+//        static const char* clMinKey() {return "INT_MIN";}
+//        static const char* clMaxKey() {return "INT_MAX";}
+//        static const char* clMaxValue() {return "(int2) (INT_MAX, INT_MAX)";}
+//        static const char* clSortKey() {return "value.y";}
+//    };
+//    CudaContext& cl;
+//    bool hasInitializedKernel;
+//    CudaArray<mm_float2>* sigmaEpsilon;
+//    CudaArray<mm_float4>* exceptionParams;
+//    CudaArray<mm_float2>* cosSinSums;
+//    CudaArray<mm_float2>* pmeGrid;
+//    CudaArray<mm_float2>* pmeGrid2;
+//    CudaArray<cl_float>* pmeBsplineModuliX;
+//    CudaArray<cl_float>* pmeBsplineModuliY;
+//    CudaArray<cl_float>* pmeBsplineModuliZ;
+//    CudaArray<mm_float4>* pmeBsplineTheta;
+//    CudaArray<mm_float4>* pmeBsplineDTheta;
+//    CudaArray<cl_int>* pmeAtomRange;
+//    CudaArray<mm_int2>* pmeAtomGridIndex;
+//    CudaSort<SortTrait>* sort;
+//    CudaFFT3D* fft;
+//    cl::Kernel ewaldSumsKernel;
+//    cl::Kernel ewaldForcesKernel;
+//    cl::Kernel pmeGridIndexKernel;
+//    cl::Kernel pmeAtomRangeKernel;
+//    cl::Kernel pmeZIndexKernel;
+//    cl::Kernel pmeUpdateBsplinesKernel;
+//    cl::Kernel pmeSpreadChargeKernel;
+//    cl::Kernel pmeFinishSpreadChargeKernel;
+//    cl::Kernel pmeConvolutionKernel;
+//    cl::Kernel pmeInterpolateForceKernel;
+//    std::map<std::string, std::string> pmeDefines;
+//    double ewaldSelfEnergy, dispersionCoefficient, alpha;
+//    int interpolateForceThreads;
+//    bool hasCoulomb, hasLJ;
+//    static const int PmeOrder = 5;
+//};
+//
+///**
+// * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
+// */
+//class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
+//public:
+//    CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomNonbondedForceKernel(name, platform),
+//            hasInitializedKernel(false), cl(cl), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
+//    }
+//    ~CudaCalcCustomNonbondedForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the CustomNonbondedForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const CustomNonbondedForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//private:
+//    bool hasInitializedKernel;
+//    CudaContext& cl;
+//    CudaParameterSet* params;
+//    CudaArray<cl_float>* globals;
+//    CudaArray<mm_float4>* tabulatedFunctionParams;
+//    std::vector<std::string> globalParamNames;
+//    std::vector<cl_float> globalParamValues;
+//    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
+//    System& system;
+//};
+//
+///**
+// * This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
+// */
+//class CudaCalcGBSAOBCForceKernel : public CalcGBSAOBCForceKernel {
+//public:
+//    CudaCalcGBSAOBCForceKernel(std::string name, const Platform& platform, CudaContext& cl) : CalcGBSAOBCForceKernel(name, platform), cl(cl),
+//            hasCreatedKernels(false), params(NULL), bornSum(NULL), longBornSum(NULL), bornRadii(NULL), bornForce(NULL),
+//            longBornForce(NULL), obcChain(NULL) {
+//    }
+//    ~CudaCalcGBSAOBCForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the GBSAOBCForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const GBSAOBCForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//private:
+//    double prefactor;
+//    bool hasCreatedKernels;
+//    int maxTiles;
+//    CudaContext& cl;
+//    CudaArray<mm_float2>* params;
+//    CudaArray<cl_float>* bornSum;
+//    CudaArray<cl_long>* longBornSum;
+//    CudaArray<cl_float>* bornRadii;
+//    CudaArray<cl_float>* bornForce;
+//    CudaArray<cl_long>* longBornForce;
+//    CudaArray<cl_float>* obcChain;
+//    cl::Kernel computeBornSumKernel;
+//    cl::Kernel reduceBornSumKernel;
+//    cl::Kernel force1Kernel;
+//    cl::Kernel reduceBornForceKernel;
+//};
+//
+///**
+// * This kernel is invoked by CustomGBForce to calculate the forces acting on the system.
+// */
+//class CudaCalcCustomGBForceKernel : public CalcCustomGBForceKernel {
+//public:
+//    CudaCalcCustomGBForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomGBForceKernel(name, platform),
+//            hasInitializedKernels(false), cl(cl), params(NULL), computedValues(NULL), energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL),
+//            valueBuffers(NULL), longValueBuffers(NULL), tabulatedFunctionParams(NULL), system(system) {
+//    }
+//    ~CudaCalcCustomGBForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the CustomGBForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const CustomGBForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//private:
+//    bool hasInitializedKernels, needParameterGradient;
+//    int maxTiles, numComputedValues;
+//    CudaContext& cl;
+//    CudaParameterSet* params;
+//    CudaParameterSet* computedValues;
+//    CudaParameterSet* energyDerivs;
+//    CudaArray<cl_long>* longEnergyDerivs;
+//    CudaArray<cl_float>* globals;
+//    CudaArray<cl_float>* valueBuffers;
+//    CudaArray<cl_long>* longValueBuffers;
+//    CudaArray<mm_float4>* tabulatedFunctionParams;
+//    std::vector<std::string> globalParamNames;
+//    std::vector<cl_float> globalParamValues;
+//    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
+//    std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue;
+//    System& system;
+//    cl::Kernel pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
+//};
+//
+///**
+// * This kernel is invoked by CustomExternalForce to calculate the forces acting on the system and the energy of the system.
+// */
+//class CudaCalcCustomExternalForceKernel : public CalcCustomExternalForceKernel {
+//public:
+//    CudaCalcCustomExternalForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomExternalForceKernel(name, platform),
+//            hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
+//    }
+//    ~CudaCalcCustomExternalForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the CustomExternalForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const CustomExternalForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//private:
+//    int numParticles;
+//    bool hasInitializedKernel;
+//    CudaContext& cl;
+//    System& system;
+//    CudaParameterSet* params;
+//    CudaArray<cl_float>* globals;
+//    std::vector<std::string> globalParamNames;
+//    std::vector<cl_float> globalParamValues;
+//};
+//
+///**
+// * This kernel is invoked by CustomHbondForce to calculate the forces acting on the system.
+// */
+//class CudaCalcCustomHbondForceKernel : public CalcCustomHbondForceKernel {
+//public:
+//    CudaCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomHbondForceKernel(name, platform),
+//            hasInitializedKernel(false), cl(cl), donorParams(NULL), acceptorParams(NULL), donors(NULL), acceptors(NULL),
+//            donorBufferIndices(NULL), acceptorBufferIndices(NULL), globals(NULL), donorExclusions(NULL), acceptorExclusions(NULL),
+//            tabulatedFunctionParams(NULL), system(system) {
+//    }
+//    ~CudaCalcCustomHbondForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the CustomHbondForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const CustomHbondForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//private:
+//    int numDonors, numAcceptors;
+//    bool hasInitializedKernel;
+//    CudaContext& cl;
+//    CudaParameterSet* donorParams;
+//    CudaParameterSet* acceptorParams;
+//    CudaArray<cl_float>* globals;
+//    CudaArray<mm_int4>* donors;
+//    CudaArray<mm_int4>* acceptors;
+//    CudaArray<mm_int4>* donorBufferIndices;
+//    CudaArray<mm_int4>* acceptorBufferIndices;
+//    CudaArray<mm_int4>* donorExclusions;
+//    CudaArray<mm_int4>* acceptorExclusions;
+//    CudaArray<mm_float4>* tabulatedFunctionParams;
+//    std::vector<std::string> globalParamNames;
+//    std::vector<cl_float> globalParamValues;
+//    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
+//    System& system;
+//    cl::Kernel donorKernel, acceptorKernel;
+//};
+//
+///**
+// * This kernel is invoked by CustomCompoundBondForce to calculate the forces acting on the system.
+// */
+//class CudaCalcCustomCompoundBondForceKernel : public CalcCustomCompoundBondForceKernel {
+//public:
+//    CudaCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomCompoundBondForceKernel(name, platform),
+//            cl(cl), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
+//    }
+//    ~CudaCalcCustomCompoundBondForceKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the CustomCompoundBondForce this kernel will be used for
+//     */
+//    void initialize(const System& system, const CustomCompoundBondForce& force);
+//    /**
+//     * Execute the kernel to calculate the forces and/or energy.
+//     *
+//     * @param context        the context in which to execute this kernel
+//     * @param includeForces  true if forces should be calculated
+//     * @param includeEnergy  true if the energy should be calculated
+//     * @return the potential energy due to the force
+//     */
+//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//private:
+//    int numBonds;
+//    CudaContext& cl;
+//    CudaParameterSet* params;
+//    CudaArray<cl_float>* globals;
+//    CudaArray<mm_float4>* tabulatedFunctionParams;
+//    std::vector<std::string> globalParamNames;
+//    std::vector<cl_float> globalParamValues;
+//    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
+//    System& system;
+//};
+//
+///**
+// * This kernel is invoked by VerletIntegrator to take one time step.
+// */
+//class CudaIntegrateVerletStepKernel : public IntegrateVerletStepKernel {
+//public:
+//    CudaIntegrateVerletStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateVerletStepKernel(name, platform), cl(cl),
+//            hasInitializedKernels(false) {
+//    }
+//    ~CudaIntegrateVerletStepKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param integrator the VerletIntegrator this kernel will be used for
+//     */
+//    void initialize(const System& system, const VerletIntegrator& integrator);
+//    /**
+//     * Execute the kernel.
+//     *
+//     * @param context    the context in which to execute this kernel
+//     * @param integrator the VerletIntegrator this kernel is being used for
+//     */
+//    void execute(ContextImpl& context, const VerletIntegrator& integrator);
+//private:
+//    CudaContext& cl;
+//    double prevStepSize;
+//    bool hasInitializedKernels;
+//    cl::Kernel kernel1, kernel2;
+//};
+//
+///**
+// * This kernel is invoked by LangevinIntegrator to take one time step.
+// */
+//class CudaIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
+//public:
+//    CudaIntegrateLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateLangevinStepKernel(name, platform), cl(cl),
+//            hasInitializedKernels(false), params(NULL) {
+//    }
+//    ~CudaIntegrateLangevinStepKernel();
+//    /**
+//     * Initialize the kernel, setting up the particle masses.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param integrator the LangevinIntegrator this kernel will be used for
+//     */
+//    void initialize(const System& system, const LangevinIntegrator& integrator);
+//    /**
+//     * Execute the kernel.
+//     *
+//     * @param context    the context in which to execute this kernel
+//     * @param integrator the LangevinIntegrator this kernel is being used for
+//     */
+//    void execute(ContextImpl& context, const LangevinIntegrator& integrator);
+//private:
+//    CudaContext& cl;
+//    double prevTemp, prevFriction, prevStepSize;
+//    bool hasInitializedKernels;
+//    CudaArray<cl_float>* params;
+//    cl::Kernel kernel1, kernel2;
+//};
+//
+///**
+// * This kernel is invoked by BrownianIntegrator to take one time step.
+// */
+//class CudaIntegrateBrownianStepKernel : public IntegrateBrownianStepKernel {
+//public:
+//    CudaIntegrateBrownianStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateBrownianStepKernel(name, platform), cl(cl),
+//            hasInitializedKernels(false), prevTemp(-1), prevFriction(-1), prevStepSize(-1) {
+//    }
+//    ~CudaIntegrateBrownianStepKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param integrator the BrownianIntegrator this kernel will be used for
+//     */
+//    void initialize(const System& system, const BrownianIntegrator& integrator);
+//    /**
+//     * Execute the kernel.
+//     *
+//     * @param context    the context in which to execute this kernel
+//     * @param integrator the BrownianIntegrator this kernel is being used for
+//     */
+//    void execute(ContextImpl& context, const BrownianIntegrator& integrator);
+//private:
+//    CudaContext& cl;
+//    double prevTemp, prevFriction, prevStepSize;
+//    bool hasInitializedKernels;
+//    cl::Kernel kernel1, kernel2;
+//};
+//
+///**
+// * This kernel is invoked by VariableVerletIntegrator to take one time step.
+// */
+//class CudaIntegrateVariableVerletStepKernel : public IntegrateVariableVerletStepKernel {
+//public:
+//    CudaIntegrateVariableVerletStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateVariableVerletStepKernel(name, platform), cl(cl),
+//            hasInitializedKernels(false) {
+//    }
+//    ~CudaIntegrateVariableVerletStepKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param integrator the VerletIntegrator this kernel will be used for
+//     */
+//    void initialize(const System& system, const VariableVerletIntegrator& integrator);
+//    /**
+//     * Execute the kernel.
+//     *
+//     * @param context    the context in which to execute this kernel
+//     * @param integrator the VerletIntegrator this kernel is being used for
+//     * @param maxTime    the maximum time beyond which the simulation should not be advanced
+//     * @return the size of the step that was taken
+//     */
+//    double execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime);
+//private:
+//    CudaContext& cl;
+//    bool hasInitializedKernels;
+//    int blockSize;
+//    cl::Kernel kernel1, kernel2, selectSizeKernel;
+//};
+//
+///**
+// * This kernel is invoked by VariableLangevinIntegrator to take one time step.
+// */
+//class CudaIntegrateVariableLangevinStepKernel : public IntegrateVariableLangevinStepKernel {
+//public:
+//    CudaIntegrateVariableLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateVariableLangevinStepKernel(name, platform), cl(cl),
+//            hasInitializedKernels(false), params(NULL) {
+//    }
+//    ~CudaIntegrateVariableLangevinStepKernel();
+//    /**
+//     * Initialize the kernel, setting up the particle masses.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param integrator the VariableLangevinIntegrator this kernel will be used for
+//     */
+//    void initialize(const System& system, const VariableLangevinIntegrator& integrator);
+//    /**
+//     * Execute the kernel.
+//     *
+//     * @param context    the context in which to execute this kernel
+//     * @param integrator the VariableLangevinIntegrator this kernel is being used for
+//     * @param maxTime    the maximum time beyond which the simulation should not be advanced
+//     * @return the size of the step that was taken
+//     */
+//    double execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime);
+//private:
+//    CudaContext& cl;
+//    bool hasInitializedKernels;
+//    int blockSize;
+//    CudaArray<cl_float>* params;
+//    cl::Kernel kernel1, kernel2, selectSizeKernel;
+//    double prevTemp, prevFriction, prevErrorTol;
+//};
+//
+///**
+// * This kernel is invoked by CustomIntegrator to take one time step.
+// */
+//class CudaIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
+//public:
+//    CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateCustomStepKernel(name, platform), cl(cl),
+//            hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), contextParameterValues(NULL), sumBuffer(NULL), energy(NULL),
+//            uniformRandoms(NULL), randomSeed(NULL), perDofValues(NULL) {
+//    }
+//    ~CudaIntegrateCustomStepKernel();
+//    /**
+//     * Initialize the kernel.
+//     * 
+//     * @param system     the System this kernel will be applied to
+//     * @param integrator the CustomIntegrator this kernel will be used for
+//     */
+//    void initialize(const System& system, const CustomIntegrator& integrator);
+//    /**
+//     * Execute the kernel.
+//     * 
+//     * @param context    the context in which to execute this kernel
+//     * @param integrator the CustomIntegrator this kernel is being used for
+//     * @param forcesAreValid if the context has been modified since the last time step, this will be
+//     *                       false to show that cached forces are invalid and must be recalculated.
+//     *                       On exit, this should specify whether the cached forces are valid at the
+//     *                       end of the step.
+//     */
+//    void execute(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
+//    /**
+//     * Get the values of all global variables.
+//     *
+//     * @param context   the context in which to execute this kernel
+//     * @param values    on exit, this contains the values
+//     */
+//    void getGlobalVariables(ContextImpl& context, std::vector<double>& values) const;
+//    /**
+//     * Set the values of all global variables.
+//     *
+//     * @param context   the context in which to execute this kernel
+//     * @param values    a vector containing the values
+//     */
+//    void setGlobalVariables(ContextImpl& context, const std::vector<double>& values);
+//    /**
+//     * Get the values of a per-DOF variable.
+//     *
+//     * @param context   the context in which to execute this kernel
+//     * @param variable  the index of the variable to get
+//     * @param values    on exit, this contains the values
+//     */
+//    void getPerDofVariable(ContextImpl& context, int variable, std::vector<Vec3>& values) const;
+//    /**
+//     * Set the values of a per-DOF variable.
+//     *
+//     * @param context   the context in which to execute this kernel
+//     * @param variable  the index of the variable to get
+//     * @param values    a vector containing the values
+//     */
+//    void setPerDofVariable(ContextImpl& context, int variable, const std::vector<Vec3>& values);
+//private:
+//    class ReorderListener;
+//    std::string createGlobalComputation(const std::string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator, const std::string& energyName);
+//    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
+//    void recordChangedParameters(ContextImpl& context);
+//    CudaContext& cl;
+//    double prevStepSize;
+//    int numGlobalVariables;
+//    bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters;
+//    mutable bool localValuesAreCurrent;
+//    CudaArray<cl_float>* globalValues;
+//    CudaArray<cl_float>* contextParameterValues;
+//    CudaArray<cl_float>* sumBuffer;
+//    CudaArray<cl_float>* energy;
+//    CudaArray<mm_float4>* uniformRandoms;
+//    CudaArray<mm_int4>* randomSeed;
+//    CudaParameterSet* perDofValues;
+//    mutable std::vector<std::vector<cl_float> > localPerDofValues;
+//    std::vector<std::vector<cl::Kernel> > kernels;
+//    cl::Kernel sumEnergyKernel, randomKernel;
+//    std::vector<CustomIntegrator::ComputationType> stepType;
+//    std::vector<bool> needsForces;
+//    std::vector<bool> needsEnergy;
+//    std::vector<bool> invalidatesForces;
+//    std::vector<bool> merged;
+//    std::vector<int> forceGroup;
+//    std::vector<int> requiredGaussian;
+//    std::vector<int> requiredUniform;
+//    std::vector<std::string> parameterNames;
+//};
+//
+///**
+// * This kernel is invoked by AndersenThermostat at the start of each time step to adjust the particle velocities.
+// */
+//class CudaApplyAndersenThermostatKernel : public ApplyAndersenThermostatKernel {
+//public:
+//    CudaApplyAndersenThermostatKernel(std::string name, const Platform& platform, CudaContext& cl) : ApplyAndersenThermostatKernel(name, platform), cl(cl),
+//            hasInitializedKernels(false), atomGroups(NULL) {
+//    }
+//    ~CudaApplyAndersenThermostatKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param thermostat the AndersenThermostat this kernel will be used for
+//     */
+//    void initialize(const System& system, const AndersenThermostat& thermostat);
+//    /**
+//     * Execute the kernel.
+//     *
+//     * @param context    the context in which to execute this kernel
+//     */
+//    void execute(ContextImpl& context);
+//private:
+//    CudaContext& cl;
+//    bool hasInitializedKernels;
+//    int randomSeed;
+//    CudaArray<cl_int>* atomGroups;
+//    cl::Kernel kernel;
+//};
+//
+///**
+// * This kernel is invoked by MonteCarloBarostat to adjust the periodic box volume
+// */
+//class CudaApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
+//public:
+//    CudaApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, CudaContext& cl) : ApplyMonteCarloBarostatKernel(name, platform), cl(cl),
+//            hasInitializedKernels(false), savedPositions(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
+//    }
+//    ~CudaApplyMonteCarloBarostatKernel();
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param barostat   the MonteCarloBarostat this kernel will be used for
+//     */
+//    void initialize(const System& system, const MonteCarloBarostat& barostat);
+//    /**
+//     * Attempt a Monte Carlo step, scaling particle positions (or cluster centers) by a specified value.
+//     * This is called BEFORE the periodic box size is modified.  It should begin by translating each particle
+//     * or cluster into the first periodic box, so that coordinates will still be correct after the box size
+//     * is changed.
+//     *
+//     * @param context    the context in which to execute this kernel
+//     * @param scale      the scale factor by which to multiply particle positions
+//     */
+//    void scaleCoordinates(ContextImpl& context, double scale);
+//    /**
+//     * Reject the most recent Monte Carlo step, restoring the particle positions to where they were before
+//     * scaleCoordinates() was last called.
+//     *
+//     * @param context    the context in which to execute this kernel
+//     */
+//    void restoreCoordinates(ContextImpl& context);
+//private:
+//    CudaContext& cl;
+//    bool hasInitializedKernels;
+//    int numMolecules;
+//    CudaArray<mm_float4>* savedPositions;
+//    CudaArray<cl_int>* moleculeAtoms;
+//    CudaArray<cl_int>* moleculeStartIndex;
+//    cl::Kernel kernel;
+//};
+//
+///**
+// * This kernel is invoked to calculate the kinetic energy of the system.
+// */
+//class CudaCalcKineticEnergyKernel : public CalcKineticEnergyKernel {
+//public:
+//    CudaCalcKineticEnergyKernel(std::string name, const Platform& platform, CudaContext& cl) : CalcKineticEnergyKernel(name, platform), cl(cl) {
+//    }
+//    /**
+//     * Initialize the kernel.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     */
+//    void initialize(const System& system);
+//    /**
+//     * Execute the kernel.
+//     *
+//     * @param context    the context in which to execute this kernel
+//     */
+//    double execute(ContextImpl& context);
+//private:
+//    CudaContext& cl;
+//    std::vector<double> masses;
+//};
+//
+///**
+// * This kernel is invoked to remove center of mass motion from the system.
+// */
+//class CudaRemoveCMMotionKernel : public RemoveCMMotionKernel {
+//public:
+//    CudaRemoveCMMotionKernel(std::string name, const Platform& platform, CudaContext& cl) : RemoveCMMotionKernel(name, platform), cl(cl), cmMomentum(NULL) {
+//    }
+//    ~CudaRemoveCMMotionKernel();
+//    /**
+//     * Initialize the kernel, setting up the particle masses.
+//     *
+//     * @param system     the System this kernel will be applied to
+//     * @param force      the CMMotionRemover this kernel will be used for
+//     */
+//    void initialize(const System& system, const CMMotionRemover& force);
+//    /**
+//     * Execute the kernel.
+//     *
+//     * @param context    the context in which to execute this kernel
+//     */
+//    void execute(ContextImpl& context);
+//private:
+//    CudaContext& cl;
+//    int frequency;
+//    CudaArray<mm_float4>* cmMomentum;
+//    cl::Kernel kernel1, kernel2;
+//};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAKERNELS_H_*/
--- a/platforms/cuda2/src/CudaPlatform.cpp
+++ b/platforms/cuda2/src/CudaPlatform.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaContext.h"
+#include "CudaExpressionUtilities.h"
+#include "CudaPlatform.h"
+#include "CudaKernelFactory.h"
+#include "CudaKernels.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/Context.h"
+#include "openmm/System.h"
+#include <algorithm>
+#include <cctype>
+#include <sstream>
+using namespace OpenMM;
+using namespace std;
+extern "C" OPENMM_EXPORT void registerPlatforms() {
+    Platform::registerPlatform(new CudaPlatform());
+}
+CudaPlatform::CudaPlatform() {
+    CudaKernelFactory* factory = new CudaKernelFactory();
+    registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
+    registerKernelFactory(UpdateStateDataKernel::Name(), factory);
+    registerKernelFactory(ApplyConstraintsKernel::Name(), factory);
+    registerKernelFactory(VirtualSitesKernel::Name(), factory);
+    registerKernelFactory(CalcHarmonicBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomAngleForceKernel::Name(), factory);
+    registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcRBTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcCMAPTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomNonbondedForceKernel::Name(), factory);
+    registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomGBForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomExternalForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
+    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
+    registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
+    registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
+    registerKernelFactory(CalcKineticEnergyKernel::Name(), factory);
+    registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
+    platformProperties.push_back(CudaDeviceIndex());
+    platformProperties.push_back(CudaUseBlockingSync());
+    platformProperties.push_back(CudaPrecision());
+    platformProperties.push_back(CudaCompiler());
+    platformProperties.push_back(CudaTempDirectory());
+    setPropertyDefaultValue(CudaDeviceIndex(), "");
+    setPropertyDefaultValue(CudaUseBlockingSync(), "true");
+    setPropertyDefaultValue(CudaPrecision(), "single");
+#ifdef _MSC_VER
+    setPropertyDefaultValue(CudaCompiler(), "nvcc");
+    setPropertyDefaultValue(CudaTempDirectory(), string(getenv("TEMP")));
+#else
+    setPropertyDefaultValue(CudaCompiler(), "/usr/local/cuda/bin/nvcc");
+    setPropertyDefaultValue(CudaTempDirectory(), string(getenv("TMPDIR")));
+#endif
+}
+bool CudaPlatform::supportsDoublePrecision() const {
+    return false;
+}
+const string& CudaPlatform::getPropertyValue(const Context& context, const string& property) const {
+    const ContextImpl& impl = getContextImpl(context);
+    const PlatformData* data = reinterpret_cast<const PlatformData*>(impl.getPlatformData());
+    map<string, string>::const_iterator value = data->propertyValues.find(property);
+    if (value != data->propertyValues.end())
+        return value->second;
+    return Platform::getPropertyValue(context, property);
+}
+void CudaPlatform::setPropertyValue(Context& context, const string& property, const string& value) const {
+}
+void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
+    const string& devicePropValue = (properties.find(CudaDeviceIndex()) == properties.end() ?
+            getPropertyDefaultValue(CudaDeviceIndex()) : properties.find(CudaDeviceIndex())->second);
+    string blockingPropValue = (properties.find(CudaUseBlockingSync()) == properties.end() ?
+            getPropertyDefaultValue(CudaUseBlockingSync()) : properties.find(CudaUseBlockingSync())->second);
+    string precisionPropValue = (properties.find(CudaPrecision()) == properties.end() ?
+            getPropertyDefaultValue(CudaPrecision()) : properties.find(CudaPrecision())->second);
+    const string& compilerPropValue = (properties.find(CudaCompiler()) == properties.end() ?
+            getPropertyDefaultValue(CudaCompiler()) : properties.find(CudaCompiler())->second);
+    const string& tempPropValue = (properties.find(CudaTempDirectory()) == properties.end() ?
+            getPropertyDefaultValue(CudaTempDirectory()) : properties.find(CudaTempDirectory())->second);
+    transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
+    transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
+    context.setPlatformData(new PlatformData(context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, compilerPropValue, tempPropValue));
+}
+void CudaPlatform::contextDestroyed(ContextImpl& context) const {
+    PlatformData* data = reinterpret_cast<PlatformData*>(context.getPlatformData());
+    delete data;
+}
+CudaPlatform::PlatformData::PlatformData(const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
+            const string& compilerProperty, const string& tempProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
+    bool blocking = (blockingProperty == "true");
+    vector<string> devices;
+    size_t searchPos = 0, nextPos;
+    while ((nextPos = deviceIndexProperty.find_first_of(", ", searchPos)) != string::npos) {
+        devices.push_back(deviceIndexProperty.substr(searchPos, nextPos-searchPos));
+        searchPos = nextPos+1;
+    }
+    devices.push_back(deviceIndexProperty.substr(searchPos));
+    for (int i = 0; i < (int) devices.size(); i++) {
+        if (devices[i].length() > 0) {
+            unsigned int deviceIndex;
+            stringstream(devices[i]) >> deviceIndex;
+            contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, *this));
+        }
+    }
+    if (contexts.size() == 0)
+        contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, *this));
+    stringstream device;
+    for (int i = 0; i < (int) contexts.size(); i++) {
+        if (i > 0)
+            device << ',';
+        device << contexts[i]->getDeviceIndex();
+    }
+    propertyValues[CudaPlatform::CudaDeviceIndex()] = device.str();
+    propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty;
+    propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
+    propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
+    contextEnergy.resize(contexts.size());
+}
+CudaPlatform::PlatformData::~PlatformData() {
+    for (int i = 0; i < (int) contexts.size(); i++)
+        delete contexts[i];
+}
+void CudaPlatform::PlatformData::initializeContexts(const System& system) {
+//    for (int i = 0; i < (int) contexts.size(); i++)
+//        contexts[i]->initialize();
+}
+void CudaPlatform::PlatformData::syncContexts() {
+//    for (int i = 0; i < (int) contexts.size(); i++)
+//        contexts[i]->getWorkThread().flush();
+}
--- a/platforms/cuda2/src/kernels/utilities.cu
+++ b/platforms/cuda2/src/kernels/utilities.cu
+/**
+ * This is called by the various functions below to clear a buffer.
+ */
+__device__ void clearSingleBuffer(int* __restrict__ buffer, int size) {
+    int index = blockDim.x*blockIdx.x+threadIdx.x;
+    int4* buffer4 = (int4*) buffer;
+    int sizeDiv4 = size/4;
+    while (index < sizeDiv4) {
+        buffer4[index] = make_int4(0);
+        index += blockDim.x*gridDim.x;
+    }
+    if (blockDim.x*blockIdx.x+threadIdx.x == 0)
+        for (int i = sizeDiv4*4; i < size; i++)
+            buffer[i] = 0;
+}
+/**
+ * Fill a buffer with 0.
+ */
+__global__ void clearBuffer(int* __restrict__ buffer, int size) {
+    clearSingleBuffer(buffer, size);
+}
+/**
+ * Fill two buffers with 0.
+ */
+__global__ void clearTwoBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+}
+/**
+ * Fill three buffers with 0.
+ */
+__global__ void clearThreeBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+}
+/**
+ * Fill four buffers with 0.
+ */
+__global__ void clearFourBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+    clearSingleBuffer(buffer4, size4);
+}
+/**
+ * Fill five buffers with 0.
+ */
+__global__ void clearFiveBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4, int* __restrict__ buffer5, int size5) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+    clearSingleBuffer(buffer4, size4);
+    clearSingleBuffer(buffer5, size5);
+}
+/**
+ * Fill six buffers with 0.
+ */
+__global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4, int* __restrict__ buffer5, int size5, int* __restrict__ buffer6, int size6) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+    clearSingleBuffer(buffer4, size4);
+    clearSingleBuffer(buffer5, size5);
+    clearSingleBuffer(buffer6, size6);
+}
+/**
+ * Sum a collection of buffers into the first one.
+ */
+__global__ void reduceFloat4Buffer(float4* __restrict__ buffer, int bufferSize, int numBuffers) {
+    int index = blockDim.x*blockIdx.x+threadIdx.x;
+    int totalSize = bufferSize*numBuffers;
+    while (index < bufferSize) {
+        float4 sum = buffer[index];
+        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
+            sum += buffer[i];
+        buffer[index] = sum;
+        index += blockDim.x*gridDim.x;
+    }
+}
+/**
+ * Sum the various buffers containing forces.
+ */
+__global__ void reduceForces(const long* __restrict__ longBuffer, float4* __restrict__ buffer, int bufferSize, int numBuffers) {
+    int totalSize = bufferSize*numBuffers;
+    float scale = 1.0f/(float) 0xFFFFFFFF;
+    for (int index = blockDim.x*blockIdx.x+threadIdx.x; index < bufferSize; index += blockDim.x*gridDim.x) {
+        float4 sum = make_float4(scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0.0f);
+        for (int i = index; i < totalSize; i += bufferSize)
+            sum += buffer[i];
+        buffer[index] = sum;
+    }
+}
--- a/platforms/cuda2/src/kernels/vectorOps.cu
+++ b/platforms/cuda2/src/kernels/vectorOps.cu
+/**
+ * This file defines vector operations to simplify code elsewhere.
+ */
+// Versions of make_x() that take a single value and set all components to that.
+inline __device__ int2 make_int2(int a) {
+    return make_int2(a, a);
+}
+inline __device__ int3 make_int3(int a) {
+    return make_int3(a, a, a);
+}
+inline __device__ int4 make_int4(int a) {
+    return make_int4(a, a, a, a);
+}
+inline __device__ float2 make_float2(float a) {
+    return make_float2(a, a);
+}
+inline __device__ float3 make_float3(float a) {
+    return make_float3(a, a, a);
+}
+inline __device__ float4 make_float4(float a) {
+    return make_float4(a, a, a, a);
+}
+inline __device__ double2 make_double2(double a) {
+    return make_double2(a, a);
+}
+inline __device__ double3 make_double3(double a) {
+    return make_double3(a, a, a);
+}
+inline __device__ double4 make_double4(double a) {
+    return make_double4(a, a, a, a);
+}
+// Negate a vector.
+inline __device__ int2 operator*(int2 a) {
+    return make_int2(-a.x, -a.y);
+}
+inline __device__ int3 operator-(int3 a) {
+    return make_int3(-a.x, -a.y, -a.z);
+}
+inline __device__ int4 operator-(int4 a) {
+    return make_int4(-a.x, -a.y, -a.z, -a.w);
+}
+inline __device__ float2 operator-(float2 a) {
+    return make_float2(-a.x, -a.y);
+}
+inline __device__ float3 operator-(float3 a) {
+    return make_float3(-a.x, -a.y, -a.z);
+}
+inline __device__ float4 operator-(float4 a) {
+    return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+inline __device__ double2 operator-(double2 a) {
+    return make_double2(-a.x, -a.y);
+}
+inline __device__ double3 operator-(double3 a) {
+    return make_double3(-a.x, -a.y, -a.z);
+}
+inline __device__ double4 operator-(double4 a) {
+    return make_double4(-a.x, -a.y, -a.z, -a.w);
+}
+// Add two vectors.
+inline __device__ int2 operator+(int2 a, int2 b) {
+    return make_int2(a.x+b.x, a.y+b.y);
+}
+inline __device__ int3 operator+(int3 a, int3 b) {
+    return make_int3(a.x+b.x, a.y+b.y, a.z+b.z);
+}
+inline __device__ int4 operator+(int4 a, int4 b) {
+    return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+}
+inline __device__ float2 operator+(float2 a, float2 b) {
+    return make_float2(a.x+b.x, a.y+b.y);
+}
+inline __device__ float3 operator+(float3 a, float3 b) {
+    return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
+}
+inline __device__ float4 operator+(float4 a, float4 b) {
+    return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+}
+inline __device__ double2 operator+(double2 a, double2 b) {
+    return make_double2(a.x+b.x, a.y+b.y);
+}
+inline __device__ double3 operator+(double3 a, double3 b) {
+    return make_double3(a.x+b.x, a.y+b.y, a.z+b.z);
+}
+inline __device__ double4 operator+(double4 a, double4 b) {
+    return make_double4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+}
+// Subtract two vectors.
+inline __device__ int2 operator-(int2 a, int2 b) {
+    return make_int2(a.x-b.x, a.y-b.y);
+}
+inline __device__ int3 operator-(int3 a, int3 b) {
+    return make_int3(a.x-b.x, a.y-b.y, a.z-b.z);
+}
+inline __device__ int4 operator-(int4 a, int4 b) {
+    return make_int4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+}
+inline __device__ float2 operator-(float2 a, float2 b) {
+    return make_float2(a.x-b.x, a.y-b.y);
+}
+inline __device__ float3 operator-(float3 a, float3 b) {
+    return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
+}
+inline __device__ float4 operator-(float4 a, float4 b) {
+    return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+}
+inline __device__ double2 operator-(double2 a, double2 b) {
+    return make_double2(a.x-b.x, a.y-b.y);
+}
+inline __device__ double3 operator-(double3 a, double3 b) {
+    return make_double3(a.x-b.x, a.y-b.y, a.z-b.z);
+}
+inline __device__ double4 operator-(double4 a, double4 b) {
+    return make_double4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+}
+// Multiply two vectors.
+inline __device__ int2 operator*(int2 a, int2 b) {
+    return make_int2(a.x*b.x, a.y*b.y);
+}
+inline __device__ int3 operator*(int3 a, int3 b) {
+    return make_int3(a.x*b.x, a.y*b.y, a.z*b.z);
+}
+inline __device__ int4 operator*(int4 a, int4 b) {
+    return make_int4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+}
+inline __device__ float2 operator*(float2 a, float2 b) {
+    return make_float2(a.x*b.x, a.y*b.y);
+}
+inline __device__ float3 operator*(float3 a, float3 b) {
+    return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+}
+inline __device__ float4 operator*(float4 a, float4 b) {
+    return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+}
+inline __device__ double2 operator*(double2 a, double2 b) {
+    return make_double2(a.x*b.x, a.y*b.y);
+}
+inline __device__ double3 operator*(double3 a, double3 b) {
+    return make_double3(a.x*b.x, a.y*b.y, a.z*b.z);
+}
+inline __device__ double4 operator*(double4 a, double4 b) {
+    return make_double4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+}
+// Divide two vectors.
+inline __device__ int2 operator/(int2 a, int2 b) {
+    return make_int2(a.x/b.x, a.y/b.y);
+}
+inline __device__ int3 operator/(int3 a, int3 b) {
+    return make_int3(a.x/b.x, a.y/b.y, a.z/b.z);
+}
+inline __device__ int4 operator/(int4 a, int4 b) {
+    return make_int4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+}
+inline __device__ float2 operator/(float2 a, float2 b) {
+    return make_float2(a.x/b.x, a.y/b.y);
+}
+inline __device__ float3 operator/(float3 a, float3 b) {
+    return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
+}
+inline __device__ float4 operator/(float4 a, float4 b) {
+    return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+}
+inline __device__ double2 operator/(double2 a, double2 b) {
+    return make_double2(a.x/b.x, a.y/b.y);
+}
+inline __device__ double3 operator/(double3 a, double3 b) {
+    return make_double3(a.x/b.x, a.y/b.y, a.z/b.z);
+}
+inline __device__ double4 operator/(double4 a, double4 b) {
+    return make_double4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+}
+// += operator
+inline __device__ void operator+=(int2& a, int2 b) {
+    a.x += b.x; a.y += b.y;
+}
+inline __device__ void operator+=(int3& a, int3 b) {
+    a.x += b.x; a.y += b.y; a.z += b.z;
+}
+inline __device__ void operator+=(int4& a, int4 b) {
+    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
+}
+inline __device__ void operator+=(float2& a, float2 b) {
+    a.x += b.x; a.y += b.y;
+}
+inline __device__ void operator+=(float3& a, float3 b) {
+    a.x += b.x; a.y += b.y; a.z += b.z;
+}
+inline __device__ void operator+=(float4& a, float4 b) {
+    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
+}
+inline __device__ void operator+=(double2& a, double2 b) {
+    a.x += b.x; a.y += b.y;
+}
+inline __device__ void operator+=(double3& a, double3 b) {
+    a.x += b.x; a.y += b.y; a.z += b.z;
+}
+inline __device__ void operator+=(double4& a, double4 b) {
+    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
+}
+// -= operator
+inline __device__ void operator-=(int2& a, int2 b) {
+    a.x -= b.x; a.y -= b.y;
+}
+inline __device__ void operator-=(int3& a, int3 b) {
+    a.x -= b.x; a.y -= b.y; a.z -= b.z;
+}
+inline __device__ void operator-=(int4& a, int4 b) {
+    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
+}
+inline __device__ void operator-=(float2& a, float2 b) {
+    a.x -= b.x; a.y -= b.y;
+}
+inline __device__ void operator-=(float3& a, float3 b) {
+    a.x -= b.x; a.y -= b.y; a.z -= b.z;
+}
+inline __device__ void operator-=(float4& a, float4 b) {
+    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
+}
+inline __device__ void operator-=(double2& a, double2 b) {
+    a.x -= b.x; a.y -= b.y;
+}
+inline __device__ void operator-=(double3& a, double3 b) {
+    a.x -= b.x; a.y -= b.y; a.z -= b.z;
+}
+inline __device__ void operator-=(double4& a, double4 b) {
+    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
+}
+// *= operator
+inline __device__ void operator*=(int2& a, int2 b) {
+    a.x *= b.x; a.y *= b.y;
+}
+inline __device__ void operator*=(int3& a, int3 b) {
+    a.x *= b.x; a.y *= b.y; a.z *= b.z;
+}
+inline __device__ void operator*=(int4& a, int4 b) {
+    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
+}
+inline __device__ void operator*=(float2& a, float2 b) {
+    a.x *= b.x; a.y *= b.y;
+}
+inline __device__ void operator*=(float3& a, float3 b) {
+    a.x *= b.x; a.y *= b.y; a.z *= b.z;
+}
+inline __device__ void operator*=(float4& a, float4 b) {
+    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
+}
+inline __device__ void operator*=(double2& a, double2 b) {
+    a.x *= b.x; a.y *= b.y;
+}
+inline __device__ void operator*=(double3& a, double3 b) {
+    a.x *= b.x; a.y *= b.y; a.z *= b.z;
+}
+inline __device__ void operator*=(double4& a, double4 b) {
+    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
+}
+// /= operator
+inline __device__ void operator/=(int2& a, int2 b) {
+    a.x /= b.x; a.y /= b.y;
+}
+inline __device__ void operator/=(int3& a, int3 b) {
+    a.x /= b.x; a.y /= b.y; a.z /= b.z;
+}
+inline __device__ void operator/=(int4& a, int4 b) {
+    a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
+}
+inline __device__ void operator/=(float2& a, float2 b) {
+    a.x /= b.x; a.y /= b.y;
+}
+inline __device__ void operator/=(float3& a, float3 b) {
+    a.x /= b.x; a.y /= b.y; a.z /= b.z;
+}
+inline __device__ void operator/=(float4& a, float4 b) {
+    a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
+}
+inline __device__ void operator/=(double2& a, double2 b) {
+    a.x /= b.x; a.y /= b.y;
+}
+inline __device__ void operator/=(double3& a, double3 b) {
+    a.x /= b.x; a.y /= b.y; a.z /= b.z;
+}
+inline __device__ void operator/=(double4& a, double4 b) {
+    a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
+}
+// Multiply a vector by a constant.
+inline __device__ int2 operator*(int2 a, int b) {
+    return make_int2(a.x*b, a.y*b);
+}
+inline __device__ int3 operator*(int3 a, int b) {
+    return make_int3(a.x*b, a.y*b, a.z*b);
+}
+inline __device__ int4 operator*(int4 a, int b) {
+    return make_int4(a.x*b, a.y*b, a.z*b, a.w*b);
+}
+inline __device__ int2 operator*(int a, int2 b) {
+    return make_int2(a*b.x, a*b.y);
+}
+inline __device__ int3 operator*(int a, int3 b) {
+    return make_int3(a*b.x, a*b.y, a*b.z);
+}
+inline __device__ int4 operator*(int a, int4 b) {
+    return make_int4(a*b.x, a*b.y, a*b.z, a*b.w);
+}
+inline __device__ float2 operator*(float2 a, float b) {
+    return make_float2(a.x*b, a.y*b);
+}
+inline __device__ float3 operator*(float3 a, float b) {
+    return make_float3(a.x*b, a.y*b, a.z*b);
+}
+inline __device__ float4 operator*(float4 a, float b) {
+    return make_float4(a.x*b, a.y*b, a.z*b, a.w*b);
+}
+inline __device__ float2 operator*(float a, float2 b) {
+    return make_float2(a*b.x, a*b.y);
+}
+inline __device__ float3 operator*(float a, float3 b) {
+    return make_float3(a*b.x, a*b.y, a*b.z);
+}
+inline __device__ float4 operator*(float a, float4 b) {
+    return make_float4(a*b.x, a*b.y, a*b.z, a*b.w);
+}
+inline __device__ double2 operator*(double2 a, double b) {
+    return make_double2(a.x*b, a.y*b);
+}
+inline __device__ double3 operator*(double3 a, double b) {
+    return make_double3(a.x*b, a.y*b, a.z*b);
+}
+inline __device__ double4 operator*(double4 a, double b) {
+    return make_double4(a.x*b, a.y*b, a.z*b, a.w*b);
+}
+inline __device__ double2 operator*(double a, double2 b) {
+    return make_double2(a*b.x, a*b.y);
+}
+inline __device__ double3 operator*(double a, double3 b) {
+    return make_double3(a*b.x, a*b.y, a*b.z);
+}
+inline __device__ double4 operator*(double a, double4 b) {
+    return make_double4(a*b.x, a*b.y, a*b.z, a*b.w);
+}