Very early beginnings of new CUDA platform

abb8cb4b · Peter Eastman · 18501459 · abb8cb4b · abb8cb4b · abb8cb4b
Commit abb8cb4b authored Jun 04, 2012 by Peter Eastman
20 changed files
--- a/platforms/cuda2/CMakeLists.txt
+++ b/platforms/cuda2/CMakeLists.txt
+#---------------------------------------------------
+# OpenMM CUDA Platform
+#
+# Creates OpenMM library, base name=OpenMMCUDA.
+# Default libraries are shared & optimized. Variants
+# are created for static (_static) and debug (_d).
+#
+# Windows:
+#   OpenMMCUDA[_d].dll
+#   OpenMMCUDA[_d].lib
+#   OpenMMCUDA_static[_d].lib
+# Unix:
+#   libOpenMMCUDA[_d].so
+#   libOpenMMCUDA_static[_d].a
+#----------------------------------------------------
+# On Apple platforms, build against the 10.6 deployment target.  The SDK
+# location below is a hard-coded path that is not present on every machine,
+# so the sysroot is only overridden when that directory actually exists;
+# otherwise the sysroot CMake already selected is left untouched instead of
+# being pointed at a missing directory.
+IF (APPLE)
+    SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.6")
+    IF (EXISTS "/Developer/SDKs/MacOSX10.6.sdk")
+        SET (CMAKE_OSX_SYSROOT "/Developer/SDKs/MacOSX10.6.sdk")
+    ENDIF ()
+ENDIF (APPLE)
+# Cache switch (TRUE by default) that decides whether the "tests"
+# subdirectory is added to the build.
+SET (OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL "Whether to build CUDA test cases")
+IF (OPENMM_BUILD_CUDA_TESTS)
+    SUBDIRS (tests)
+ENDIF ()
+# Every source subdirectory is handled from this CMakeLists file instead of
+# being visited as its own SUBDIRS entry.  Only "." is listed here.
+SET (OPENMM_SOURCE_SUBDIRS .)
+# Base name of the library, and the shared/static target names built from it.
+SET (OPENMMCUDA_LIBRARY_NAME OpenMMCUDA)
+SET (SHARED_TARGET "${OPENMMCUDA_LIBRARY_NAME}")
+SET (STATIC_TARGET "${OPENMMCUDA_LIBRARY_NAME}_static")
+# Debug libraries get "_d" appended to their names.  With a Visual Studio
+# generator, defining CMAKE_DEBUG_POSTFIX is enough for CMake to do this.
+IF ("${CMAKE_GENERATOR}" MATCHES "Visual Studio")
+    SET (CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
+ENDIF ()
+# On Unix or Cygwin the suffix is appended to the target names by hand when
+# CMAKE_BUILD_TYPE matches Debug.
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET (SHARED_TARGET "${SHARED_TARGET}_d")
+    SET (STATIC_TARGET "${STATIC_TARGET}_d")
+ENDIF ()
+# Directories searched for the header files that make up the API: the
+# include/ and include/internal/ directory of every source subdirectory.
+SET (API_INCLUDE_DIRS) # start empty
+FOREACH (source_subdir ${OPENMM_SOURCE_SUBDIRS})
+    LIST (APPEND API_INCLUDE_DIRS
+          "${CMAKE_CURRENT_SOURCE_DIR}/${source_subdir}/include"
+          "${CMAKE_CURRENT_SOURCE_DIR}/${source_subdir}/include/internal")
+ENDFOREACH ()
+# Collect the *.h files found in those directories into two lists.
+# API_ABS_INCLUDE_FILES holds the paths exactly as returned by the glob.
+# API_REL_INCLUDE_FILES holds "<include dir>/<file name>"; because the
+# include directories above are built from CMAKE_CURRENT_SOURCE_DIR, these
+# entries are absolute paths as well.
+SET (API_REL_INCLUDE_FILES)   # start these out empty
+SET (API_ABS_INCLUDE_FILES)
+FOREACH (include_dir ${API_INCLUDE_DIRS})
+    FILE (GLOB fullpaths "${include_dir}/*.h")   # returns full pathnames
+    LIST (APPEND API_ABS_INCLUDE_FILES ${fullpaths})
+    FOREACH (header_path ${fullpaths})
+        GET_FILENAME_COMPONENT (filename "${header_path}" NAME)
+        LIST (APPEND API_REL_INCLUDE_FILES "${include_dir}/${filename}")
+    ENDFOREACH ()
+ENDFOREACH ()
+# Gather the implementation files (*.cpp and *.c, searched recursively) and
+# the private headers (*.h, top level of src/ only) of every source
+# subdirectory, and put each subdirectory's include/ directory at the front
+# of the include path.
+SET (SOURCE_FILES) # empty
+SET (SOURCE_INCLUDE_FILES)
+FOREACH (source_subdir ${OPENMM_SOURCE_SUBDIRS})
+    FILE (GLOB_RECURSE src_files
+          "${CMAKE_CURRENT_SOURCE_DIR}/${source_subdir}/src/*.cpp"
+          "${CMAKE_CURRENT_SOURCE_DIR}/${source_subdir}/src/*.c")
+    FILE (GLOB incl_files "${CMAKE_CURRENT_SOURCE_DIR}/${source_subdir}/src/*.h")
+    LIST (APPEND SOURCE_FILES ${src_files})
+    LIST (APPEND SOURCE_INCLUDE_FILES ${incl_files})
+    INCLUDE_DIRECTORIES (BEFORE "${CMAKE_CURRENT_SOURCE_DIR}/${source_subdir}/include")
+ENDFOREACH ()
+INCLUDE_DIRECTORIES (BEFORE "${CMAKE_CURRENT_SOURCE_DIR}/src")
+# Variables used when the kernel sources are encoded into a C++ class.  The
+# generated .cpp/.h pair lives in the binary tree and is added to the list
+# of sources; its directory is also put on the include path.
+SET (CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
+SET (CUDA_SOURCE_CLASS CudaKernelSources)
+SET (CUDA_KERNELS_CPP "${CMAKE_CURRENT_BINARY_DIR}/src/${CUDA_SOURCE_CLASS}.cpp")
+SET (CUDA_KERNELS_H "${CMAKE_CURRENT_BINARY_DIR}/src/${CUDA_SOURCE_CLASS}.h")
+LIST (APPEND SOURCE_FILES "${CUDA_KERNELS_CPP}" "${CUDA_KERNELS_H}")
+INCLUDE_DIRECTORIES (BEFORE "${CMAKE_CURRENT_BINARY_DIR}/src")
+# The library target itself is defined in the sharedTarget subdirectory.
+SUBDIRS (sharedTarget)
--- a/platforms/cuda2/EncodeCUDAFiles.cmake
+++ b/platforms/cuda2/EncodeCUDAFiles.cmake
+# Script that embeds every kernels/*.cu file as a string constant of the
+# class named by CUDA_SOURCE_CLASS.
+#
+# Expected input variables: CUDA_SOURCE_DIR, CUDA_SOURCE_CLASS,
+# CUDA_KERNELS_CPP and CUDA_KERNELS_H.
+#
+# The .cpp output is created from the <class>.cpp.in template and then has
+# one string definition appended per kernel file.  The .h output is created
+# from the <class>.h.in template after CUDA_FILE_DECLARATIONS has been built
+# (presumably the template substitutes that variable -- the template is not
+# visible here, so confirm).
+FILE(GLOB CUDA_KERNELS ${CUDA_SOURCE_DIR}/kernels/*.cu)
+# Both accumulators start empty.  Only CUDA_FILE_DECLARATIONS is extended
+# below; CUDA_FILE_DEFINITIONS is left empty by this script.
+SET(CUDA_FILE_DECLARATIONS)
+SET(CUDA_FILE_DEFINITIONS)
+# Write the initial .cpp file; the loop below appends to it.
+CONFIGURE_FILE(${CUDA_SOURCE_DIR}/${CUDA_SOURCE_CLASS}.cpp.in ${CUDA_KERNELS_CPP})
+FOREACH(file ${CUDA_KERNELS})
+    # Load the whole file as a single string (NEWLINE_CONSUME keeps the
+    # newlines in the value instead of splitting on them).
+    FILE(STRINGS ${file} file_content NEWLINE_CONSUME)
+    # Replace all backslashes by double backslashes as they are being put in a C string.
+    # Be careful not to replace the backslash before a semicolon as that is the CMake
+    # internal escaping of a semicolon to prevent it from acting as a list separator.
+    STRING(REGEX REPLACE "\\\\([^;])" "\\\\\\\\\\1" file_content "${file_content}")
+    # Escape double quotes as being put in a C string.
+    STRING(REPLACE "\"" "\\\"" file_content "${file_content}")
+    # Split in separate C strings for each line: every newline becomes a
+    # literal \n, a closing quote, a real newline and an opening quote.
+    STRING(REPLACE "\n" "\\n\"\n\"" file_content "${file_content}")
+    # Determine a name for the variable that will contain this file's contents:
+    # the file name relative to the kernels directory with its last three
+    # characters (the ".cu" extension matched by the glob) removed.
+    FILE(RELATIVE_PATH filename ${CUDA_SOURCE_DIR}/kernels ${file})
+    STRING(LENGTH ${filename} filename_length)
+    MATH(EXPR filename_length ${filename_length}-3)
+    STRING(SUBSTRING ${filename} 0 ${filename_length} variable_name)
+    # Record the variable declaration and definition.  The declaration text is
+    # accumulated in CUDA_FILE_DECLARATIONS; the definition is appended
+    # directly to the generated .cpp file.
+    SET(CUDA_FILE_DECLARATIONS ${CUDA_FILE_DECLARATIONS}static\ const\ std::string\ ${variable_name};\n)
+    FILE(APPEND ${CUDA_KERNELS_CPP} const\ string\ ${CUDA_SOURCE_CLASS}::${variable_name}\ =\ \"${file_content}\"\;\n)
+ENDFOREACH(file)
+# Generate the header once all declarations have been collected.
+CONFIGURE_FILE(${CUDA_SOURCE_DIR}/${CUDA_SOURCE_CLASS}.h.in ${CUDA_KERNELS_H})
--- a/platforms/cuda2/include/CudaKernelFactory.h
+++ b/platforms/cuda2/include/CudaKernelFactory.h
+#ifndef OPENMM_CUDAKERNELFACTORY_H_
+#define OPENMM_CUDAKERNELFACTORY_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/KernelFactory.h"
+namespace OpenMM {
+/**
+ * This KernelFactory creates all kernels for CudaPlatform.
+ */
+class CudaKernelFactory : public KernelFactory {
+public:
+    /**
+     * Create a KernelImpl.  Only the declaration is visible in this header; the
+     * definition is provided elsewhere.
+     *
+     * @param name      the name of the kernel to create
+     * @param platform  the Platform the kernel is created for
+     * @param context   the ContextImpl the kernel is created for
+     * @return a pointer to the new KernelImpl (NOTE(review): ownership of the returned
+     *         object is not visible here -- confirm against the KernelFactory contract)
+     */
+    KernelImpl* createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const;
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAKERNELFACTORY_H_*/
--- a/platforms/cuda2/include/CudaPlatform.h
+++ b/platforms/cuda2/include/CudaPlatform.h
+#ifndef OPENMM_CUDAPLATFORM_H_
+#define OPENMM_CUDAPLATFORM_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/Platform.h"
+#include "openmm/System.h"
+namespace OpenMM {
+class CudaContext;
+/**
+ * A Platform subclass whose kernels are implemented with CUDA.
+ */
+class OPENMM_EXPORT CudaPlatform : public Platform {
+public:
+    class PlatformData;
+    CudaPlatform();
+    /**
+     * Get the name of this platform, which is "CUDA".
+     */
+    const std::string& getName() const {
+        static const std::string platformName("CUDA");
+        return platformName;
+    }
+    /**
+     * Get the speed reported for this platform.  This implementation always returns 100.
+     */
+    double getSpeed() const {
+        return 100.0;
+    }
+    bool supportsDoublePrecision() const;
+    const std::string& getPropertyValue(const Context& context, const std::string& property) const;
+    void setPropertyValue(Context& context, const std::string& property, const std::string& value) const;
+    void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
+    void contextDestroyed(ContextImpl& context) const;
+    /**
+     * Name of the property that selects which CUDA device or devices to use.
+     */
+    static const std::string& CudaDeviceIndex() {
+        static const std::string propertyName("CudaDeviceIndex");
+        return propertyName;
+    }
+    /**
+     * Name of the property that selects whether CUDA should sync or spin loop while waiting for results.
+     */
+    static const std::string& CudaUseBlockingSync() {
+        static const std::string propertyName("CudaUseBlockingSync");
+        return propertyName;
+    }
+    /**
+     * Name of the property that selects what numerical precision to use.
+     */
+    static const std::string& CudaPrecision() {
+        static const std::string propertyName("CudaPrecision");
+        return propertyName;
+    }
+    /**
+     * Name of the property that specifies the path to the CUDA compiler.
+     */
+    static const std::string& CudaCompiler() {
+        static const std::string propertyName("CudaCompiler");
+        return propertyName;
+    }
+    /**
+     * Name of the property that specifies the path to the directory for creating temporary files.
+     */
+    static const std::string& CudaTempDirectory() {
+        static const std::string propertyName("CudaTempDirectory");
+        return propertyName;
+    }
+};
+/**
+ * Data the CUDA platform associates with a Context.  It holds the list of CudaContext
+ * objects together with a number of bookkeeping values.
+ */
+class OPENMM_EXPORT CudaPlatform::PlatformData {
+public:
+    /**
+     * Create a PlatformData.  The string arguments are the values of the platform
+     * properties; judging by their names they correspond to the CudaDeviceIndex,
+     * CudaUseBlockingSync, CudaPrecision, CudaCompiler and CudaTempDirectory
+     * properties -- TODO confirm against the implementation, which is not visible here.
+     */
+    PlatformData(const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
+            const std::string& compilerProperty, const std::string& tempProperty);
+    ~PlatformData();
+    void initializeContexts(const System& system);
+    void syncContexts();
+    // The CudaContext objects belonging to this PlatformData.
+    std::vector<CudaContext*> contexts;
+    // Presumably one energy value per entry of `contexts` -- verify against the implementation.
+    std::vector<double> contextEnergy;
+    // NOTE(review): removeCM / cmMotionFrequency look like center-of-mass motion removal settings -- confirm.
+    bool removeCM;
+    int cmMotionFrequency;
+    // Counters; what exactly increments them is not visible in this header.
+    int stepCount, computeForceCount;
+    // Presumably the current simulation time -- confirm.
+    double time;
+    // String-to-string map of property values (presumably keyed by property name -- confirm).
+    std::map<std::string, std::string> propertyValues;
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAPLATFORM_H_*/
--- a/platforms/cuda2/sharedTarget/CMakeLists.txt
+++ b/platforms/cuda2/sharedTarget/CMakeLists.txt
+#
+# Build the shared OpenMMCUDA library.
+#
+# Include CUDA related files.
+#
+INCLUDE(FindCUDA)
+INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
+FILE(GLOB CUDA_KERNELS ${CUDA_SOURCE_DIR}/kernels/*.cu)
+# Rule that (re)generates the kernel source class by running
+# EncodeCUDAFiles.cmake in script mode.  The generator script and the two
+# templates it configures are listed as dependencies next to the kernels, so
+# that editing any of them regenerates the outputs.
+ADD_CUSTOM_COMMAND(OUTPUT ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H}
+    COMMAND ${CMAKE_COMMAND}
+    ARGS -D CUDA_SOURCE_DIR=${CUDA_SOURCE_DIR} -D CUDA_KERNELS_CPP=${CUDA_KERNELS_CPP} -D CUDA_KERNELS_H=${CUDA_KERNELS_H} -D CUDA_SOURCE_CLASS=${CUDA_SOURCE_CLASS} -P ${CMAKE_CURRENT_SOURCE_DIR}/../EncodeCUDAFiles.cmake
+    DEPENDS ${CUDA_KERNELS}
+            ${CMAKE_CURRENT_SOURCE_DIR}/../EncodeCUDAFiles.cmake
+            ${CUDA_SOURCE_DIR}/${CUDA_SOURCE_CLASS}.cpp.in
+            ${CUDA_SOURCE_DIR}/${CUDA_SOURCE_CLASS}.h.in
+)
+SET_SOURCE_FILES_PROPERTIES(${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H} PROPERTIES GENERATED TRUE)
+ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
+# Pick the name of the main OpenMM library to link against; Unix debug
+# builds use the "_d" variant.
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME}_d)
+ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+# The sources call CUDA driver API functions (cuMemAlloc, cuMemcpyHtoD, ...),
+# which live in the driver library (CUDA_CUDA_LIBRARY) and not in the runtime
+# library listed in CUDA_LIBRARIES, so both are linked.
+TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${PTHREADS_LIB})
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY")
+INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/platforms/cuda2/src/CudaArray.cpp
+++ b/platforms/cuda2/src/CudaArray.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaArray.h"
+#include <iostream>
+#include <sstream>
+#include <vector>
+using namespace OpenMM;
+/**
+ * Allocate device memory for an array.
+ *
+ * @param size          the number of elements in the array
+ * @param elementSize   the size of each element in bytes
+ * @param name          the name of the array, used in error messages
+ * @throws OpenMMException if cuMemAlloc() does not return CUDA_SUCCESS
+ */
+CudaArray::CudaArray(int size, int elementSize, const std::string& name) :
+        size(size), elementSize(elementSize), name(name), ownsMemory(true) {
+    // Do the multiplication in size_t: multiplying two ints overflows for
+    // arrays of 2 GB or more before the value ever reaches cuMemAlloc().
+    const size_t numBytes = static_cast<size_t>(size)*static_cast<size_t>(elementSize);
+    CUresult result = cuMemAlloc(&pointer, numBytes);
+    if (result != CUDA_SUCCESS) {
+        std::stringstream str;
+        str<<"Error creating array "<<name<<": "<<result;
+        throw OpenMMException(str.str());
+    }
+}
+/**
+ * Release the device memory if this object owns it.
+ *
+ * A failure of cuMemFree() is reported on std::cerr instead of being thrown:
+ * an exception leaving a destructor terminates the program when it happens
+ * during stack unwinding (and always does so when the destructor is
+ * implicitly noexcept).
+ */
+CudaArray::~CudaArray() {
+    if (ownsMemory) {
+        CUresult result = cuMemFree(pointer);
+        if (result != CUDA_SUCCESS)
+            std::cerr<<"Error deleting array "<<name<<": "<<result<<std::endl;
+    }
+}
+/**
+ * Copy size*elementSize bytes from host memory to the device memory of this array.
+ *
+ * @param data      the host data to copy
+ * @param blocking  if true, cuMemcpyHtoD() is used; otherwise cuMemcpyHtoDAsync()
+ *                  is issued on stream 0
+ * @throws OpenMMException if the copy does not return CUDA_SUCCESS
+ */
+void CudaArray::upload(void* data, bool blocking) {
+    // Compute the byte count in size_t so that large arrays do not overflow int.
+    const size_t numBytes = static_cast<size_t>(size)*static_cast<size_t>(elementSize);
+    CUresult result;
+    if (blocking)
+        result = cuMemcpyHtoD(pointer, data, numBytes);
+    else
+        result = cuMemcpyHtoDAsync(pointer, data, numBytes, 0);
+    if (result != CUDA_SUCCESS) {
+        std::stringstream str;
+        str<<"Error uploading array "<<name<<": "<<result;
+        throw OpenMMException(str.str());
+    }
+}
+/**
+ * Copy size*elementSize bytes from the device memory of this array to host memory.
+ *
+ * @param data      the host buffer to copy into
+ * @param blocking  if true, cuMemcpyDtoH() is used; otherwise cuMemcpyDtoHAsync()
+ *                  is issued on stream 0
+ * @throws OpenMMException if the copy does not return CUDA_SUCCESS
+ */
+void CudaArray::download(void* data, bool blocking) const {
+    // Compute the byte count in size_t so that large arrays do not overflow int.
+    const size_t numBytes = static_cast<size_t>(size)*static_cast<size_t>(elementSize);
+    CUresult result;
+    if (blocking)
+        result = cuMemcpyDtoH(data, pointer, numBytes);
+    else
+        result = cuMemcpyDtoHAsync(data, pointer, numBytes, 0);
+    if (result != CUDA_SUCCESS) {
+        std::stringstream str;
+        str<<"Error downloading array "<<name<<": "<<result;
+        throw OpenMMException(str.str());
+    }
+}
--- a/platforms/cuda2/src/CudaArray.h
+++ b/platforms/cuda2/src/CudaArray.h
+#ifndef OPENMM_CUDAARRAY_H_
+#define OPENMM_CUDAARRAY_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/OpenMMException.h"
+#include <cuda.h>
+#include <iostream>
+#include <sstream>
+#include <vector>
+namespace OpenMM {
+/**
+ * A CudaArray wraps a block of CUDA device memory and offers a small API for
+ * querying it and for copying data between it and host memory.
+ */
+class CudaArray {
+public:
+    /**
+     * Allocate a CudaArray on the heap with the "new" operator.  The element size is
+     * taken from the template argument, which is the data type of each array element.
+     *
+     * @param size              the number of elements in the array
+     * @param name              the name of the array
+     */
+    template <class T>
+    static CudaArray* create(int size, const std::string& name) {
+        const int bytesPerElement = sizeof(T);
+        return new CudaArray(size, bytesPerElement, name);
+    }
+    /**
+     * Construct a CudaArray.
+     *
+     * @param size              the number of elements in the array
+     * @param elementSize       the size of each element in bytes
+     * @param name              the name of the array
+     */
+    CudaArray(int size, int elementSize, const std::string& name);
+    ~CudaArray();
+    /**
+     * Return how many elements the array holds.
+     */
+    int getSize() const {
+        return size;
+    }
+    /**
+     * Return the size in bytes of a single element.
+     */
+    int getElementSize() const {
+        return elementSize;
+    }
+    /**
+     * Return the array's name.
+     */
+    const std::string& getName() const {
+        return name;
+    }
+    /**
+     * Return the pointer to the device memory.
+     */
+    CUdeviceptr getDevicePointer() {
+        return pointer;
+    }
+    /**
+     * Copy the contents of a vector to the device memory (blocking).  The vector must
+     * have the same element size and the same number of elements as this array;
+     * otherwise an OpenMMException is thrown.
+     */
+    template <class T>
+    void upload(std::vector<T>& data) {
+        const bool wrongElementSize = (sizeof(T) != elementSize);
+        const bool wrongLength = (data.size() != size);
+        if (wrongElementSize || wrongLength)
+            throw OpenMMException("Error uploading array "+name+": The specified vector does not match the size of the array");
+        void* hostData = &data[0];
+        upload(hostData, true);
+    }
+    /**
+     * Copy the contents of the device memory to a vector (blocking).  The vector is
+     * resized to the length of this array if necessary; an OpenMMException is thrown
+     * if its element size differs from that of the array.
+     */
+    template <class T>
+    void download(std::vector<T>& data) const {
+        if (sizeof(T) != elementSize) {
+            throw OpenMMException("Error downloading array "+name+": The specified vector has the wrong element size");
+        }
+        if (data.size() != size) {
+            data.resize(size);
+        }
+        void* hostData = &data[0];
+        download(hostData, true);
+    }
+    /**
+     * Copy the values in an array to the device memory.
+     *
+     * @param data     the data to copy
+     * @param blocking if true, this call will block until the transfer is complete.  If false,
+     *                 the source array must be in page-locked memory.
+     */
+    void upload(void* data, bool blocking = true);
+    /**
+     * Copy the values in the device memory to an array.
+     *
+     * @param data     the array to copy the memory to
+     * @param blocking if true, this call will block until the transfer is complete.  If false,
+     *                 the destination array must be in page-locked memory.
+     */
+    void download(void* data, bool blocking = true) const;
+private:
+    CUdeviceptr pointer;
+    int size, elementSize;
+    bool ownsMemory;
+    std::string name;
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAARRAY_H_*/
--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
--- a/platforms/cuda2/src/CudaContext.h
+++ b/platforms/cuda2/src/CudaContext.h
+#ifndef OPENMM_CUDACONTEXT_H_
+#define OPENMM_CUDACONTEXT_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include <map>
+#include <queue>
+#include <string>
+#include <pthread.h>
+#define __CL_ENABLE_EXCEPTIONS
+#ifdef _MSC_VER
+    // Prevent Windows from defining macros that interfere with other code.
+    #define NOMINMAX
+#endif
+#include <cuda.h>
+#include <builtin_types.h>
+#include <vector_functions.h>
+#include "openmm/internal/windowsExport.h"
+#include "CudaPlatform.h"
+namespace OpenMM {
+class CudaArray;
+class CudaForceInfo;
+class CudaIntegrationUtilities;
+class CudaBondedUtilities;
+class CudaNonbondedUtilities;
+class System;
+/**
+ * This class contains the information associated with a Context by the CUDA Platform.  Each CudaContext is
+ * specific to a particular device, and manages data structures and kernels for that device.  When running a simulation
+ * in parallel on multiple devices, there is a separate CudaContext for each one.  The list of all contexts is
+ * stored in the CudaPlatform::PlatformData.
+ * <p>
+ * In addition, a worker thread is created for each CudaContext.  This is used for parallel computations, so that
+ * blocking calls to one device will not block other devices.  When only a single device is being used, the worker
+ * thread is not used and calculations are performed on the main application thread.
+ */
+class OPENMM_EXPORT CudaContext {
+public:
+    class WorkTask;
+    class WorkThread;
+    class ReorderListener;
+    static const int ThreadBlockSize;
+    static const int TileSize;
+    CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
+            const std::string& compiler, const std::string& tempDir, CudaPlatform::PlatformData& platformData);
+    ~CudaContext();
+//    /**
+//     * This is called to initialize internal data structures after all Forces in the system
+//     * have been initialized.
+//     */
+//    void initialize();
+    /**
+     * Add a CudaForce to this context.
+     */
+    void addForce(CudaForceInfo* force);
+    /**
+     * Get the CUcontext associated with this object.
+     */
+    CUcontext getContext() {
+        return context;
+    }
+    /**
+     * Get the CUdevice associated with this object.
+     */
+    CUdevice getDevice() {
+        return device;
+    }
+    /**
+     * Get the index of the CUdevice associated with this object.
+     */
+    int getDeviceIndex() {
+        return deviceIndex;
+    }
+    /**
+     * Get the PlatformData object this context is part of.
+     */
+    CudaPlatform::PlatformData& getPlatformData() {
+        return platformData;
+    }
+    /**
+     * Get the index of this context in the list stored in the PlatformData.
+     */
+    int getContextIndex() const {
+        return contextIndex;
+    }
+    /**
+     * Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
+     */
+    CudaArray& getPosq() {
+        return *posq;
+    }
+    /**
+     * Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
+     */
+    CudaArray& getVelm() {
+        return *velm;
+    }
+//    /**
+//     * Get the array which contains the force on each atom.
+//     */
+//    CudaArray<mm_float4>& getForce() {
+//        return *force;
+//    }
+//    /**
+//     * Get the array which contains the buffers in which forces are computed.
+//     */
+//    CudaArray<mm_float4>& getForceBuffers() {
+//        return *forceBuffers;
+//    }
+//    /**
+//     * Get the array which contains a contribution to each force represented as 64 bit fixed point.
+//     */
+//    CudaArray<cl_long>& getLongForceBuffer() {
+//        return *longForceBuffer;
+//    }
+//    /**
+//     * Get the array which contains the buffer in which energy is computed.
+//     */
+//    CudaArray<cl_float>& getEnergyBuffer() {
+//        return *energyBuffer;
+//    }
+//    /**
+//     * Get the array which contains the index of each atom.
+//     */
+//    CudaArray<cl_int>& getAtomIndex() {
+//        return *atomIndex;
+//    }
+//    /**
+//     * Get the number of cells by which the positions are offset.
+//     */
+//    std::vector<mm_int4>& getPosCellOffsets() {
+//        return posCellOffsets;
+//    }
+    /**
+     * Replace all occurrences of a list of substrings.
+     *
+     * @param input   a string to process
+     * @param replacements a set of strings that should be replaced with new strings wherever they appear in the input string
+     * @return a new string produced by performing the replacements
+     */
+    std::string replaceStrings(const std::string& input, const std::map<std::string, std::string>& replacements) const;
+    /**
+     * Create a CUDA module from source code.
+     *
+     * @param source             the source code of the module
+     * @param optimizationFlags  the optimization flags to pass to the CUDA compiler.  If this is
+     *                           omitted, a default set of options will be used
+     */
+    CUmodule createModule(const std::string source, const char* optimizationFlags = NULL);
+    /**
+     * Create a CUDA module from source code.
+     *
+     * @param source             the source code of the module
+     * @param defines            a set of preprocessor definitions (name, value) to define when compiling the program
+     * @param optimizationFlags  the optimization flags to pass to the CUDA compiler.  If this is
+     *                           omitted, a default set of options will be used
+     */
+    CUmodule createModule(const std::string source, const std::map<std::string, std::string>& defines, const char* optimizationFlags = NULL);
+//    /**
+//     * Execute a kernel.
+//     *
+//     * @param kernel       the kernel to execute
+//     * @param workUnits    the maximum number of work units that should be used
+//     * @param blockSize    the size of each thread block to use
+//     */
+//    void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
+//    /**
+//     * Set all elements of an array to 0.
+//     */
+//    void clearBuffer(CudaArray<float>& array);
+//    /**
+//     * Set all elements of an array to 0.
+//     */
+//    void clearBuffer(CudaArray<mm_float4>& array);
+//    /**
+//     * Set all elements of an array to 0.
+//     *
+//     * @param memory     the Memory to clear
+//     * @param size       the number of float elements in the buffer
+//     */
+//    void clearBuffer(cl::Memory& memory, int size);
+//    /**
+//     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
+//     *
+//     * @param memory     the Memory to clear
+//     * @param size       the number of float elements in the buffer
+//     */
+//    void addAutoclearBuffer(cl::Memory& memory, int size);
+//    /**
+//     * Clear all buffers that have been registered with addAutoclearBuffer().
+//     */
+//    void clearAutoclearBuffers();
+//    /**
+//     * Given a collection of buffers packed into an array, sum them and store
+//     * the sum in the first buffer.
+//     *
+//     * @param array       the array containing the buffers to reduce
+//     * @param numBuffers  the number of buffers packed into the array
+//     */
+//    void reduceBuffer(CudaArray<mm_float4>& array, int numBuffers);
+//    /**
+//     * Sum the buffers containing forces.
+//     */
+//    void reduceForces();
+//    /**
+//     * Get the current simulation time.
+//     */
+//    double getTime() {
+//        return time;
+//    }
+//    /**
+//     * Set the current simulation time.
+//     */
+//    void setTime(double t) {
+//        time = t;
+//    }
+//    /**
+//     * Get the number of integration steps that have been taken.
+//     */
+//    int getStepCount() {
+//        return stepCount;
+//    }
+//    /**
+//     * Set the number of integration steps that have been taken.
+//     */
+//    void setStepCount(int steps) {
+//        stepCount = steps;
+//    }
+//    /**
+//     * Get the number of times forces or energy has been computed.
+//     */
+//    int getComputeForceCount() {
+//        return computeForceCount;
+//    }
+//    /**
+//     * Set the number of times forces or energy has been computed.
+//     */
+//    void setComputeForceCount(int count) {
+//        computeForceCount = count;
+//    }
+//    /**
+//     * Get the number of atoms.
+//     */
+//    int getNumAtoms() const {
+//        return numAtoms;
+//    }
+//    /**
+//     * Get the number of atoms, rounded up to a multiple of TileSize.  This is the actual size of
+//     * most arrays with one element per atom.
+//     */
+//    int getPaddedNumAtoms() const {
+//        return paddedNumAtoms;
+//    }
+//    /**
+//     * Get the number of blocks of TileSize atoms.
+//     */
+//    int getNumAtomBlocks() const {
+//        return numAtomBlocks;
+//    }
+//    /**
+//     * Get the standard number of thread blocks to use when executing kernels.
+//     */
+//    int getNumThreadBlocks() const {
+//        return numThreadBlocks;
+//    }
+//    /**
+//     * Get the number of force buffers.
+//     */
+//    int getNumForceBuffers() const {
+//        return numForceBuffers;
+//    }
+//    /**
+//     * Get the SIMD width of the device being used.
+//     */
+//    int getSIMDWidth() const {
+//        return simdWidth;
+//    }
+//    /**
+//     * Get whether the device being used supports 64 bit atomic operations on global memory.
+//     */
+//    bool getSupports64BitGlobalAtomics() {
+//        return supports64BitGlobalAtomics;
+//    }
+//    /**
+//     * Get whether the device being used supports double precision math.
+//     */
+//    bool getSupportsDoublePrecision() {
+//        return supportsDoublePrecision;
+//    }
+//    /**
+//     * Get the size of the periodic box.
+//     */
+//    mm_float4 getPeriodicBoxSize() const {
+//        return periodicBoxSize;
+//    }
+//    /**
+//     * Set the size of the periodic box.
+//     */
+//    void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
+//        periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0);
+//        invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
+//    }
+//    /**
+//     * Get the inverse of the size of the periodic box.
+//     */
+//    mm_float4 getInvPeriodicBoxSize() const {
+//        return invPeriodicBoxSize;
+//    }
+//    /**
+//     * Get the CudaIntegrationUtilities for this context.
+//     */
+//    CudaIntegrationUtilities& getIntegrationUtilities() {
+//        return *integration;
+//    }
+//    /**
+//     * Get the CudaBondedUtilities for this context.
+//     */
+//    CudaBondedUtilities& getBondedUtilities() {
+//        return *bonded;
+//    }
+//    /**
+//     * Get the CudaNonbondedUtilities for this context.
+//     */
+//    CudaNonbondedUtilities& getNonbondedUtilities() {
+//        return *nonbonded;
+//    }
+//    /**
+//     * Get the thread used by this context for executing parallel computations.
+//     */
+//    WorkThread& getWorkThread() {
+//        return *thread;
+//    }
+//    /**
+//     * Get whether atoms were reordered during the most recent force/energy computation.
+//     */
+//    bool getAtomsWereReordered() const {
+//        return atomsWereReordered;
+//    }
+//    /**
+//     * Set whether atoms were reordered during the most recent force/energy computation.
+//     */
+//    void setAtomsWereReordered(bool wereReordered) {
+//        atomsWereReordered = wereReordered;
+//    }
+//    /**
+//     * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
+//     * together in the arrays.
+//     * 
+//     * @param enforcePeriodic    if true, the atom positions may be altered to enforce periodic boundary conditions
+//     */
+//    void reorderAtoms(bool enforcePeriodic);
+//    /**
+//     * Add a listener that should be called whenever atoms get reordered.  The CudaContext
+//     * assumes ownership of the object, and deletes it when the context itself is deleted.
+//     */
+//    void addReorderListener(ReorderListener* listener);
+//    /**
+//     * Get the list of ReorderListeners.
+//     */
+//    std::vector<ReorderListener*>& getReorderListeners() {
+//        return reorderListeners;
+//    }
+//    /**
+//     * Mark that the current molecule definitions (and hence the atom order) may be invalid.
+//     * This should be called whenever force field parameters change.  It will cause the definitions
+//     * and order to be revalidated the next time reorderAtoms() is called.
+//     */
+//    void invalidateMolecules();
+//    /**
+//     * Get whether the current molecule definitions are valid.
+//     */
+//    bool getMoleculesAreInvalid() {
+//        return moleculesInvalid;
+//    }
+private:
+    struct Molecule;
+    struct MoleculeGroup;
+    class VirtualSiteInfo;
+//    void findMoleculeGroups();
+//    static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
+//    /**
+//     * Ensure that all molecules marked as "identical" really are identical.  This should be
+//     * called whenever force field parameters change.  If necessary, it will rebuild the list
+//     * of molecules and resort the atoms.
+//     */
+//    void validateMolecules();
+    static bool hasInitializedCuda;
+    const System& system;
+    double time;
+    CudaPlatform::PlatformData& platformData;
+    int deviceIndex;
+    int contextIndex;
+    int stepCount;
+    int computeForceCount;
+    int numAtoms;
+    int paddedNumAtoms;
+    int numAtomBlocks;
+    int numThreadBlocks;
+//    int numForceBuffers;
+//    int simdWidth;
+    bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
+    std::string compiler, tempDir, gpuArchitecture;
+    float4 periodicBoxSize;
+    float4 invPeriodicBoxSize;
+    std::string defaultOptimizationOptions;
+    std::map<std::string, std::string> compilationDefines;
+    CUcontext context;
+    CUdevice device;
+    CUfunction clearBufferKernel;
+    CUfunction clearTwoBuffersKernel;
+    CUfunction clearThreeBuffersKernel;
+    CUfunction clearFourBuffersKernel;
+    CUfunction clearFiveBuffersKernel;
+    CUfunction clearSixBuffersKernel;
+    CUfunction reduceFloat4Kernel;
+    CUfunction reduceForcesKernel;
+    std::vector<CudaForceInfo*> forces;
+    std::vector<Molecule> molecules;
+    std::vector<MoleculeGroup> moleculeGroups;
+    std::vector<int4> posCellOffsets;
+    CudaArray* posq;
+    CudaArray* velm;
+//    CudaArray<mm_float4>* force;
+//    CudaArray<mm_float4>* forceBuffers;
+//    CudaArray<cl_long>* longForceBuffer;
+//    CudaArray<cl_float>* energyBuffer;
+//    CudaArray<cl_int>* atomIndex;
+//    std::vector<cl::Memory*> autoclearBuffers;
+//    std::vector<int> autoclearBufferSizes;
+    std::vector<ReorderListener*> reorderListeners;
+//    CudaIntegrationUtilities* integration;
+//    CudaBondedUtilities* bonded;
+//    CudaNonbondedUtilities* nonbonded;
+    WorkThread* thread;
+};
+/**
+ * This struct is the element type of the CudaContext's private list of molecules.
+ *
+ * NOTE(review): the members presumably hold indices identifying the atoms and constraints
+ * that make up one molecule, plus one list per force of the force terms ("groups") it
+ * contains -- confirm against the code that populates them.
+ */
+struct CudaContext::Molecule {
+    std::vector<int> atoms;
+    std::vector<int> constraints;
+    std::vector<std::vector<int> > groups;
+};
+/**
+ * This struct is the element type of the CudaContext's private list of molecule groups.
+ *
+ * NOTE(review): presumably describes a set of identical molecules: the atoms of a
+ * representative molecule, which molecules are instances of it, and a per-instance
+ * atom index offset -- confirm against the code that populates them.
+ */
+struct CudaContext::MoleculeGroup {
+    std::vector<int> atoms;
+    std::vector<int> instances;
+    std::vector<int> offsets;
+};
+/**
+ * This abstract class defines a task to be executed on the worker thread.
+ * Subclasses implement execute(); tasks are handed to the worker thread
+ * through WorkThread::addTask().
+ */
+class CudaContext::WorkTask {
+public:
+    /**
+     * Perform the work of this task.
+     */
+    virtual void execute() = 0;
+    // Virtual so that a task can be deleted through a WorkTask pointer.
+    virtual ~WorkTask() {
+    }
+};
+/**
+ * This class manages a worker thread that executes WorkTask objects passed to addTask().
+ */
+class CudaContext::WorkThread {
+public:
+    struct ThreadData;
+    WorkThread();
+    ~WorkThread();
+    /**
+     * Request that a task be executed on the worker thread.  The argument should have been allocated on the
+     * heap with the "new" operator.  After its execute() method finishes, the object will be deleted automatically.
+     */
+    void addTask(CudaContext::WorkTask* task);
+    /**
+     * Get whether the worker thread is idle, waiting for a task to be added.
+     */
+    bool isWaiting();
+    /**
+     * Get whether the worker thread has exited.
+     */
+    bool isFinished();
+    /**
+     * Block until all tasks have finished executing and the worker thread is idle.
+     */
+    void flush();
+private:
+    // Tasks that have been added with addTask().
+    std::queue<CudaContext::WorkTask*> tasks;
+    // State reported by isWaiting() and isFinished().
+    bool waiting, finished;
+    // NOTE(review): presumably queueLock guards the queue and flags above, and the two
+    // conditions signal "a task was added" and "the queue has drained" -- confirm in the .cpp.
+    pthread_mutex_t queueLock;
+    pthread_cond_t waitForTaskCondition, queueEmptyCondition;
+    pthread_t thread;
+};
+/**
+ * This abstract class defines a function to be executed whenever atoms get reordered.
+ * Objects that need to know when reordering happens should create a ReorderListener
+ * and register it by calling addReorderListener().
+ */
+class CudaContext::ReorderListener {
+public:
+    /**
+     * The function to be executed when atoms get reordered.
+     */
+    virtual void execute() = 0;
+    // Virtual so that a listener can be deleted through a ReorderListener pointer.
+    virtual ~ReorderListener() {
+    }
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDACONTEXT_H_*/
--- a/platforms/cuda2/src/CudaExpressionUtilities.cpp
+++ b/platforms/cuda2/src/CudaExpressionUtilities.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaExpressionUtilities.h"
+#include "openmm/OpenMMException.h"
+#include "openmm/internal/SplineFitter.h"
+#include "lepton/Operation.h"
+using namespace OpenMM;
+using namespace Lepton;
+using namespace std;
+/**
+ * Format a value as source text: scientific notation with 8 digits after the
+ * decimal point, followed by an "f" suffix.
+ */
+string CudaExpressionUtilities::doubleToString(double value) {
+    ostringstream literal;
+    literal.setf(ios_base::scientific, ios_base::floatfield);
+    literal.precision(8);
+    literal << value;
+    literal << "f";
+    return literal.str();
+}
+/**
+ * Format an integer as its decimal text representation.
+ */
+string CudaExpressionUtilities::intToString(int value) {
+    ostringstream digits;
+    digits << value;
+    return digits.str();
+}
+/**
+ * Convenience overload: wrap each named variable in a Variable expression node paired
+ * with the code to emit for it, then delegate to the overload that takes
+ * (node, code) pairs.
+ */
+string CudaExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const map<string, string>& variables,
+        const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const string& tempType) {
+    typedef map<string, string>::const_iterator VariableIter;
+    vector<pair<ExpressionTreeNode, string> > nodesForVariables;
+    for (VariableIter var = variables.begin(); var != variables.end(); ++var) {
+        // The node represents the variable by name; the mapped string is the code emitted for it.
+        ExpressionTreeNode variableNode(new Operation::Variable(var->first));
+        nodesForVariables.push_back(make_pair(variableNode, var->second));
+    }
+    return createExpressions(expressions, nodesForVariables, functions, prefix, functionParams, tempType);
+}
+/**
+ * Generate source code that evaluates every expression.  For each entry, the code
+ * computing its root node is emitted first, followed by a statement made of the
+ * entry's key written verbatim, the name of the temporary holding the result, and ";".
+ */
+string CudaExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const vector<pair<ExpressionTreeNode, string> >& variables,
+        const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const string& tempType) {
+    typedef map<string, ParsedExpression>::const_iterator ExpressionIter;
+    // processExpression() needs to see the full set of expressions so it can share work between them.
+    vector<ParsedExpression> everyExpression;
+    for (ExpressionIter expr = expressions.begin(); expr != expressions.end(); ++expr)
+        everyExpression.push_back(expr->second);
+    // The known variables seed the list of already-available values.
+    vector<pair<ExpressionTreeNode, string> > knownTemps(variables);
+    stringstream code;
+    for (ExpressionIter expr = expressions.begin(); expr != expressions.end(); ++expr) {
+        const ExpressionTreeNode& root = expr->second.getRootNode();
+        processExpression(code, root, knownTemps, functions, prefix, functionParams, everyExpression, tempType);
+        code << expr->first;
+        code << getTempName(root, knownTemps);
+        code << ";\n";
+    }
+    return code.str();
+}
+/**
+ * Emit source code that evaluates one expression node into a temporary variable.
+ *
+ * If the node already has an entry in temps nothing is emitted.  Otherwise code is first
+ * emitted (recursively) for each child, then a declaration of a new temporary named
+ * prefix+N, where N is the number of entries in temps at that moment.  The node and its
+ * temporary's name are appended to temps.
+ *
+ * @param out             the stream the generated code is written to
+ * @param node            the expression node to generate code for
+ * @param temps           the nodes whose values are already available, paired with the name
+ *                        (or code) to use for each.  New temporaries are appended to this list.
+ * @param functions       (name, table variable) for each tabulated function that may appear
+ * @param prefix          the prefix for the names of temporary variables
+ * @param functionParams  the variable name holding the parameters of each tabulated function
+ * @param allExpressions  every expression being generated, searched for related nodes that can
+ *                        be computed together with this one
+ * @param tempType        the type used to declare temporary variables
+ * @throws OpenMMException if the node is a variable with no entry in temps, a custom function
+ *                         not listed in functions, or an unrecognized operation
+ */
+void CudaExpressionUtilities::processExpression(stringstream& out, const ExpressionTreeNode& node, vector<pair<ExpressionTreeNode, string> >& temps,
+        const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const vector<ParsedExpression>& allExpressions, const string& tempType) {
+    // Nothing to do if this node has already been evaluated.
+    for (int i = 0; i < (int) temps.size(); i++)
+        if (temps[i].first == node)
+            return;
+    // Children must be evaluated before the node that uses them.
+    for (int i = 0; i < (int) node.getChildren().size(); i++)
+        processExpression(out, node.getChildren()[i], temps, functions, prefix, functionParams, allExpressions, tempType);
+    string name = prefix+intToString(temps.size());
+    // Set by the cases that add the node to temps themselves (because they allocate further temporaries).
+    bool hasRecordedNode = false;
+    out << tempType << " " << name << " = ";
+    switch (node.getOperation().getId()) {
+        case Operation::CONSTANT:
+            out << doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
+            break;
+        case Operation::VARIABLE:
+            // Known variables are already in temps, so reaching this point means the variable is undefined.
+            throw OpenMMException("Unknown variable in expression: "+node.getOperation().getName());
+        case Operation::CUSTOM:
+        {
+            // Look up the index of the tabulated function by name.
+            int i;
+            for (i = 0; i < (int) functions.size() && functions[i].first != node.getOperation().getName(); i++)
+                ;
+            if (i == functions.size())
+                throw OpenMMException("Unknown function in expression: "+node.getOperation().getName());
+            bool isDeriv = (dynamic_cast<const Operation::Custom*>(&node.getOperation())->getDerivOrder()[0] == 1);
+            // The temporary defaults to 0 and is only overwritten when the argument is inside the tabulated range.
+            out << "0.0f;\n";
+            temps.push_back(make_pair(node, name));
+            hasRecordedNode = true;
+            // If both the value and derivative of the function are needed, it's faster to calculate them both
+            // at once, so check to see if both are needed.
+            const ExpressionTreeNode* valueNode = NULL;
+            const ExpressionTreeNode* derivNode = NULL;
+            for (int j = 0; j < (int) allExpressions.size(); j++)
+                findRelatedTabulatedFunctions(node, allExpressions[j].getRootNode(), valueNode, derivNode);
+            string valueName = name;
+            string derivName = name;
+            if (valueNode != NULL && derivNode != NULL) {
+                // Allocate a second temporary for whichever of the value/derivative this node is not.
+                string name2 = prefix+intToString(temps.size());
+                out << tempType << " " << name2 << " = 0.0f;\n";
+                if (isDeriv) {
+                    valueName = name2;
+                    temps.push_back(make_pair(*valueNode, name2));
+                }
+                else {
+                    derivName = name2;
+                    temps.push_back(make_pair(*derivNode, name2));
+                }
+            }
+            // The generated block reads the function's parameters, checks that x lies in [params.x, params.y],
+            // rescales x by params.z, clamps the interval index to params.w, and evaluates the cubic using the
+            // four coefficients stored for that interval.
+            out << "{\n";
+            out << "float4 params = " << functionParams << "[" << i << "];\n";
+            out << "float x = " << getTempName(node.getChildren()[0], temps) << ";\n";
+            out << "if (x >= params.x && x <= params.y) {\n";
+            out << "x = (x-params.x)*params.z;\n";
+            out << "int index = (int) (floor(x));\n";
+            out << "index = min(index, (int) params.w);\n";
+            out << "float4 coeff = " << functions[i].second << "[index];\n";
+            out << "float b = x-index;\n";
+            out << "float a = 1.0f-b;\n";
+            if (valueNode != NULL)
+                out << valueName << " = a*coeff.x+b*coeff.y+((a*a*a-a)*coeff.z+(b*b*b-b)*coeff.w)/(params.z*params.z);\n";
+            if (derivNode != NULL)
+                out << derivName << " = (coeff.y-coeff.x)*params.z+((1.0f-3.0f*a*a)*coeff.z+(3.0f*b*b-1.0f)*coeff.w)/params.z;\n";
+            out << "}\n";
+            out << "}";
+            break;
+        }
+        case Operation::ADD:
+            out << getTempName(node.getChildren()[0], temps) << "+" << getTempName(node.getChildren()[1], temps);
+            break;
+        case Operation::SUBTRACT:
+            out << getTempName(node.getChildren()[0], temps) << "-" << getTempName(node.getChildren()[1], temps);
+            break;
+        case Operation::MULTIPLY:
+            out << getTempName(node.getChildren()[0], temps) << "*" << getTempName(node.getChildren()[1], temps);
+            break;
+        case Operation::DIVIDE:
+        {
+            // If the reciprocal of the denominator has already been computed, multiply by it instead of dividing.
+            bool haveReciprocal = false;
+            for (int i = 0; i < (int) temps.size(); i++)
+                if (temps[i].first.getOperation().getId() == Operation::RECIPROCAL && temps[i].first.getChildren()[0] == node.getChildren()[1]) {
+                    haveReciprocal = true;
+                    out << getTempName(node.getChildren()[0], temps) << "*" << temps[i].second;
+                }
+            if (!haveReciprocal)
+                out << getTempName(node.getChildren()[0], temps) << "/" << getTempName(node.getChildren()[1], temps);
+            break;
+        }
+        case Operation::POWER:
+            out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << getTempName(node.getChildren()[1], temps) << ")";
+            break;
+        case Operation::NEGATE:
+            out << "-" << getTempName(node.getChildren()[0], temps);
+            break;
+        case Operation::SQRT:
+            out << "sqrt(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::EXP:
+            out << "EXP(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::LOG:
+            out << "LOG(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::SIN:
+            out << "sin(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::COS:
+            out << "cos(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::SEC:
+            out << "1.0f/cos(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::CSC:
+            out << "1.0f/sin(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::TAN:
+            out << "tan(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::COT:
+            out << "1.0f/tan(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::ASIN:
+            out << "asin(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::ACOS:
+            out << "acos(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::ATAN:
+            out << "atan(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::SINH:
+            out << "sinh(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::COSH:
+            out << "cosh(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::TANH:
+            out << "tanh(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::ERF:
+            out << "erf(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::ERFC:
+            out << "erfc(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::STEP:
+            out << getTempName(node.getChildren()[0], temps) << " >= 0.0f ? 1.0f : 0.0f";
+            break;
+        case Operation::DELTA:
+            out << getTempName(node.getChildren()[0], temps) << " == 0.0f ? 1.0f : 0.0f";
+            break;
+        case Operation::SQUARE:
+        {
+            string arg = getTempName(node.getChildren()[0], temps);
+            out << arg << "*" << arg;
+            break;
+        }
+        case Operation::CUBE:
+        {
+            string arg = getTempName(node.getChildren()[0], temps);
+            out << arg << "*" << arg << "*" << arg;
+            break;
+        }
+        case Operation::RECIPROCAL:
+            out << "RECIP(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        case Operation::ADD_CONSTANT:
+            out << doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps);
+            break;
+        case Operation::MULTIPLY_CONSTANT:
+            out << doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps);
+            break;
+        case Operation::POWER_CONSTANT:
+        {
+            double exponent = dynamic_cast<const Operation::PowerConstant*>(&node.getOperation())->getValue();
+            if (exponent == 0.0)
+                out << "1.0f";
+            else if (exponent == (int) exponent) {
+                // Integer exponent: generate repeated multiplications instead of calling pow().
+                out << "0.0f;\n";
+                temps.push_back(make_pair(node, name));
+                hasRecordedNode = true;
+                // If multiple integral powers of the same base are needed, it's faster to calculate all of them
+                // at once, so check to see if others are also needed.
+                map<int, const ExpressionTreeNode*> powers;
+                powers[(int) exponent] = &node;
+                for (int j = 0; j < (int) allExpressions.size(); j++)
+                    findRelatedPowers(node, allExpressions[j].getRootNode(), powers);
+                // exponents[k] is the magnitude of the power still to be accumulated into names[k];
+                // entry 0 is this node, the rest are the related powers found above.
+                vector<int> exponents;
+                vector<string> names;
+                vector<bool> hasAssigned(powers.size(), false);
+                exponents.push_back((int) fabs(exponent));
+                names.push_back(name);
+                for (map<int, const ExpressionTreeNode*>::const_iterator iter = powers.begin(); iter != powers.end(); ++iter) {
+                    if (iter->first != exponent) {
+                        exponents.push_back(iter->first >= 0 ? iter->first : -iter->first);
+                        string name2 = prefix+intToString(temps.size());
+                        names.push_back(name2);
+                        temps.push_back(make_pair(*iter->second, name2));
+                        out << tempType << " " << name2 << " = 0.0f;\n";
+                    }
+                }
+                out << "{\n";
+                // For negative exponents start from the reciprocal of the base.
+                out << "float multiplier = " << (exponent < 0.0 ? "1.0f/" : "") << getTempName(node.getChildren()[0], temps) << ";\n";
+                // Walk the bits of every exponent from least significant upward: whenever the current bit is
+                // set, fold the current multiplier into that result, then square the multiplier for the next bit.
+                bool done = false;
+                while (!done) {
+                    done = true;
+                    for (int i = 0; i < (int) exponents.size(); i++) {
+                        if (exponents[i]%2 == 1) {
+                            if (!hasAssigned[i])
+                                out << names[i] << " = multiplier;\n";
+                            else
+                                out << names[i] << " *= multiplier;\n";
+                            hasAssigned[i] = true;
+                        }
+                        exponents[i] >>= 1;
+                        if (exponents[i] != 0)
+                            done = false;
+                    }
+                    if (!done)
+                        out << "multiplier *= multiplier;\n";
+                }
+                out << "}";
+            }
+            else
+                out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << doubleToString(exponent) << ")";
+            break;
+        }
+        case Operation::MIN:
+            out << "min(" << getTempName(node.getChildren()[0], temps) << ", " << getTempName(node.getChildren()[1], temps) << ")";
+            break;
+        case Operation::MAX:
+            out << "max(" << getTempName(node.getChildren()[0], temps) << ", " << getTempName(node.getChildren()[1], temps) << ")";
+            break;
+        case Operation::ABS:
+            out << "fabs(" << getTempName(node.getChildren()[0], temps) << ")";
+            break;
+        default:
+            throw OpenMMException("Internal error: Unknown operation in user-defined expression: "+node.getOperation().getName());
+    }
+    // Terminates the declaration, or is an empty statement after a generated "{ ... }" block.
+    out << ";\n";
+    if (!hasRecordedNode)
+        temps.push_back(make_pair(node, name));
+}
+/**
+ * Look up the name recorded in temps for an expression node.
+ *
+ * @throws OpenMMException if no entry in temps matches the node
+ */
+string CudaExpressionUtilities::getTempName(const ExpressionTreeNode& node, const vector<pair<ExpressionTreeNode, string> >& temps) {
+    typedef vector<pair<ExpressionTreeNode, string> >::const_iterator TempIter;
+    for (TempIter entry = temps.begin(); entry != temps.end(); ++entry) {
+        if (entry->first == node)
+            return entry->second;
+    }
+    // Every node should have been processed before its name is requested.
+    stringstream message;
+    message << "Internal error: No temporary variable for expression node: ";
+    message << node;
+    throw OpenMMException(message.str());
+}
+/**
+ * Search an expression tree for evaluations of the same tabulated function, with the same
+ * argument, as a given node, so the value and the derivative can be generated together.
+ *
+ * A node only counts as related if it calls the same function (by name) as well as taking
+ * the same argument.  Without the name check, f(x) and g'(x) would be paired and g'(x)
+ * would be evaluated from f's table.
+ *
+ * @param node        the custom function node being processed
+ * @param searchNode  the root of the (sub)tree to search
+ * @param valueNode   set to a matching node whose derivative order is 0, if one is found
+ * @param derivNode   set to a matching node whose derivative order is nonzero, if one is found
+ */
+void CudaExpressionUtilities::findRelatedTabulatedFunctions(const ExpressionTreeNode& node, const ExpressionTreeNode& searchNode,
+            const ExpressionTreeNode*& valueNode, const ExpressionTreeNode*& derivNode) {
+    const bool isSameFunctionCall = (searchNode.getOperation().getId() == Operation::CUSTOM &&
+            searchNode.getOperation().getName() == node.getOperation().getName() &&
+            node.getChildren()[0] == searchNode.getChildren()[0]);
+    if (isSameFunctionCall) {
+        if (dynamic_cast<const Operation::Custom*>(&searchNode.getOperation())->getDerivOrder()[0] == 0)
+            valueNode = &searchNode;
+        else
+            derivNode = &searchNode;
+    }
+    else
+        // Not a match (possibly a different function): keep looking further down the tree.
+        for (int i = 0; i < (int) searchNode.getChildren().size(); i++)
+            findRelatedTabulatedFunctions(node, searchNode.getChildren()[i], valueNode, derivNode);
+}
+/**
+ * Search an expression tree for other integer powers of the same base as a given
+ * POWER_CONSTANT node, so that all of them can be generated together.
+ *
+ * @param node        the power node being processed
+ * @param searchNode  the root of the (sub)tree to search
+ * @param powers      maps each exponent found so far to the node that computes it.  The caller
+ *                    seeds it with the exponent of node; matching nodes are added to it.
+ */
+void CudaExpressionUtilities::findRelatedPowers(const ExpressionTreeNode& node, const ExpressionTreeNode& searchNode, map<int, const ExpressionTreeNode*>& powers) {
+    if (searchNode.getOperation().getId() == Operation::POWER_CONSTANT && node.getChildren()[0] == searchNode.getChildren()[0]) {
+        double realPower = dynamic_cast<const Operation::PowerConstant*>(&searchNode.getOperation())->getValue();
+        int power = (int) realPower;
+        if (power != realPower)
+            return; // We are only interested in integer powers.
+        if (power == 0)
+            return; // x^0 is the constant 1; the shared multiplication code would leave it at 0.
+        if (powers.find(power) != powers.end())
+            return; // This power is already in the map.
+        // Compare signs directly rather than multiplying, which could overflow for large exponents.
+        if (!powers.empty() && (powers.begin()->first < 0) != (power < 0))
+            return; // All powers must have the same sign.
+        powers[power] = &searchNode;
+    }
+    else
+        for (int i = 0; i < (int) searchNode.getChildren().size(); i++)
+            findRelatedPowers(node, searchNode.getChildren()[i], powers);
+}
+/**
+ * Compute the per-interval coefficients for a tabulated function.
+ *
+ * The values are taken to be sampled at evenly spaced points from min to max.  A natural
+ * spline is fitted through them, and one float4 is produced for each interval between
+ * consecutive points: (value at start, value at end, spline output at start / 6,
+ * spline output at end / 6).
+ *
+ * @param values  the tabulated values of the function
+ * @param min     the point at which the first value is sampled
+ * @param max     the point at which the last value is sampled
+ * @return one float4 of coefficients per interval (values.size()-1 elements)
+ */
+vector<float4> CudaExpressionUtilities::computeFunctionCoefficients(const vector<double>& values, double min, double max) {
+    const int count = values.size();
+    // Evenly spaced sample positions covering [min, max].
+    vector<double> knots(count);
+    for (int k = 0; k < count; k++)
+        knots[k] = min+k*(max-min)/(count-1);
+    vector<double> splineDerivs;
+    SplineFitter::createNaturalSpline(knots, values, splineDerivs);
+    // Pack the data for each interval [k, k+1] into a single float4.
+    vector<float4> coefficients(count-1);
+    for (int k = 0; k < count-1; k++) {
+        const float startValue = (float) values[k];
+        const float endValue = (float) values[k+1];
+        const float startDeriv = (float) (splineDerivs[k]/6.0);
+        const float endDeriv = (float) (splineDerivs[k+1]/6.0);
+        coefficients[k] = make_float4(startValue, endValue, startDeriv, endDeriv);
+    }
+    return coefficients;
+}
--- a/platforms/cuda2/src/CudaExpressionUtilities.h
+++ b/platforms/cuda2/src/CudaExpressionUtilities.h
+#ifndef OPENMM_CUDAEXPRESSIONUTILITIES_H_
+#define OPENMM_CUDAEXPRESSIONUTILITIES_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaContext.h"
+#include "lepton/CustomFunction.h"
+#include "lepton/ExpressionTreeNode.h"
+#include "lepton/ParsedExpression.h"
+#include <map>
+#include <sstream>
+#include <string>
+#include <utility>
+namespace OpenMM {
+/**
+ * Static helpers that other classes use to turn user defined mathematical
+ * expressions into CUDA source code.
+ */
+class OPENMM_EXPORT CudaExpressionUtilities {
+public:
+    /**
+     * Generate the source code that evaluates a set of expressions, with variables
+     * identified by name.
+     *
+     * @param expressions    the expressions to generate code for (each key is the variable the result is stored in)
+     * @param variables      maps every variable name that may appear in the expressions to the code
+     *                       to generate for it
+     * @param functions      defines the variable name for each tabulated function that may appear in the expressions
+     * @param prefix         a prefix to put in front of temporary variables
+     * @param functionParams the variable name containing the parameters for each tabulated function
+     * @param tempType       the type of value to use for temporary variables (defaults to "float")
+     * @return the generated source code
+     */
+    static std::string createExpressions(
+            const std::map<std::string, Lepton::ParsedExpression>& expressions,
+            const std::map<std::string, std::string>& variables,
+            const std::vector<std::pair<std::string, std::string> >& functions,
+            const std::string& prefix,
+            const std::string& functionParams,
+            const std::string& tempType="float");
+    /**
+     * Generate the source code that evaluates a set of expressions, with variables and
+     * precomputed sub-expressions identified by expression tree node.
+     *
+     * @param expressions    the expressions to generate code for (each key is the variable the result is stored in)
+     * @param variables      pairs each ExpressionTreeNode (a variable or precomputed sub-expression)
+     *                       with the code to generate wherever an identical node appears
+     * @param functions      defines the variable name for each tabulated function that may appear in the expressions
+     * @param prefix         a prefix to put in front of temporary variables
+     * @param functionParams the variable name containing the parameters for each tabulated function
+     * @param tempType       the type of value to use for temporary variables (defaults to "float")
+     * @return the generated source code
+     */
+    static std::string createExpressions(
+            const std::map<std::string, Lepton::ParsedExpression>& expressions,
+            const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
+            const std::vector<std::pair<std::string, std::string> >& functions,
+            const std::string& prefix,
+            const std::string& functionParams,
+            const std::string& tempType="float");
+    /**
+     * Calculate the spline coefficients for a tabulated function that appears in expressions.
+     *
+     * @param values         the tabulated values of the function
+     * @param min            the value of the independent variable corresponding to the first element of values
+     * @param max            the value of the independent variable corresponding to the last element of values
+     * @return the spline coefficients
+     */
+    static std::vector<float4> computeFunctionCoefficients(
+            const std::vector<double>& values,
+            double min,
+            double max);
+    /**
+     * Format a floating point number as text suitable for including in a kernel.
+     */
+    static std::string doubleToString(double value);
+    /**
+     * Format an integer as text suitable for including in a kernel.
+     */
+    static std::string intToString(int value);
+    /**
+     * Placeholder for custom functions in expressions; defined after this class.
+     */
+    class FunctionPlaceholder;
+private:
+    // NOTE(review): the definitions of the helpers below are not part of this header, so
+    // the one-line summaries are inferred from their names and signatures - confirm
+    // against the .cpp file.
+
+    // Presumably writes the code for one expression tree node to "out", recording the
+    // temporaries it creates in "temps".
+    static void processExpression(
+            std::stringstream& out,
+            const Lepton::ExpressionTreeNode& node,
+            std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps,
+            const std::vector<std::pair<std::string, std::string> >& functions,
+            const std::string& prefix,
+            const std::string& functionParams,
+            const std::vector<Lepton::ParsedExpression>& allExpressions,
+            const std::string& tempType);
+    // Presumably looks up the name associated with "node" in "temps".
+    static std::string getTempName(
+            const Lepton::ExpressionTreeNode& node,
+            const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
+    // Presumably searches "searchNode" for tabulated function nodes related to "node",
+    // reporting them through "valueNode" and "derivNode".
+    static void findRelatedTabulatedFunctions(
+            const Lepton::ExpressionTreeNode& node,
+            const Lepton::ExpressionTreeNode& searchNode,
+            const Lepton::ExpressionTreeNode*& valueNode,
+            const Lepton::ExpressionTreeNode*& derivNode);
+    // Presumably searches "searchNode" for power nodes related to "node", collecting
+    // them in "powers" keyed by an integer (likely the exponent).
+    static void findRelatedPowers(
+            const Lepton::ExpressionTreeNode& node,
+            const Lepton::ExpressionTreeNode& searchNode,
+            std::map<int, const Lepton::ExpressionTreeNode*>& powers);
+};
+/**
+ * A stand-in Lepton::CustomFunction used as a placeholder for custom functions in
+ * expressions.  It reports one argument and evaluates to zero everywhere.
+ */
+class CudaExpressionUtilities::FunctionPlaceholder : public Lepton::CustomFunction {
+public:
+    /** A placeholder always takes exactly one argument. */
+    int getNumArguments() const { return 1; }
+    /** The arguments are ignored; the value is always 0. */
+    double evaluate(const double* /*arguments*/) const { return 0.0; }
+    /** The arguments and derivative orders are ignored; the derivative is always 0. */
+    double evaluateDerivative(const double* /*arguments*/, const int* /*derivOrder*/) const { return 0.0; }
+    /** Return a newly allocated placeholder. */
+    CustomFunction* clone() const { return new FunctionPlaceholder(); }
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAEXPRESSIONUTILITIES_H_*/
--- a/platforms/cuda2/src/CudaForceInfo.cpp
+++ b/platforms/cuda2/src/CudaForceInfo.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaForceInfo.h"
+using namespace OpenMM;
+using namespace std;
+/**
+ * Default implementation: every pair of particles is reported as identical.
+ * The particle indices are not examined.
+ */
+bool CudaForceInfo::areParticlesIdentical(int /*particle1*/, int /*particle2*/) {
+    return true;
+}
+/**
+ * Default implementation: the force defines no particle groups.
+ */
+int CudaForceInfo::getNumParticleGroups() {
+    return 0;
+}
+/**
+ * Default implementation: does nothing, leaving the output vector exactly as the
+ * caller passed it in.
+ */
+void CudaForceInfo::getParticlesInGroup(int /*index*/, vector<int>& /*particles*/) {
+    // Intentionally empty.
+}
+/**
+ * Default implementation: every pair of groups is reported as identical.
+ * The group indices are not examined.
+ */
+bool CudaForceInfo::areGroupsIdentical(int /*group1*/, int /*group2*/) {
+    return true;
+}
--- a/platforms/cuda2/src/CudaForceInfo.h
+++ b/platforms/cuda2/src/CudaForceInfo.h
+#ifndef OPENMM_CUDAFORCEINFO_H_
+#define OPENMM_CUDAFORCEINFO_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/internal/windowsExport.h"
+#include <vector>
+namespace OpenMM {
+/**
+ * This class is used by the Cuda implementation of a Force class to convey information
+ * about the behavior and requirements of that force.  The query methods are virtual so
+ * that a subclass can override them.
+ */
+class OPENMM_EXPORT CudaForceInfo {
+public:
+    /**
+     * Create a CudaForceInfo.
+     *
+     * @param requiredForceBuffers    the number of force buffers this force requires
+     */
+    CudaForceInfo(int requiredForceBuffers) : requiredForceBuffers(requiredForceBuffers) {
+    }
+    /**
+     * This class has virtual methods and is meant to be subclassed, so the destructor
+     * must be virtual: otherwise deleting a subclass instance through a CudaForceInfo
+     * pointer is undefined behavior.
+     */
+    virtual ~CudaForceInfo() {
+    }
+    /**
+     * Get the number of force buffers this force requires.
+     */
+    int getRequiredForceBuffers() {
+        return requiredForceBuffers;
+    }
+    /**
+     * Get whether or not two particles have identical force field parameters.
+     *
+     * @param particle1    the index of the first particle
+     * @param particle2    the index of the second particle
+     */
+    virtual bool areParticlesIdentical(int particle1, int particle2);
+    /**
+     * Get the number of particle groups defined by this force.
+     */
+    virtual int getNumParticleGroups();
+    /**
+     * Get the list of particles in a particular group.
+     *
+     * @param index        the index of the group
+     * @param particles    the vector through which the particles in the group are reported
+     */
+    virtual void getParticlesInGroup(int index, std::vector<int>& particles);
+    /**
+     * Get whether two particle groups are identical.
+     *
+     * @param group1    the index of the first group
+     * @param group2    the index of the second group
+     */
+    virtual bool areGroupsIdentical(int group1, int group2);
+private:
+    // The value passed to the constructor and returned by getRequiredForceBuffers().
+    int requiredForceBuffers;
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAFORCEINFO_H_*/
--- a/platforms/cuda2/src/CudaKernelFactory.cpp
+++ b/platforms/cuda2/src/CudaKernelFactory.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaKernelFactory.h"
+//#include "CudaParallelKernels.h"
+#include "CudaPlatform.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/OpenMMException.h"
+using namespace OpenMM;
+/**
+ * Create the KernelImpl for the kernel with the given name.
+ *
+ * Every dispatch branch below is still commented out, so at present this function
+ * always throws an OpenMMException that reports the requested name as illegal.
+ *
+ * @param name      the name of the kernel to create
+ * @param platform  the Platform the kernel belongs to (only used by the disabled code)
+ * @param context   the context the kernel is created for; its platform data is read as a
+ *                  CudaPlatform::PlatformData
+ * @throws OpenMMException always, until the dispatch code is enabled
+ */
+KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
+    // Only the disabled dispatch code below reads "data".
+    CudaPlatform::PlatformData* platformData = static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
+    CudaPlatform::PlatformData& data = *platformData;
+//    if (data.contexts.size() > 1) {
+//        // We are running in parallel on multiple devices, so we may want to create a parallel kernel.
+//
+//        if (name == CalcForcesAndEnergyKernel::Name())
+//            return new CudaParallelCalcForcesAndEnergyKernel(name, platform, data);
+//        if (name == CalcHarmonicBondForceKernel::Name())
+//            return new CudaParallelCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCustomBondForceKernel::Name())
+//            return new CudaParallelCalcCustomBondForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcHarmonicAngleForceKernel::Name())
+//            return new CudaParallelCalcHarmonicAngleForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCustomAngleForceKernel::Name())
+//            return new CudaParallelCalcCustomAngleForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcPeriodicTorsionForceKernel::Name())
+//            return new CudaParallelCalcPeriodicTorsionForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcRBTorsionForceKernel::Name())
+//            return new CudaParallelCalcRBTorsionForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCMAPTorsionForceKernel::Name())
+//            return new CudaParallelCalcCMAPTorsionForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCustomTorsionForceKernel::Name())
+//            return new CudaParallelCalcCustomTorsionForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcNonbondedForceKernel::Name())
+//            return new CudaParallelCalcNonbondedForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCustomNonbondedForceKernel::Name())
+//            return new CudaParallelCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCustomExternalForceKernel::Name())
+//            return new CudaParallelCalcCustomExternalForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCustomHbondForceKernel::Name())
+//            return new CudaParallelCalcCustomHbondForceKernel(name, platform, data, context.getSystem());
+//        if (name == CalcCustomCompoundBondForceKernel::Name())
+//            return new CudaParallelCalcCustomCompoundBondForceKernel(name, platform, data, context.getSystem());
+//    }
+//    CudaContext& cl = *data.contexts[0];
+//    if (name == CalcForcesAndEnergyKernel::Name())
+//        return new CudaCalcForcesAndEnergyKernel(name, platform, cl);
+//    if (name == UpdateStateDataKernel::Name())
+//        return new CudaUpdateStateDataKernel(name, platform, cl);
+//    if (name == ApplyConstraintsKernel::Name())
+//        return new CudaApplyConstraintsKernel(name, platform, cl);
+//    if (name == VirtualSitesKernel::Name())
+//        return new CudaVirtualSitesKernel(name, platform, cl);
+//    if (name == CalcHarmonicBondForceKernel::Name())
+//        return new CudaCalcHarmonicBondForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCustomBondForceKernel::Name())
+//        return new CudaCalcCustomBondForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcHarmonicAngleForceKernel::Name())
+//        return new CudaCalcHarmonicAngleForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCustomAngleForceKernel::Name())
+//        return new CudaCalcCustomAngleForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcPeriodicTorsionForceKernel::Name())
+//        return new CudaCalcPeriodicTorsionForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcRBTorsionForceKernel::Name())
+//        return new CudaCalcRBTorsionForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCMAPTorsionForceKernel::Name())
+//        return new CudaCalcCMAPTorsionForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCustomTorsionForceKernel::Name())
+//        return new CudaCalcCustomTorsionForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcNonbondedForceKernel::Name())
+//        return new CudaCalcNonbondedForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCustomNonbondedForceKernel::Name())
+//        return new CudaCalcCustomNonbondedForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcGBSAOBCForceKernel::Name())
+//        return new CudaCalcGBSAOBCForceKernel(name, platform, cl);
+//    if (name == CalcCustomGBForceKernel::Name())
+//        return new CudaCalcCustomGBForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCustomExternalForceKernel::Name())
+//        return new CudaCalcCustomExternalForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCustomHbondForceKernel::Name())
+//        return new CudaCalcCustomHbondForceKernel(name, platform, cl, context.getSystem());
+//    if (name == CalcCustomCompoundBondForceKernel::Name())
+//        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cl, context.getSystem());
+//    if (name == IntegrateVerletStepKernel::Name())
+//        return new CudaIntegrateVerletStepKernel(name, platform, cl);
+//    if (name == IntegrateLangevinStepKernel::Name())
+//        return new CudaIntegrateLangevinStepKernel(name, platform, cl);
+//    if (name == IntegrateBrownianStepKernel::Name())
+//        return new CudaIntegrateBrownianStepKernel(name, platform, cl);
+//    if (name == IntegrateVariableVerletStepKernel::Name())
+//        return new CudaIntegrateVariableVerletStepKernel(name, platform, cl);
+//    if (name == IntegrateVariableLangevinStepKernel::Name())
+//        return new CudaIntegrateVariableLangevinStepKernel(name, platform, cl);
+//    if (name == IntegrateCustomStepKernel::Name())
+//        return new CudaIntegrateCustomStepKernel(name, platform, cl);
+//    if (name == ApplyAndersenThermostatKernel::Name())
+//        return new CudaApplyAndersenThermostatKernel(name, platform, cl);
+//    if (name == ApplyMonteCarloBarostatKernel::Name())
+//        return new CudaApplyMonteCarloBarostatKernel(name, platform, cl);
+//    if (name == CalcKineticEnergyKernel::Name())
+//        return new CudaCalcKineticEnergyKernel(name, platform, cl);
+//    if (name == RemoveCMMotionKernel::Name())
+//        return new CudaRemoveCMMotionKernel(name, platform, cl);
+    // No branch matched (none is enabled yet), so report the name as illegal.
+    const std::string message = std::string("Tried to create kernel with illegal kernel name '")+name+"'";
+    throw OpenMMException(message.c_str());
+}
--- a/platforms/cuda2/src/CudaKernelSources.cpp.in
+++ b/platforms/cuda2/src/CudaKernelSources.cpp.in
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaKernelSources.h"
+using namespace OpenMM;
+using namespace std;
--- a/platforms/cuda2/src/CudaKernelSources.h.in
+++ b/platforms/cuda2/src/CudaKernelSources.h.in
+#ifndef OPENMM_CUDAKERNELSOURCES_H_
+#define OPENMM_CUDAKERNELSOURCES_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/internal/windowsExport.h"
+#include <string>
+namespace OpenMM {
+/**
+ * This class is a central holding place for the source code of CUDA kernels.
+ * The CMake build script inserts declarations into it based on the .cu files in the
+ * kernels subfolder.
+ *
+ * This header is a template: the class has no hand-written members, only the
+ * placeholder line in its public section, which is where the inserted declarations go.
+ */
+class OPENMM_EXPORT CudaKernelSources {
+public:
+@CUDA_FILE_DECLARATIONS@
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAKERNELSOURCES_H_*/
--- a/platforms/cuda2/src/CudaKernels.h
+++ b/platforms/cuda2/src/CudaKernels.h
--- a/platforms/cuda2/src/CudaPlatform.cpp
+++ b/platforms/cuda2/src/CudaPlatform.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaContext.h"
+#include "CudaExpressionUtilities.h"
+#include "CudaPlatform.h"
+#include "CudaKernelFactory.h"
+#include "CudaKernels.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/Context.h"
+#include "openmm/System.h"
+#include <algorithm>
+#include <cctype>
+#include <sstream>
+using namespace OpenMM;
+using namespace std;
+/**
+ * Entry point with C linkage that creates a CudaPlatform and registers it with
+ * Platform::registerPlatform().
+ */
+extern "C" OPENMM_EXPORT void registerPlatforms() {
+    CudaPlatform* cudaPlatform = new CudaPlatform();
+    Platform::registerPlatform(cudaPlatform);
+}
+/**
+ * Construct the CUDA platform: register one shared CudaKernelFactory for every kernel
+ * name the platform handles, declare the platform's properties, and set the default
+ * value of each property.
+ */
+CudaPlatform::CudaPlatform() {
+    // A single factory instance is registered for all of the kernel names.
+    CudaKernelFactory* factory = new CudaKernelFactory();
+    registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
+    registerKernelFactory(UpdateStateDataKernel::Name(), factory);
+    registerKernelFactory(ApplyConstraintsKernel::Name(), factory);
+    registerKernelFactory(VirtualSitesKernel::Name(), factory);
+    registerKernelFactory(CalcHarmonicBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomAngleForceKernel::Name(), factory);
+    registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcRBTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcCMAPTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomNonbondedForceKernel::Name(), factory);
+    registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomGBForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomExternalForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
+    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
+    registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
+    registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
+    registerKernelFactory(CalcKineticEnergyKernel::Name(), factory);
+    registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
+
+    // The properties this platform exposes.
+    platformProperties.push_back(CudaDeviceIndex());
+    platformProperties.push_back(CudaUseBlockingSync());
+    platformProperties.push_back(CudaPrecision());
+    platformProperties.push_back(CudaCompiler());
+    platformProperties.push_back(CudaTempDirectory());
+
+    // Default values for those properties.
+    setPropertyDefaultValue(CudaDeviceIndex(), "");
+    setPropertyDefaultValue(CudaUseBlockingSync(), "true");
+    setPropertyDefaultValue(CudaPrecision(), "single");
+
+    // The compiler location and the temporary directory depend on the operating system.
+    // getenv() returns a null pointer when the variable is not set, and constructing a
+    // std::string from a null pointer is undefined behavior, so a fallback directory is
+    // used instead of passing the result straight to the string constructor.
+#ifdef _MSC_VER
+    setPropertyDefaultValue(CudaCompiler(), "nvcc");
+    const char* tempDirectory = getenv("TEMP");
+    // NOTE(review): the current directory is an arbitrary fallback - confirm the preferred choice.
+    const char* fallbackTempDirectory = ".";
+#else
+    setPropertyDefaultValue(CudaCompiler(), "/usr/local/cuda/bin/nvcc");
+    const char* tempDirectory = getenv("TMPDIR");
+    const char* fallbackTempDirectory = "/tmp";
+#endif
+    setPropertyDefaultValue(CudaTempDirectory(), string(tempDirectory != 0 ? tempDirectory : fallbackTempDirectory));
+}
+/**
+ * Report whether this platform supports double precision.  It currently always
+ * answers no.
+ */
+bool CudaPlatform::supportsDoublePrecision() const {
+    return false;
+}
+/**
+ * Get the value of a platform property for a particular Context.
+ *
+ * The values recorded in the Context's PlatformData are consulted first; a
+ * property that is not recorded there is delegated to Platform::getPropertyValue().
+ *
+ * @param context   the Context whose property is being queried
+ * @param property  the name of the property
+ * @return a reference to the stored value, or whatever the base class returns
+ */
+const string& CudaPlatform::getPropertyValue(const Context& context, const string& property) const {
+    const ContextImpl& impl = getContextImpl(context);
+    const PlatformData* platformData = reinterpret_cast<const PlatformData*>(impl.getPlatformData());
+    const map<string, string>& recorded = platformData->propertyValues;
+    map<string, string>::const_iterator entry = recorded.find(property);
+    if (entry == recorded.end())
+        return Platform::getPropertyValue(context, property);
+    return entry->second;
+}
+/**
+ * Set the value of a platform property for a particular Context.
+ *
+ * The body is empty, so the call currently has no effect: none of the
+ * arguments are read and no stored value is changed.
+ */
+void CudaPlatform::setPropertyValue(Context& context, const string& property, const string& value) const {
+}
+/**
+ * Called when a Context is created on this platform. Resolves each platform
+ * property (the caller-supplied value if present, otherwise the platform
+ * default), lower-cases the blocking-sync and precision values, and attaches
+ * a newly allocated PlatformData to the context.
+ *
+ * @param context     the ContextImpl that receives the new PlatformData
+ * @param properties  property values supplied by the caller, keyed by property name
+ */
+void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
+    typedef map<string, string>::const_iterator PropertyIter;
+    const PropertyIter missing = properties.end();
+    const PropertyIter deviceEntry = properties.find(CudaDeviceIndex());
+    const PropertyIter blockingEntry = properties.find(CudaUseBlockingSync());
+    const PropertyIter precisionEntry = properties.find(CudaPrecision());
+    const PropertyIter compilerEntry = properties.find(CudaCompiler());
+    const PropertyIter tempEntry = properties.find(CudaTempDirectory());
+
+    const string& devicePropValue = (deviceEntry != missing ? deviceEntry->second : getPropertyDefaultValue(CudaDeviceIndex()));
+    // These two are copied rather than referenced because they are modified below.
+    string blockingPropValue = (blockingEntry != missing ? blockingEntry->second : getPropertyDefaultValue(CudaUseBlockingSync()));
+    string precisionPropValue = (precisionEntry != missing ? precisionEntry->second : getPropertyDefaultValue(CudaPrecision()));
+    const string& compilerPropValue = (compilerEntry != missing ? compilerEntry->second : getPropertyDefaultValue(CudaCompiler()));
+    const string& tempPropValue = (tempEntry != missing ? tempEntry->second : getPropertyDefaultValue(CudaTempDirectory()));
+
+    // Normalize case in place.
+    transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
+    transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
+
+    PlatformData* platformData = new PlatformData(context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, compilerPropValue, tempPropValue);
+    context.setPlatformData(platformData);
+}
+/**
+ * Called when a Context is destroyed. Deletes the PlatformData object that
+ * is attached to the context.
+ *
+ * @param context  the ContextImpl whose platform data is released
+ */
+void CudaPlatform::contextDestroyed(ContextImpl& context) const {
+    delete reinterpret_cast<PlatformData*>(context.getPlatformData());
+}
+/**
+ * Build the per-Context platform data, creating one CudaContext for each
+ * device index listed in deviceIndexProperty.
+ *
+ * @param system               the System handed to every CudaContext
+ * @param deviceIndexProperty  device indices separated by commas and/or spaces; empty tokens are
+ *                             ignored, and if no index is given a single CudaContext is created
+ *                             with device index -1 (presumably "let CudaContext choose" - confirm)
+ * @param blockingProperty     "true" enables blocking sync; any other value disables it
+ * @param precisionProperty    precision setting, forwarded to CudaContext and recorded
+ * @param compilerProperty     compiler setting, forwarded to CudaContext and recorded
+ * @param tempProperty         temp directory setting, forwarded to CudaContext and recorded
+ */
+CudaPlatform::PlatformData::PlatformData(const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
+            const string& compilerProperty, const string& tempProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
+    bool blocking = (blockingProperty == "true");
+
+    // Split the device list on commas and spaces.
+    vector<string> devices;
+    size_t searchPos = 0, nextPos;
+    while ((nextPos = deviceIndexProperty.find_first_of(", ", searchPos)) != string::npos) {
+        devices.push_back(deviceIndexProperty.substr(searchPos, nextPos-searchPos));
+        searchPos = nextPos+1;
+    }
+    devices.push_back(deviceIndexProperty.substr(searchPos));
+
+    try {
+        for (int i = 0; i < (int) devices.size(); i++) {
+            if (devices[i].length() > 0) {
+                // Initialized so that a token which fails to parse yields a defined
+                // value instead of reading an uninitialized variable.
+                unsigned int deviceIndex = 0;
+                stringstream(devices[i]) >> deviceIndex;
+                contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, *this));
+            }
+        }
+        if (contexts.size() == 0)
+            contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, *this));
+
+        // Record the values actually in use so they can be reported by getPropertyValue().
+        stringstream device;
+        for (int i = 0; i < (int) contexts.size(); i++) {
+            if (i > 0)
+                device << ',';
+            device << contexts[i]->getDeviceIndex();
+        }
+        propertyValues[CudaPlatform::CudaDeviceIndex()] = device.str();
+        // Every advertised platform property needs an entry here; otherwise a
+        // query for it is not answered from this map.
+        propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false";
+        propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty;
+        propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
+        propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
+        contextEnergy.resize(contexts.size());
+    }
+    catch (...) {
+        // The destructor does not run when a constructor throws, so release
+        // the contexts created so far before propagating the exception.
+        for (int i = 0; i < (int) contexts.size(); i++)
+            delete contexts[i];
+        contexts.clear();
+        throw;
+    }
+}
+/**
+ * Delete every CudaContext held in `contexts`, in the order they are stored.
+ */
+CudaPlatform::PlatformData::~PlatformData() {
+    const size_t numContexts = contexts.size();
+    for (size_t which = 0; which < numContexts; ++which)
+        delete contexts[which];
+}
+/**
+ * Initialize the contexts held by this PlatformData.
+ *
+ * The loop that would call initialize() on each context is commented out,
+ * so this function currently does nothing and ignores `system`.
+ */
+void CudaPlatform::PlatformData::initializeContexts(const System& system) {
+//    for (int i = 0; i < (int) contexts.size(); i++)
+//        contexts[i]->initialize();
+}
+/**
+ * Synchronize the contexts held by this PlatformData.
+ *
+ * The loop that would flush each context's work thread is commented out,
+ * so this function currently does nothing.
+ */
+void CudaPlatform::PlatformData::syncContexts() {
+//    for (int i = 0; i < (int) contexts.size(); i++)
+//        contexts[i]->getWorkThread().flush();
+}
--- a/platforms/cuda2/src/kernels/utilities.cu
+++ b/platforms/cuda2/src/kernels/utilities.cu
+/**
+ * Helper used by the clear*Buffer(s) kernels below: set the first `size`
+ * ints of `buffer` to zero.
+ *
+ * Most of the buffer is written four ints at a time through an int4 pointer,
+ * with each thread advancing by the total number of threads in the grid.
+ * The size%4 trailing ints are written one at a time by the thread whose
+ * global index is 0.
+ * NOTE(review): the int4 stores assume `buffer` is suitably aligned for int4
+ * access - confirm against the allocation code.
+ */
+__device__ void clearSingleBuffer(int* __restrict__ buffer, int size) {
+    const unsigned int stride = blockDim.x*gridDim.x;
+    const int numQuads = size/4;
+    int4* quads = (int4*) buffer;
+    for (int quad = blockDim.x*blockIdx.x+threadIdx.x; quad < numQuads; quad += stride)
+        quads[quad] = make_int4(0);
+    if (blockDim.x*blockIdx.x+threadIdx.x == 0) {
+        for (int tail = numQuads*4; tail < size; tail++)
+            buffer[tail] = 0;
+    }
+}
+/**
+ * Fill a buffer with 0.
+ *
+ * Kernel entry point that forwards to clearSingleBuffer().
+ *
+ * @param buffer  the buffer to clear
+ * @param size    the number of ints in the buffer
+ */
+__global__ void clearBuffer(int* __restrict__ buffer, int size) {
+    clearSingleBuffer(buffer, size);
+}
+/**
+ * Fill two buffers with 0.
+ *
+ * Each (bufferN, sizeN) pair is passed to clearSingleBuffer() in turn, so both
+ * buffers are cleared by a single kernel launch.
+ */
+__global__ void clearTwoBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+}
+/**
+ * Fill three buffers with 0.
+ *
+ * Each (bufferN, sizeN) pair is passed to clearSingleBuffer() in turn, so all
+ * three buffers are cleared by a single kernel launch.
+ */
+__global__ void clearThreeBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+}
+/**
+ * Fill four buffers with 0.
+ *
+ * Each (bufferN, sizeN) pair is passed to clearSingleBuffer() in turn, so all
+ * four buffers are cleared by a single kernel launch.
+ */
+__global__ void clearFourBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+    clearSingleBuffer(buffer4, size4);
+}
+/**
+ * Fill five buffers with 0.
+ *
+ * Each (bufferN, sizeN) pair is passed to clearSingleBuffer() in turn, so all
+ * five buffers are cleared by a single kernel launch.
+ */
+__global__ void clearFiveBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4, int* __restrict__ buffer5, int size5) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+    clearSingleBuffer(buffer4, size4);
+    clearSingleBuffer(buffer5, size5);
+}
+/**
+ * Fill six buffers with 0.
+ *
+ * Each (bufferN, sizeN) pair is passed to clearSingleBuffer() in turn, so all
+ * six buffers are cleared by a single kernel launch.
+ */
+__global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4, int* __restrict__ buffer5, int size5, int* __restrict__ buffer6, int size6) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+    clearSingleBuffer(buffer4, size4);
+    clearSingleBuffer(buffer5, size5);
+    clearSingleBuffer(buffer6, size6);
+}
+/**
+ * Sum a collection of buffers into the first one.
+ *
+ * `buffer` is treated as numBuffers consecutive segments of bufferSize
+ * float4 elements. For every element position, the values from segments
+ * 1..numBuffers-1 are added to the value in segment 0 and the total is
+ * written back to segment 0. The other segments are left unchanged.
+ */
+__global__ void reduceFloat4Buffer(float4* __restrict__ buffer, int bufferSize, int numBuffers) {
+    const unsigned int stride = blockDim.x*gridDim.x;
+    for (int element = blockDim.x*blockIdx.x+threadIdx.x; element < bufferSize; element += stride) {
+        float4 total = buffer[element];
+        for (int segment = 1; segment < numBuffers; segment++)
+            total += buffer[element+segment*bufferSize];
+        buffer[element] = total;
+    }
+}
+/**
+ * Sum the various buffers containing forces.
+ *
+ * For each element position the x, y and z components are read from
+ * longBuffer at offsets 0, bufferSize and 2*bufferSize, multiplied by
+ * 1/0xFFFFFFFF to convert them to float, and then added to the values at the
+ * same position in each of the numBuffers segments of `buffer`. The total is
+ * stored in the first segment of `buffer`.
+ * NOTE(review): the width of `long` is platform dependent - confirm that it
+ * matches the element size the host code uses for longBuffer.
+ */
+__global__ void reduceForces(const long* __restrict__ longBuffer, float4* __restrict__ buffer, int bufferSize, int numBuffers) {
+    const float fixedToFloat = 1.0f/(float) 0xFFFFFFFF;
+    const unsigned int stride = blockDim.x*gridDim.x;
+    int element = blockDim.x*blockIdx.x+threadIdx.x;
+    while (element < bufferSize) {
+        float4 total = make_float4(fixedToFloat*longBuffer[element], fixedToFloat*longBuffer[element+bufferSize], fixedToFloat*longBuffer[element+2*bufferSize], 0.0f);
+        for (int segment = 0; segment < numBuffers; segment++)
+            total += buffer[element+segment*bufferSize];
+        buffer[element] = total;
+        element += stride;
+    }
+}
--- a/platforms/cuda2/src/kernels/vectorOps.cu
+++ b/platforms/cuda2/src/kernels/vectorOps.cu
+/**
+ * This file defines vector operations to simplify code elsewhere.
+ */
+// Single-argument overloads of make_x(): every component of the result is set to the given value.
+inline __device__ int2 make_int2(int a) {
+    int2 v;
+    v.x = a; v.y = a;
+    return v;
+}
+inline __device__ int3 make_int3(int a) {
+    int3 v;
+    v.x = a; v.y = a; v.z = a;
+    return v;
+}
+inline __device__ int4 make_int4(int a) {
+    int4 v;
+    v.x = a; v.y = a; v.z = a; v.w = a;
+    return v;
+}
+inline __device__ float2 make_float2(float a) {
+    float2 v;
+    v.x = a; v.y = a;
+    return v;
+}
+inline __device__ float3 make_float3(float a) {
+    float3 v;
+    v.x = a; v.y = a; v.z = a;
+    return v;
+}
+inline __device__ float4 make_float4(float a) {
+    float4 v;
+    v.x = a; v.y = a; v.z = a; v.w = a;
+    return v;
+}
+inline __device__ double2 make_double2(double a) {
+    double2 v;
+    v.x = a; v.y = a;
+    return v;
+}
+inline __device__ double3 make_double3(double a) {
+    double3 v;
+    v.x = a; v.y = a; v.z = a;
+    return v;
+}
+inline __device__ double4 make_double4(double a) {
+    double4 v;
+    v.x = a; v.y = a; v.z = a; v.w = a;
+    return v;
+}
+// Negate a vector.
+// Unary minus for int2: returns a vector with each component negated.
+// (This must be operator-, like the other negation overloads; declaring it
+// as unary operator* would leave -v undefined for int2.)
+inline __device__ int2 operator-(int2 a) {
+    return make_int2(-a.x, -a.y);
+}
+// Unary minus for the remaining vector types: each component is negated.
+inline __device__ int3 operator-(int3 a) {
+    a.x = -a.x; a.y = -a.y; a.z = -a.z;
+    return a;
+}
+inline __device__ int4 operator-(int4 a) {
+    a.x = -a.x; a.y = -a.y; a.z = -a.z; a.w = -a.w;
+    return a;
+}
+inline __device__ float2 operator-(float2 a) {
+    a.x = -a.x; a.y = -a.y;
+    return a;
+}
+inline __device__ float3 operator-(float3 a) {
+    a.x = -a.x; a.y = -a.y; a.z = -a.z;
+    return a;
+}
+inline __device__ float4 operator-(float4 a) {
+    a.x = -a.x; a.y = -a.y; a.z = -a.z; a.w = -a.w;
+    return a;
+}
+inline __device__ double2 operator-(double2 a) {
+    a.x = -a.x; a.y = -a.y;
+    return a;
+}
+inline __device__ double3 operator-(double3 a) {
+    a.x = -a.x; a.y = -a.y; a.z = -a.z;
+    return a;
+}
+inline __device__ double4 operator-(double4 a) {
+    a.x = -a.x; a.y = -a.y; a.z = -a.z; a.w = -a.w;
+    return a;
+}
+// Component-wise sum of two vectors. The by-value copy of the left operand
+// is updated and returned.
+inline __device__ int2 operator+(int2 a, int2 b) {
+    a.x += b.x; a.y += b.y;
+    return a;
+}
+inline __device__ int3 operator+(int3 a, int3 b) {
+    a.x += b.x; a.y += b.y; a.z += b.z;
+    return a;
+}
+inline __device__ int4 operator+(int4 a, int4 b) {
+    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
+    return a;
+}
+inline __device__ float2 operator+(float2 a, float2 b) {
+    a.x += b.x; a.y += b.y;
+    return a;
+}
+inline __device__ float3 operator+(float3 a, float3 b) {
+    a.x += b.x; a.y += b.y; a.z += b.z;
+    return a;
+}
+inline __device__ float4 operator+(float4 a, float4 b) {
+    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
+    return a;
+}
+inline __device__ double2 operator+(double2 a, double2 b) {
+    a.x += b.x; a.y += b.y;
+    return a;
+}
+inline __device__ double3 operator+(double3 a, double3 b) {
+    a.x += b.x; a.y += b.y; a.z += b.z;
+    return a;
+}
+inline __device__ double4 operator+(double4 a, double4 b) {
+    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
+    return a;
+}
+// Component-wise difference of two vectors (a minus b). The by-value copy of
+// the left operand is updated and returned.
+inline __device__ int2 operator-(int2 a, int2 b) {
+    a.x -= b.x; a.y -= b.y;
+    return a;
+}
+inline __device__ int3 operator-(int3 a, int3 b) {
+    a.x -= b.x; a.y -= b.y; a.z -= b.z;
+    return a;
+}
+inline __device__ int4 operator-(int4 a, int4 b) {
+    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
+    return a;
+}
+inline __device__ float2 operator-(float2 a, float2 b) {
+    a.x -= b.x; a.y -= b.y;
+    return a;
+}
+inline __device__ float3 operator-(float3 a, float3 b) {
+    a.x -= b.x; a.y -= b.y; a.z -= b.z;
+    return a;
+}
+inline __device__ float4 operator-(float4 a, float4 b) {
+    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
+    return a;
+}
+inline __device__ double2 operator-(double2 a, double2 b) {
+    a.x -= b.x; a.y -= b.y;
+    return a;
+}
+inline __device__ double3 operator-(double3 a, double3 b) {
+    a.x -= b.x; a.y -= b.y; a.z -= b.z;
+    return a;
+}
+inline __device__ double4 operator-(double4 a, double4 b) {
+    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
+    return a;
+}
+// Component-wise product of two vectors. The by-value copy of the left
+// operand is updated and returned.
+inline __device__ int2 operator*(int2 a, int2 b) {
+    a.x *= b.x; a.y *= b.y;
+    return a;
+}
+inline __device__ int3 operator*(int3 a, int3 b) {
+    a.x *= b.x; a.y *= b.y; a.z *= b.z;
+    return a;
+}
+inline __device__ int4 operator*(int4 a, int4 b) {
+    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
+    return a;
+}
+inline __device__ float2 operator*(float2 a, float2 b) {
+    a.x *= b.x; a.y *= b.y;
+    return a;
+}
+inline __device__ float3 operator*(float3 a, float3 b) {
+    a.x *= b.x; a.y *= b.y; a.z *= b.z;
+    return a;
+}
+inline __device__ float4 operator*(float4 a, float4 b) {
+    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
+    return a;
+}
+inline __device__ double2 operator*(double2 a, double2 b) {
+    a.x *= b.x; a.y *= b.y;
+    return a;
+}
+inline __device__ double3 operator*(double3 a, double3 b) {
+    a.x *= b.x; a.y *= b.y; a.z *= b.z;
+    return a;
+}
+inline __device__ double4 operator*(double4 a, double4 b) {
+    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
+    return a;
+}
+// Component-wise quotient of two vectors (a divided by b). The by-value copy
+// of the left operand is updated and returned.
+inline __device__ int2 operator/(int2 a, int2 b) {
+    a.x /= b.x; a.y /= b.y;
+    return a;
+}
+inline __device__ int3 operator/(int3 a, int3 b) {
+    a.x /= b.x; a.y /= b.y; a.z /= b.z;
+    return a;
+}
+inline __device__ int4 operator/(int4 a, int4 b) {
+    a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
+    return a;
+}
+inline __device__ float2 operator/(float2 a, float2 b) {
+    a.x /= b.x; a.y /= b.y;
+    return a;
+}
+inline __device__ float3 operator/(float3 a, float3 b) {
+    a.x /= b.x; a.y /= b.y; a.z /= b.z;
+    return a;
+}
+inline __device__ float4 operator/(float4 a, float4 b) {
+    a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
+    return a;
+}
+inline __device__ double2 operator/(double2 a, double2 b) {
+    a.x /= b.x; a.y /= b.y;
+    return a;
+}
+inline __device__ double3 operator/(double3 a, double3 b) {
+    a.x /= b.x; a.y /= b.y; a.z /= b.z;
+    return a;
+}
+inline __device__ double4 operator/(double4 a, double4 b) {
+    a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
+    return a;
+}
+// In-place component-wise addition: a is replaced by a+b.
+inline __device__ void operator+=(int2& a, int2 b) {
+    a = make_int2(a.x+b.x, a.y+b.y);
+}
+inline __device__ void operator+=(int3& a, int3 b) {
+    a = make_int3(a.x+b.x, a.y+b.y, a.z+b.z);
+}
+inline __device__ void operator+=(int4& a, int4 b) {
+    a = make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+}
+inline __device__ void operator+=(float2& a, float2 b) {
+    a = make_float2(a.x+b.x, a.y+b.y);
+}
+inline __device__ void operator+=(float3& a, float3 b) {
+    a = make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
+}
+inline __device__ void operator+=(float4& a, float4 b) {
+    a = make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+}
+inline __device__ void operator+=(double2& a, double2 b) {
+    a = make_double2(a.x+b.x, a.y+b.y);
+}
+inline __device__ void operator+=(double3& a, double3 b) {
+    a = make_double3(a.x+b.x, a.y+b.y, a.z+b.z);
+}
+inline __device__ void operator+=(double4& a, double4 b) {
+    a = make_double4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+}
+// In-place component-wise subtraction: a is replaced by a-b.
+inline __device__ void operator-=(int2& a, int2 b) {
+    a = make_int2(a.x-b.x, a.y-b.y);
+}
+inline __device__ void operator-=(int3& a, int3 b) {
+    a = make_int3(a.x-b.x, a.y-b.y, a.z-b.z);
+}
+inline __device__ void operator-=(int4& a, int4 b) {
+    a = make_int4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+}
+inline __device__ void operator-=(float2& a, float2 b) {
+    a = make_float2(a.x-b.x, a.y-b.y);
+}
+inline __device__ void operator-=(float3& a, float3 b) {
+    a = make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
+}
+inline __device__ void operator-=(float4& a, float4 b) {
+    a = make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+}
+inline __device__ void operator-=(double2& a, double2 b) {
+    a = make_double2(a.x-b.x, a.y-b.y);
+}
+inline __device__ void operator-=(double3& a, double3 b) {
+    a = make_double3(a.x-b.x, a.y-b.y, a.z-b.z);
+}
+inline __device__ void operator-=(double4& a, double4 b) {
+    a = make_double4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+}
+// In-place component-wise multiplication: a is replaced by a*b.
+inline __device__ void operator*=(int2& a, int2 b) {
+    a = make_int2(a.x*b.x, a.y*b.y);
+}
+inline __device__ void operator*=(int3& a, int3 b) {
+    a = make_int3(a.x*b.x, a.y*b.y, a.z*b.z);
+}
+inline __device__ void operator*=(int4& a, int4 b) {
+    a = make_int4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+}
+inline __device__ void operator*=(float2& a, float2 b) {
+    a = make_float2(a.x*b.x, a.y*b.y);
+}
+inline __device__ void operator*=(float3& a, float3 b) {
+    a = make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+}
+inline __device__ void operator*=(float4& a, float4 b) {
+    a = make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+}
+inline __device__ void operator*=(double2& a, double2 b) {
+    a = make_double2(a.x*b.x, a.y*b.y);
+}
+inline __device__ void operator*=(double3& a, double3 b) {
+    a = make_double3(a.x*b.x, a.y*b.y, a.z*b.z);
+}
+inline __device__ void operator*=(double4& a, double4 b) {
+    a = make_double4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+}
+// In-place component-wise division: a is replaced by a/b.
+inline __device__ void operator/=(int2& a, int2 b) {
+    a = make_int2(a.x/b.x, a.y/b.y);
+}
+inline __device__ void operator/=(int3& a, int3 b) {
+    a = make_int3(a.x/b.x, a.y/b.y, a.z/b.z);
+}
+inline __device__ void operator/=(int4& a, int4 b) {
+    a = make_int4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+}
+inline __device__ void operator/=(float2& a, float2 b) {
+    a = make_float2(a.x/b.x, a.y/b.y);
+}
+inline __device__ void operator/=(float3& a, float3 b) {
+    a = make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
+}
+inline __device__ void operator/=(float4& a, float4 b) {
+    a = make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+}
+inline __device__ void operator/=(double2& a, double2 b) {
+    a = make_double2(a.x/b.x, a.y/b.y);
+}
+inline __device__ void operator/=(double3& a, double3 b) {
+    a = make_double3(a.x/b.x, a.y/b.y, a.z/b.z);
+}
+inline __device__ void operator/=(double4& a, double4 b) {
+    a = make_double4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+}
+// Scale a vector by a scalar. Both argument orders (vector*scalar and
+// scalar*vector) are provided for each type; the by-value vector argument is
+// updated and returned.
+inline __device__ int2 operator*(int2 a, int b) {
+    a.x = a.x*b; a.y = a.y*b;
+    return a;
+}
+inline __device__ int3 operator*(int3 a, int b) {
+    a.x = a.x*b; a.y = a.y*b; a.z = a.z*b;
+    return a;
+}
+inline __device__ int4 operator*(int4 a, int b) {
+    a.x = a.x*b; a.y = a.y*b; a.z = a.z*b; a.w = a.w*b;
+    return a;
+}
+inline __device__ int2 operator*(int a, int2 b) {
+    b.x = a*b.x; b.y = a*b.y;
+    return b;
+}
+inline __device__ int3 operator*(int a, int3 b) {
+    b.x = a*b.x; b.y = a*b.y; b.z = a*b.z;
+    return b;
+}
+inline __device__ int4 operator*(int a, int4 b) {
+    b.x = a*b.x; b.y = a*b.y; b.z = a*b.z; b.w = a*b.w;
+    return b;
+}
+inline __device__ float2 operator*(float2 a, float b) {
+    a.x = a.x*b; a.y = a.y*b;
+    return a;
+}
+inline __device__ float3 operator*(float3 a, float b) {
+    a.x = a.x*b; a.y = a.y*b; a.z = a.z*b;
+    return a;
+}
+inline __device__ float4 operator*(float4 a, float b) {
+    a.x = a.x*b; a.y = a.y*b; a.z = a.z*b; a.w = a.w*b;
+    return a;
+}
+inline __device__ float2 operator*(float a, float2 b) {
+    b.x = a*b.x; b.y = a*b.y;
+    return b;
+}
+inline __device__ float3 operator*(float a, float3 b) {
+    b.x = a*b.x; b.y = a*b.y; b.z = a*b.z;
+    return b;
+}
+inline __device__ float4 operator*(float a, float4 b) {
+    b.x = a*b.x; b.y = a*b.y; b.z = a*b.z; b.w = a*b.w;
+    return b;
+}
+inline __device__ double2 operator*(double2 a, double b) {
+    a.x = a.x*b; a.y = a.y*b;
+    return a;
+}
+inline __device__ double3 operator*(double3 a, double b) {
+    a.x = a.x*b; a.y = a.y*b; a.z = a.z*b;
+    return a;
+}
+inline __device__ double4 operator*(double4 a, double b) {
+    a.x = a.x*b; a.y = a.y*b; a.z = a.z*b; a.w = a.w*b;
+    return a;
+}
+inline __device__ double2 operator*(double a, double2 b) {
+    b.x = a*b.x; b.y = a*b.y;
+    return b;
+}
+inline __device__ double3 operator*(double a, double3 b) {
+    b.x = a*b.x; b.y = a*b.y; b.z = a*b.z;
+    return b;
+}
+inline __device__ double4 operator*(double a, double4 b) {
+    b.x = a*b.x; b.y = a*b.y; b.z = a*b.z; b.w = a*b.w;
+    return b;
+}