Deleted the old CUDA platform

2e451b9d · Peter Eastman · 352e2fc7 · 352e2fc7 · 352e2fc7 · 352e2fc7
Commit 2e451b9d authored Dec 13, 2012 by Peter Eastman
20 changed files
--- a/platforms/cuda-old/CMakeLists.txt
+++ b/platforms/cuda-old/CMakeLists.txt
-#---------------------------------------------------
-# OpenMM CUDA Platform
-#
-# Creates OpenMM library, base name=OpenMMCuda.
-# Default libraries are shared & optimized. Variants
-# are created for static (_static) and debug (_d).
-#
-# Windows:
-#   OpenMMCuda[_d].dll
-#   OpenMMCuda[_d].lib
-#   OpenMMCuda_static[_d].lib
-# Unix:
-#   libOpenMMCuda[_d].so
-#   libOpenMMCuda_static[_d].a
-#----------------------------------------------------
-
-set(OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL "Whether to build CUDA test cases")
-if(OPENMM_BUILD_CUDA_TESTS)
-    SUBDIRS (tests)
-endif(OPENMM_BUILD_CUDA_TESTS)
-
-# The source is organized into subdirectories, but we handle them all from
-# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
-SET(OPENMM_SOURCE_SUBDIRS .)
-
-
-# Collect up information about the version of the OpenMM library we're building
-# and make it available to the code so it can be built into the binaries.
-
-SET(OPENMMCUDA_LIBRARY_NAME OpenMMCuda)
-
-SET(SHARED_TARGET ${OPENMMCUDA_LIBRARY_NAME})
-SET(STATIC_TARGET ${OPENMMCUDA_LIBRARY_NAME}_static)
-
-
-# Ensure that debug libraries have "_d" appended to their names.
-# CMake gets this right on Windows automatically with this definition.
-IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
-    SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
-ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
-
-# But on Unix or Cygwin we have to add the suffix manually
-IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
-    SET(SHARED_TARGET ${SHARED_TARGET}_d)
-    SET(STATIC_TARGET ${STATIC_TARGET}_d)
-ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
-
-
-# These are all the places to search for header files which are
-# to be part of the API.
-SET(API_INCLUDE_DIRS) # start empty
-FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
-    # append
-    SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS}
-                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include 
-                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
-ENDFOREACH(subdir)
-
-# We'll need both *relative* path names, starting with their API_INCLUDE_DIRS,
-# and absolute pathnames.
-SET(API_REL_INCLUDE_FILES)   # start these out empty
-SET(API_ABS_INCLUDE_FILES)
-
-FOREACH(dir ${API_INCLUDE_DIRS})
-    FILE(GLOB fullpaths ${dir}/*.h)	# returns full pathnames
-    SET(API_ABS_INCLUDE_FILES ${API_ABS_INCLUDE_FILES} ${fullpaths})
-
-    FOREACH(pathname ${fullpaths})
-        GET_FILENAME_COMPONENT(filename ${pathname} NAME)
-        SET(API_REL_INCLUDE_FILES ${API_REL_INCLUDE_FILES} ${dir}/${filename})
-    ENDFOREACH(pathname)
-ENDFOREACH(dir)
-
-# collect up source files
-SET(SOURCE_FILES) # empty
-SET(SOURCE_INCLUDE_FILES)
-
-FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
-    FILE(GLOB_RECURSE src_files  ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.c)
-    FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
-    SET(SOURCE_FILES         ${SOURCE_FILES}         ${src_files})   #append
-    SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files})
-    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
-ENDFOREACH(subdir)
-
-INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
-
-# SET(FINDCUDA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake)
-
-SUBDIRS (sharedTarget)
--- a/platforms/cuda-old/include/CudaKernelFactory.h
+++ b/platforms/cuda-old/include/CudaKernelFactory.h
-#ifndef OPENMM_CUDAKERNELFACTORY_H_
-#define OPENMM_CUDAKERNELFACTORY_H_
-
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "openmm/KernelFactory.h"
-#include "windowsExportCuda.h"
-
-namespace OpenMM {
-
-/**
- * This KernelFactory creates all kernels for CudaPlatform.
- */
-
-class CudaKernelFactory : public KernelFactory {
-public:
-    OPENMMCUDA_EXPORT KernelImpl* createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const;
-};
-
-} // namespace OpenMM
-
-#endif /*OPENMM_CUDAKERNELFACTORY_H_*/
--- a/platforms/cuda-old/include/CudaPlatform.h
+++ b/platforms/cuda-old/include/CudaPlatform.h
-#ifndef OPENMM_CUDAPLATFORM_H_
-#define OPENMM_CUDAPLATFORM_H_
-
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "openmm/Platform.h"
-#include "windowsExportCuda.h"
-
-struct _gpuContext;
-
-namespace OpenMM {
-
-/**
- * This Platform subclass uses CUDA implementations of the OpenMM kernels to run on NVidia GPUs.
- */
-
-class OPENMMCUDA_EXPORT CudaPlatform : public Platform {
-public:
-    class PlatformData;
-    CudaPlatform();
-    const std::string& getName() const {
-        static const std::string name = "Cuda";
-        return name;
-    }
-    double getSpeed() const {
-        return 50;
-    }
-    bool supportsDoublePrecision() const;
-    const std::string& getPropertyValue(const Context& context, const std::string& property) const;
-    void setPropertyValue(Context& context, const std::string& property, const std::string& value) const;
-    void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
-    void contextDestroyed(ContextImpl& context) const;
-    /**
-     * This is the name of the parameter for selecting which CUDA device to use.
-     */
-    static const std::string& CudaDevice() {
-        static const std::string key = "CudaDevice";
-        return key;
-    }
-    /**
-     * This is the name of the parameter for selecting whether CUDA should sync or spin loop while waiting for results.
-     */
-    static const std::string& CudaUseBlockingSync() {
-        static const std::string key = "CudaUseBlockingSync";
-        return key;
-    }
-};
-
-class CudaPlatform::PlatformData {
-public:
-    OPENMMCUDA_EXPORT PlatformData(_gpuContext* gpu);
-    _gpuContext* gpu;
-    bool removeCM;
-    bool hasBonds, hasAngles, hasPeriodicTorsions, hasRB, hasNonbonded, hasCustomNonbonded;
-    int nonbondedMethod, customNonbondedMethod;
-    int cmMotionFrequency;
-    int stepCount, computeForceCount;
-    double time, ewaldSelfEnergy, dispersionCoefficient;
-    std::map<std::string, std::string> propertyValues;
-};
-
-} // namespace OpenMM
-
-#endif /*OPENMM_CUDAPLATFORM_H_*/
--- a/platforms/cuda-old/include/windowsExportCuda.h
+++ b/platforms/cuda-old/include/windowsExportCuda.h
-#ifndef OPENMM_WINDOWSEXPORTCUDA_H_
-#define OPENMM_WINDOWSEXPORTCUDA_H_
-
-/*
- * Shared libraries are messy in Visual Studio. We have to distinguish three
- * cases:
- *   (1) this header is being used to build the OpenMM shared library
- *       (dllexport)
- *   (2) this header is being used by a *client* of the OpenMM shared
- *       library (dllimport)
- *   (3) we are building the OpenMM static library, or the client is
- *       being compiled with the expectation of linking with the
- *       OpenMM static library (nothing special needed)
- * In the CMake script for building this library, we define one of the symbols
- *     OpenMMCUDA_BUILDING_{SHARED|STATIC}_LIBRARY
- * Client code normally has no special symbol defined, in which case we'll
- * assume it wants to use the shared library. However, if the client defines
- * the symbol OPENMM_USE_STATIC_LIBRARIES we'll suppress the dllimport so
- * that the client code can be linked with static libraries. Note that
- * the client symbol is not library dependent, while the library symbols
- * affect only the OpenMM library, meaning that other libraries can
- * be clients of this one. However, we are assuming all-static or all-shared.
- */
-
-#ifdef _MSC_VER
-    // We don't want to hear about how sprintf is "unsafe".
-    #pragma warning(disable:4996)
-    // Keep MS VC++ quiet about lack of dll export of private members.
-    #pragma warning(disable:4251)
-    #if defined(OPENMMCUDA_BUILDING_SHARED_LIBRARY)
-        #define OPENMMCUDA_EXPORT __declspec(dllexport)
-    #elif defined(OPENMMCUDA_BUILDING_STATIC_LIBRARY) || defined(OPENMMCUDA_USE_STATIC_LIBRARIES)
-		#define OPENMMCUDA_EXPORT
-    #else
-		#define OPENMMCUDA_EXPORT __declspec(dllimport)   // i.e., a client of a shared library
-    #endif
-#else
-    #define OPENMMCUDA_EXPORT // Linux, Mac
-#endif
-
-#endif // OPENMM_WINDOWSEXPORTCUDA_H_
--- a/platforms/cuda-old/sharedTarget/CMakeLists.txt
+++ b/platforms/cuda-old/sharedTarget/CMakeLists.txt
-#
-# Include CUDA related files.
-#
-# INCLUDE(${FINDCUDA_DIR}/FindCuda.cmake)
-INCLUDE_DIRECTORIES(${CUDA_INCLUDE})
-LINK_DIRECTORIES(${CUDA_TARGET_LINK})
-FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
-    FILE(GLOB src_files ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src/*.cu  ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src/*/*.cu)
-    SET(SOURCE_FILES ${SOURCE_FILES} ${src_files})
-    CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/include)
-    CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src)
-ENDFOREACH(subdir)
-CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/jama/include)
-CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/openmmapi/include)
-
-IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
-    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME}_d)
-ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
-    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
-ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
-
-IF(APPLE AND CMAKE_OSX_ARCHITECTURES AND CMAKE_OSX_ARCHITECTURES MATCHES .*i386.* AND CMAKE_OSX_ARCHITECTURES MATCHES .*x86_64.*)
-    # NVCC doesn't know how to build universal binaries, so we need to build two separate versions.
-
-    SET(BASE_FLAGS ${CUDA_NVCC_FLAGS})
-    SET(CMAKE_OSX_ARCHITECTURES i386)
-    SET(CUDA_NVCC_FLAGS ${BASE_FLAGS} -m32)
-    CUDA_ADD_LIBRARY("${SHARED_TARGET}32" SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
-    TARGET_LINK_LIBRARIES(${SHARED_TARGET}32 ${MAIN_OPENMM_LIB} ${CUFFT_TARGET_LINK})
-    SET_TARGET_PROPERTIES(${SHARED_TARGET}32 PROPERTIES COMPILE_FLAGS "-DOPENMMCUDA_BUILDING_SHARED_LIBRARY")
-    SET(CMAKE_OSX_ARCHITECTURES x86_64)
-    SET(CUDA_NVCC_FLAGS ${BASE_FLAGS} -m64)
-    CUDA_ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
-    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUFFT_TARGET_LINK})
-    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMMCUDA_BUILDING_SHARED_LIBRARY")
-    ADD_DEPENDENCIES(${SHARED_TARGET} "${SHARED_TARGET}32")
-
-    # Join them into a single universal binary.
-
-    ADD_CUSTOM_COMMAND(
-        TARGET ${SHARED_TARGET}
-        POST_BUILD
-        COMMAND /usr/bin/lipo lib${SHARED_TARGET}.dylib lib${SHARED_TARGET}32.dylib -create -output lib${SHARED_TARGET}.dylib
-        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
-        COMMENT "Creating universal binary")
-ELSE(APPLE AND CMAKE_OSX_ARCHITECTURES AND CMAKE_OSX_ARCHITECTURES MATCHES .*i386.* AND CMAKE_OSX_ARCHITECTURES MATCHES .*x86_64.*)
-    CUDA_ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
-    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUFFT_TARGET_LINK})
-    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMMCUDA_BUILDING_SHARED_LIBRARY")
-ENDIF(APPLE AND CMAKE_OSX_ARCHITECTURES AND CMAKE_OSX_ARCHITECTURES MATCHES .*i386.* AND CMAKE_OSX_ARCHITECTURES MATCHES .*x86_64.*)
-
-INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/platforms/cuda-old/src/CudaForceInfo.cpp
+++ b/platforms/cuda-old/src/CudaForceInfo.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "CudaForceInfo.h"
-
-using namespace OpenMM;
-using namespace std;
-
-bool CudaForceInfo::areParticlesIdentical(int particle1, int particle2) {
-    return true;
-}
-
-int CudaForceInfo::getNumParticleGroups() {
-    return 0;
-}
-
-void CudaForceInfo::getParticlesInGroup(int index, vector<int>& particles) {
-    return;
-}
-
-bool CudaForceInfo::areGroupsIdentical(int group1, int group2) {
-    return true;
-}
--- a/platforms/cuda-old/src/CudaForceInfo.h
+++ b/platforms/cuda-old/src/CudaForceInfo.h
-#ifndef OPENMM_CUDAFORCEINFO_H_
-#define OPENMM_CUDAFORCEINFO_H_
-
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "openmm/internal/windowsExport.h"
-#include <vector>
-
-namespace OpenMM {
-
-/**
- * This class is used by the Cuda implementation of a Force class to convey information
- * about the behavior and requirements of that force.
- */
-
-class CudaForceInfo {
-public:
-    CudaForceInfo() {
-    }
-    virtual ~CudaForceInfo() {
-    }
-    /**
-     * Get whether or not two particles have identical force field parameters.
-     */
-    virtual OPENMM_EXPORT bool areParticlesIdentical(int particle1, int particle2);
-    /**
-     * Get the number of particle groups defined by this force.
-     */
-    virtual OPENMM_EXPORT int getNumParticleGroups();
-    /**
-     * Get the list of particles in a particular group.
-     */
-    virtual OPENMM_EXPORT void getParticlesInGroup(int index, std::vector<int>& particles);
-    /**
-     * Get whether two particle groups are identical.
-     */
-    virtual OPENMM_EXPORT bool areGroupsIdentical(int group1, int group2);
-};
-
-} // namespace OpenMM
-
-#endif /*OPENMM_CUDAFORCEINFO_H_*/
--- a/platforms/cuda-old/src/CudaKernelFactory.cpp
+++ b/platforms/cuda-old/src/CudaKernelFactory.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "CudaKernelFactory.h"
-#include "CudaKernels.h"
-#include "openmm/internal/ContextImpl.h"
-#include "openmm/OpenMMException.h"
-
-using namespace OpenMM;
-
-OPENMMCUDA_EXPORT KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
-    CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
-    if (name == CalcForcesAndEnergyKernel::Name())
-        return new CudaCalcForcesAndEnergyKernel(name, platform, data);
-    if (name == UpdateStateDataKernel::Name())
-        return new CudaUpdateStateDataKernel(name, platform, data);
-    if (name == ApplyConstraintsKernel::Name())
-        return new CudaApplyConstraintsKernel(name, platform, data);
-    if (name == VirtualSitesKernel::Name())
-        return new CudaVirtualSitesKernel(name, platform);
-    if (name == CalcHarmonicBondForceKernel::Name())
-        return new CudaCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
-    if (name == CalcCustomBondForceKernel::Name())
-        return new CudaCalcCustomBondForceKernel(name, platform, data, context.getSystem());
-    if (name == CalcHarmonicAngleForceKernel::Name())
-        return new CudaCalcHarmonicAngleForceKernel(name, platform, data, context.getSystem());
-    if (name == CalcCustomAngleForceKernel::Name())
-        return new CudaCalcCustomAngleForceKernel(name, platform, data, context.getSystem());
-    if (name == CalcPeriodicTorsionForceKernel::Name())
-        return new CudaCalcPeriodicTorsionForceKernel(name, platform, data, context.getSystem());
-    if (name == CalcRBTorsionForceKernel::Name())
-        return new CudaCalcRBTorsionForceKernel(name, platform, data, context.getSystem());
-    if (name == CalcCMAPTorsionForceKernel::Name())
-        return new CudaCalcCMAPTorsionForceKernel(name, platform, data, context.getSystem());
-    if (name == CalcCustomTorsionForceKernel::Name())
-        return new CudaCalcCustomTorsionForceKernel(name, platform, data, context.getSystem());
-    if (name == CalcNonbondedForceKernel::Name())
-        return new CudaCalcNonbondedForceKernel(name, platform, data, context.getSystem());
-    if (name == CalcCustomNonbondedForceKernel::Name())
-        return new CudaCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
-    if (name == CalcGBSAOBCForceKernel::Name())
-        return new CudaCalcGBSAOBCForceKernel(name, platform, data);
-    if (name == CalcGBVIForceKernel::Name())
-        return new CudaCalcGBVIForceKernel(name, platform, data);
-    if (name == CalcCustomExternalForceKernel::Name())
-        return new CudaCalcCustomExternalForceKernel(name, platform, data, context.getSystem());
-    if (name == IntegrateVerletStepKernel::Name())
-        return new CudaIntegrateVerletStepKernel(name, platform, data);
-    if (name == IntegrateLangevinStepKernel::Name())
-        return new CudaIntegrateLangevinStepKernel(name, platform, data);
-    if (name == IntegrateBrownianStepKernel::Name())
-        return new CudaIntegrateBrownianStepKernel(name, platform, data);
-    if (name == IntegrateVariableVerletStepKernel::Name())
-        return new CudaIntegrateVariableVerletStepKernel(name, platform, data);
-    if (name == IntegrateVariableLangevinStepKernel::Name())
-        return new CudaIntegrateVariableLangevinStepKernel(name, platform, data);
-    if (name == ApplyAndersenThermostatKernel::Name())
-        return new CudaApplyAndersenThermostatKernel(name, platform, data);
-    if (name == ApplyMonteCarloBarostatKernel::Name())
-        return new CudaApplyMonteCarloBarostatKernel(name, platform, data);
-    if (name == CalcKineticEnergyKernel::Name())
-        return new CudaCalcKineticEnergyKernel(name, platform, data);
-    if (name == RemoveCMMotionKernel::Name())
-        return new CudaRemoveCMMotionKernel(name, platform, data);
-    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
-}
--- a/platforms/cuda-old/src/CudaKernels.cpp
+++ b/platforms/cuda-old/src/CudaKernels.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "CudaKernels.h"
-#include "CudaForceInfo.h"
-#include "openmm/LangevinIntegrator.h"
-#include "openmm/Context.h"
-#include "openmm/OpenMMException.h"
-#include "openmm/internal/AndersenThermostatImpl.h"
-#include "openmm/internal/CMAPTorsionForceImpl.h"
-#include "openmm/internal/ContextImpl.h"
-#include "openmm/internal/NonbondedForceImpl.h"
-#include "kernels/gputypes.h"
-#include "kernels/cudaKernels.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
-#include <cmath>
-
-extern "C" int OPENMMCUDA_EXPORT gpuSetConstants( gpuContext gpu );
-
-using namespace OpenMM;
-using namespace std;
-
-void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {
-}
-
-void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
-    _gpuContext* gpu = data.gpu;
-    if (data.nonbondedMethod != NO_CUTOFF && data.computeForceCount%100 == 0)
-        gpuReorderAtoms(gpu);
-    if ((data.hasNonbonded && data.nonbondedMethod != NO_CUTOFF && data.nonbondedMethod != CUTOFF) ||
-        (data.hasCustomNonbonded && data.customNonbondedMethod != NO_CUTOFF && data.customNonbondedMethod != CUTOFF)) {
-        double minAllowedSize = 1.999999*gpu->sim.nonbondedCutoff;
-        if (gpu->sim.periodicBoxSizeX < minAllowedSize || gpu->sim.periodicBoxSizeY < minAllowedSize || gpu->sim.periodicBoxSizeZ < minAllowedSize)
-            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
-    }
-    data.computeForceCount++;
-    if (gpu->bIncludeGBSA || gpu->bIncludeGBVI)
-        kClearBornSumAndForces(gpu);
-    else if (includeForces)
-        kClearForces(gpu);
-    if (includeEnergy)
-        kClearEnergy(gpu);
-}
-
-double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
-    _gpuContext* gpu = data.gpu;
-    if (gpu->bIncludeGBSA || gpu->bIncludeGBVI) {
-        gpu->bRecalculateBornRadii = true;
-        kCalculateCDLJObcGbsaForces1(gpu);
-        kReduceObcGbsaBornForces(gpu);
-        if (gpu->bIncludeGBSA ) {
-           kCalculateObcGbsaForces2(gpu);
-        } else {
-           kCalculateGBVIForces2(gpu);
-        }
-    }
-    else if (data.hasNonbonded)
-        kCalculateCDLJForces(gpu);
-    if (data.hasCustomNonbonded)
-        kCalculateCustomNonbondedForces(gpu, data.hasNonbonded);
-    kCalculateLocalForces(gpu);
-    if (includeForces)
-        kReduceForces(gpu);
-    double energy = 0.0;
-    if (includeEnergy) {
-        energy = kReduceEnergy(gpu)+data.ewaldSelfEnergy;
-        if (data.dispersionCoefficient != 0.0)
-            energy += data.dispersionCoefficient/(gpu->sim.periodicBoxSizeX*gpu->sim.periodicBoxSizeY*gpu->sim.periodicBoxSizeZ);
-    }
-    return energy;
-}
-
-void CudaUpdateStateDataKernel::initialize(const System& system) {
-}
-
-double CudaUpdateStateDataKernel::getTime(const ContextImpl& context) const {
-    return data.time;
-}
-
-void CudaUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
-    data.time = time;
-}
-
-void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, std::vector<Vec3>& positions) {
-    _gpuContext* gpu = data.gpu;
-    gpu->psPosq4->Download();
-    int* order = gpu->psAtomIndex->_pSysData;
-    int numParticles = context.getSystem().getNumParticles();
-    positions.resize(numParticles);
-    for (int i = 0; i < numParticles; ++i) {
-        float4 pos = (*gpu->psPosq4)[i];
-        int3 offset = gpu->posCellOffsets[i];
-        positions[order[i]] = Vec3(pos.x-offset.x*gpu->sim.periodicBoxSizeX, pos.y-offset.y*gpu->sim.periodicBoxSizeY, pos.z-offset.z*gpu->sim.periodicBoxSizeZ);
-    }
-}
-
-void CudaUpdateStateDataKernel::setPositions(ContextImpl& context, const std::vector<Vec3>& positions) {
-    _gpuContext* gpu = data.gpu;
-    int* order = gpu->psAtomIndex->_pSysData;
-    int numParticles = context.getSystem().getNumParticles();
-    for (int i = 0; i < numParticles; ++i) {
-        float4& pos = (*gpu->psPosq4)[i];
-        const Vec3& p = positions[order[i]];
-        pos.x = (float) p[0];
-        pos.y = (float) p[1];
-        pos.z = (float) p[2];
-    }
-    gpu->psPosq4->Upload();
-    for (int i = 0; i < (int) gpu->posCellOffsets.size(); i++)
-        gpu->posCellOffsets[i] = make_int3(0, 0, 0);
-}
-
-void CudaUpdateStateDataKernel::getVelocities(ContextImpl& context, std::vector<Vec3>& velocities) {
-    _gpuContext* gpu = data.gpu;
-    gpu->psVelm4->Download();
-    int* order = gpu->psAtomIndex->_pSysData;
-    int numParticles = context.getSystem().getNumParticles();
-    velocities.resize(numParticles);
-    for (int i = 0; i < numParticles; ++i) {
-        float4 vel = (*gpu->psVelm4)[i];
-        velocities[order[i]] = Vec3(vel.x, vel.y, vel.z);
-    }
-}
-
-void CudaUpdateStateDataKernel::setVelocities(ContextImpl& context, const std::vector<Vec3>& velocities) {
-    _gpuContext* gpu = data.gpu;
-    int* order = gpu->psAtomIndex->_pSysData;
-    int numParticles = context.getSystem().getNumParticles();
-    for (int i = 0; i < numParticles; ++i) {
-        float4& vel = (*gpu->psVelm4)[i];
-        const Vec3& v = velocities[order[i]];
-        vel.x = (float) v[0];
-        vel.y = (float) v[1];
-        vel.z = (float) v[2];
-    }
-    gpu->psVelm4->Upload();
-}
-
-void CudaUpdateStateDataKernel::getForces(ContextImpl& context, std::vector<Vec3>& forces) {
-    _gpuContext* gpu = data.gpu;
-    int* order = gpu->psAtomIndex->_pSysData;
-    gpu->psForce4->Download();
-    int numParticles = context.getSystem().getNumParticles();
-    forces.resize(numParticles);
-    for (int i = 0; i < numParticles; ++i) {
-        float4 force = (*gpu->psForce4)[i];
-        forces[order[i]] = Vec3(force.x, force.y, force.z);
-    }
-}
-
-void CudaUpdateStateDataKernel::getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const {
-    _gpuContext* gpu = data.gpu;
-    a = Vec3(gpu->sim.periodicBoxSizeX, 0, 0);
-    b = Vec3(0, gpu->sim.periodicBoxSizeY, 0);
-    c = Vec3(0, 0, gpu->sim.periodicBoxSizeZ);
-}
-
-void CudaUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) const {
-    _gpuContext* gpu = data.gpu;
-    gpuSetPeriodicBoxSize(gpu, a[0], b[1], c[2]);
-    gpuSetConstants(gpu);
-}
-
-void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
-    throw OpenMMException("CudaPlatform does not support checkpointing");
-}
-
-void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& stream) {
-    throw OpenMMException("CudaPlatform does not support checkpointing");
-}
-
-void CudaApplyConstraintsKernel::initialize(const System& system) {
-}
-
-void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
-    kApplyConstraints(data.gpu);
-}
-
-void CudaVirtualSitesKernel::initialize(const System& system) {
-}
-
-void CudaVirtualSitesKernel::computePositions(ContextImpl& context) {
-}
-
-class CudaCalcHarmonicBondForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const HarmonicBondForce& force) : force(force) {
-    }
-    int getNumParticleGroups() {
-        return force.getNumBonds();
-    }
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int particle1, particle2;
-        double length, k;
-        force.getBondParameters(index, particle1, particle2, length, k);
-        particles.resize(2);
-        particles[0] = particle1;
-        particles[1] = particle2;
-    }
-    bool areGroupsIdentical(int group1, int group2) {
-        int particle1, particle2;
-        double length1, length2, k1, k2;
-        force.getBondParameters(group1, particle1, particle2, length1, k1);
-        force.getBondParameters(group2, particle1, particle2, length2, k2);
-        return (length1 == length2 && k1 == k2);
-    }
-private:
-    const HarmonicBondForce& force;
-};
-
-CudaCalcHarmonicBondForceKernel::~CudaCalcHarmonicBondForceKernel() {
-}
-
-void CudaCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) {
-    data.hasBonds = true;
-    numBonds = force.getNumBonds();
-    vector<int> particle1(numBonds);
-    vector<int> particle2(numBonds);
-    vector<float> length(numBonds);
-    vector<float> k(numBonds);
-    for (int i = 0; i < numBonds; i++) {
-        double lengthValue, kValue;
-        force.getBondParameters(i, particle1[i], particle2[i], lengthValue, kValue);
-        length[i] = (float) lengthValue;
-        k[i] = (float) kValue;
-    }
-    gpuSetBondParameters(data.gpu, particle1, particle2, length, k);
-    data.gpu->forces.push_back(new ForceInfo(force));
-}
-
-double CudaCalcHarmonicBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    return 0.0;
-}
-
-void CudaCalcHarmonicBondForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force) {
-    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
-}
-
-class CudaCalcCustomBondForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const CustomBondForce& force) : force(force) {
-    }
-    int getNumParticleGroups() {
-        return force.getNumBonds();
-    }
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int particle1, particle2;
-        vector<double> parameters;
-        force.getBondParameters(index, particle1, particle2, parameters);
-        particles.resize(2);
-        particles[0] = particle1;
-        particles[1] = particle2;
-    }
-    bool areGroupsIdentical(int group1, int group2) {
-        int particle1, particle2;
-        vector<double> parameters1, parameters2;
-        force.getBondParameters(group1, particle1, particle2, parameters1);
-        force.getBondParameters(group2, particle1, particle2, parameters2);
-        for (int i = 0; i < (int) parameters1.size(); i++)
-            if (parameters1[i] != parameters2[i])
-                return false;
-        return true;
-    }
-private:
-    const CustomBondForce& force;
-};
-
-CudaCalcCustomBondForceKernel::~CudaCalcCustomBondForceKernel() {
-}
-
-void CudaCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) {
-    numBonds = force.getNumBonds();
-    vector<int> particle1(numBonds);
-    vector<int> particle2(numBonds);
-    vector<vector<double> > params(numBonds);
-    for (int i = 0; i < numBonds; i++)
-        force.getBondParameters(i, particle1[i], particle2[i], params[i]);
-    vector<string> paramNames;
-    for (int i = 0; i < force.getNumPerBondParameters(); i++)
-        paramNames.push_back(force.getPerBondParameterName(i));
-    globalParamNames.resize(force.getNumGlobalParameters());
-    globalParamValues.resize(force.getNumGlobalParameters());
-    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
-        globalParamNames[i] = force.getGlobalParameterName(i);
-        globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
-    }
-    gpuSetCustomBondParameters(data.gpu, particle1, particle2, params, force.getEnergyFunction(), paramNames, globalParamNames);
-    if (globalParamValues.size() > 0)
-        SetCustomBondGlobalParams(globalParamValues);
-    data.gpu->forces.push_back(new ForceInfo(force));
-}
-
-double CudaCalcCustomBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    updateGlobalParams(context);
-    kCalculateCustomBondForces(data.gpu);
-    return 0.0;
-}
-
-void CudaCalcCustomBondForceKernel::updateGlobalParams(ContextImpl& context) {
-    bool changed = false;
-    for (int i = 0; i < (int) globalParamNames.size(); i++) {
-        float value = (float) context.getParameter(globalParamNames[i]);
-        if (value != globalParamValues[i])
-            changed = true;
-        globalParamValues[i] = value;
-    }
-    if (changed)
-        SetCustomBondGlobalParams(globalParamValues);
-}
-
-void CudaCalcCustomBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomBondForce& force) {
-    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
-}
-
-class CudaCalcHarmonicAngleForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const HarmonicAngleForce& force) : force(force) {
-    }
-    int getNumParticleGroups() {
-        return force.getNumAngles();
-    }
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int particle1, particle2, particle3;
-        double angle, k;
-        force.getAngleParameters(index, particle1, particle2, particle3, angle, k);
-        particles.resize(3);
-        particles[0] = particle1;
-        particles[1] = particle2;
-        particles[2] = particle3;
-    }
-    bool areGroupsIdentical(int group1, int group2) {
-        int particle1, particle2, particle3;
-        double angle1, angle2, k1, k2;
-        force.getAngleParameters(group1, particle1, particle2, particle3, angle1, k1);
-        force.getAngleParameters(group2, particle1, particle2, particle3, angle2, k2);
-        return (angle1 == angle2 && k1 == k2);
-    }
-private:
-    const HarmonicAngleForce& force;
-};
-
-CudaCalcHarmonicAngleForceKernel::~CudaCalcHarmonicAngleForceKernel() {
-}
-
-void CudaCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) {
-    data.hasAngles = true;
-    numAngles = force.getNumAngles();
-    const float RadiansToDegrees = (float) (180.0/3.14159265);
-    vector<int> particle1(numAngles);
-    vector<int> particle2(numAngles);
-    vector<int> particle3(numAngles);
-    vector<float> angle(numAngles);
-    vector<float> k(numAngles);
-    for (int i = 0; i < numAngles; i++) {
-        double angleValue, kValue;
-        force.getAngleParameters(i, particle1[i], particle2[i], particle3[i], angleValue, kValue);
-        angle[i] = (float) (angleValue*RadiansToDegrees);
-        k[i] = (float) kValue;
-    }
-    gpuSetBondAngleParameters(data.gpu, particle1, particle2, particle3, angle, k);
-    data.gpu->forces.push_back(new ForceInfo(force));
-}
-
-double CudaCalcHarmonicAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    return 0.0;
-}
-
-void CudaCalcHarmonicAngleForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force) {
-    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
-}
-
-class CudaCalcCustomAngleForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const CustomAngleForce& force) : force(force) {
-    }
-    int getNumParticleGroups() {
-        return force.getNumAngles();
-    }
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int particle1, particle2, particle3;
-        vector<double> parameters;
-        force.getAngleParameters(index, particle1, particle2, particle3, parameters);
-        particles.resize(3);
-        particles[0] = particle1;
-        particles[1] = particle2;
-        particles[2] = particle3;
-    }
-    bool areGroupsIdentical(int group1, int group2) {
-        int particle1, particle2, particle3;
-        vector<double> parameters1, parameters2;
-        force.getAngleParameters(group1, particle1, particle2, particle3, parameters1);
-        force.getAngleParameters(group2, particle1, particle2, particle3, parameters2);
-        for (int i = 0; i < (int) parameters1.size(); i++)
-            if (parameters1[i] != parameters2[i])
-                return false;
-        return true;
-    }
-private:
-    const CustomAngleForce& force;
-};
-
-CudaCalcCustomAngleForceKernel::~CudaCalcCustomAngleForceKernel() {
-}
-
-void CudaCalcCustomAngleForceKernel::initialize(const System& system, const CustomAngleForce& force) {
-    numAngles = force.getNumAngles();
-    vector<int> particle1(numAngles);
-    vector<int> particle2(numAngles);
-    vector<int> particle3(numAngles);
-    vector<vector<double> > params(numAngles);
-    for (int i = 0; i < numAngles; i++)
-        force.getAngleParameters(i, particle1[i], particle2[i], particle3[i], params[i]);
-    vector<string> paramNames;
-    for (int i = 0; i < force.getNumPerAngleParameters(); i++)
-        paramNames.push_back(force.getPerAngleParameterName(i));
-    globalParamNames.resize(force.getNumGlobalParameters());
-    globalParamValues.resize(force.getNumGlobalParameters());
-    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
-        globalParamNames[i] = force.getGlobalParameterName(i);
-        globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
-    }
-    gpuSetCustomAngleParameters(data.gpu, particle1, particle2, particle3, params, force.getEnergyFunction(), paramNames, globalParamNames);
-    if (globalParamValues.size() > 0)
-        SetCustomAngleGlobalParams(globalParamValues);
-    data.gpu->forces.push_back(new ForceInfo(force));
-}
-
-double CudaCalcCustomAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    updateGlobalParams(context);
-    kCalculateCustomAngleForces(data.gpu);
-    return 0.0;
-}
-
-void CudaCalcCustomAngleForceKernel::updateGlobalParams(ContextImpl& context) {
-    bool changed = false;
-    for (int i = 0; i < (int) globalParamNames.size(); i++) {
-        float value = (float) context.getParameter(globalParamNames[i]);
-        if (value != globalParamValues[i])
-            changed = true;
-        globalParamValues[i] = value;
-    }
-    if (changed)
-        SetCustomAngleGlobalParams(globalParamValues);
-}
-
-void CudaCalcCustomAngleForceKernel::copyParametersToContext(ContextImpl& context, const CustomAngleForce& force) {
-    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
-}
-
-class CudaCalcPeriodicTorsionForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const PeriodicTorsionForce& force) : force(force) {
-    }
-    int getNumParticleGroups() {
-        return force.getNumTorsions();
-    }
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int particle1, particle2, particle3, particle4, periodicity;
-        double phase, k;
-        force.getTorsionParameters(index, particle1, particle2, particle3, particle4, periodicity, phase, k);
-        particles.resize(4);
-        particles[0] = particle1;
-        particles[1] = particle2;
-        particles[2] = particle3;
-        particles[3] = particle4;
-    }
-    bool areGroupsIdentical(int group1, int group2) {
-        int particle1, particle2, particle3, particle4, periodicity1, periodicity2;
-        double phase1, phase2, k1, k2;
-        force.getTorsionParameters(group1, particle1, particle2, particle3, particle4, periodicity1, phase1, k1);
-        force.getTorsionParameters(group2, particle1, particle2, particle3, particle4, periodicity2, phase2, k2);
-        return (periodicity1 == periodicity2 && phase1 == phase2 && k1 == k2);
-    }
-private:
-    const PeriodicTorsionForce& force;
-};
-
-CudaCalcPeriodicTorsionForceKernel::~CudaCalcPeriodicTorsionForceKernel() {
-}
-
-void CudaCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
-    data.hasPeriodicTorsions = true;
-    numTorsions = force.getNumTorsions();
-    const float RadiansToDegrees = (float)(180.0/3.14159265);
-    vector<int> particle1(numTorsions);
-    vector<int> particle2(numTorsions);
-    vector<int> particle3(numTorsions);
-    vector<int> particle4(numTorsions);
-    vector<float> k(numTorsions);
-    vector<float> phase(numTorsions);
-    vector<int> periodicity(numTorsions);
-    for (int i = 0; i < numTorsions; i++) {
-        double kValue, phaseValue;
-        force.getTorsionParameters(i, particle1[i], particle2[i], particle3[i], particle4[i], periodicity[i], phaseValue, kValue);
-        k[i] = (float) kValue;
-        phase[i] = (float) (phaseValue*RadiansToDegrees);
-    }
-    gpuSetDihedralParameters(data.gpu, particle1, particle2, particle3, particle4, k, phase, periodicity);
-    data.gpu->forces.push_back(new ForceInfo(force));
-}
-
-double CudaCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    return 0.0;
-}
-
-void CudaCalcPeriodicTorsionForceKernel::copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force) {
-    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
-}
-
-class CudaCalcRBTorsionForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const RBTorsionForce& force) : force(force) {
-    }
-    int getNumParticleGroups() {
-        return force.getNumTorsions();
-    }
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int particle1, particle2, particle3, particle4;
-        double c0, c1, c2, c3, c4, c5;
-        force.getTorsionParameters(index, particle1, particle2, particle3, particle4, c0, c1, c2, c3, c4, c5);
-        particles.resize(4);
-        particles[0] = particle1;
-        particles[1] = particle2;
-        particles[2] = particle3;
-        particles[3] = particle4;
-    }
-    bool areGroupsIdentical(int group1, int group2) {
-        int particle1, particle2, particle3, particle4;
-        double c0a, c0b, c1a, c1b, c2a, c2b, c3a, c3b, c4a, c4b, c5a, c5b;
-        force.getTorsionParameters(group1, particle1, particle2, particle3, particle4, c0a, c1a, c2a, c3a, c4a, c5a);
-        force.getTorsionParameters(group2, particle1, particle2, particle3, particle4, c0b, c1b, c2b, c3b, c4b, c5b);
-        return (c0a == c0b && c1a == c1b && c2a == c2b && c3a == c3b && c4a == c4b && c5a == c5b);
-    }
-private:
-    const RBTorsionForce& force;
-};
-
-CudaCalcRBTorsionForceKernel::~CudaCalcRBTorsionForceKernel() {
-}
-
-void CudaCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
-    data.hasRB = true;
-    numTorsions = force.getNumTorsions();
-    vector<int> particle1(numTorsions);
-    vector<int> particle2(numTorsions);
-    vector<int> particle3(numTorsions);
-    vector<int> particle4(numTorsions);
-    vector<float> c0(numTorsions);
-    vector<float> c1(numTorsions);
-    vector<float> c2(numTorsions);
-    vector<float> c3(numTorsions);
-    vector<float> c4(numTorsions);
-    vector<float> c5(numTorsions);
-    for (int i = 0; i < numTorsions; i++) {
-        double c[6];
-        force.getTorsionParameters(i, particle1[i], particle2[i], particle3[i], particle4[i], c[0], c[1], c[2], c[3], c[4], c[5]);
-        c0[i] = (float) c[0];
-        c1[i] = (float) c[1];
-        c2[i] = (float) c[2];
-        c3[i] = (float) c[3];
-        c4[i] = (float) c[4];
-        c5[i] = (float) c[5];
-    }
-    gpuSetRbDihedralParameters(data.gpu, particle1, particle2, particle3, particle4, c0, c1, c2, c3, c4, c5);
-    data.gpu->forces.push_back(new ForceInfo(force));
-}
-
-double CudaCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    return 0.0;
-}
-
-void CudaCalcRBTorsionForceKernel::copyParametersToContext(ContextImpl& context, const RBTorsionForce& force) {
-    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
-}
-
-class CudaCalcCMAPTorsionForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const CMAPTorsionForce& force) : force(force) {
-    }
-    int getNumParticleGroups() {
-        return force.getNumTorsions();
-    }
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int map, a1, a2, a3, a4, b1, b2, b3, b4;
-        force.getTorsionParameters(index, map, a1, a2, a3, a4, b1, b2, b3, b4);
-        particles.resize(8);
-        particles[0] = a1;
-        particles[1] = a2;
-        particles[2] = a3;
-        particles[3] = a4;
-        particles[4] = b1;
-        particles[5] = b2;
-        particles[6] = b3;
-        particles[7] = b4;
-    }
-    bool areGroupsIdentical(int group1, int group2) {
-        int map1, map2, a1, a2, a3, a4, b1, b2, b3, b4;
-        force.getTorsionParameters(group1, map1, a1, a2, a3, a4, b1, b2, b3, b4);
-        force.getTorsionParameters(group2, map2, a1, a2, a3, a4, b1, b2, b3, b4);
-        return (map1 == map2);
-    }
-private:
-    const CMAPTorsionForce& force;
-};
-
-CudaCalcCMAPTorsionForceKernel::~CudaCalcCMAPTorsionForceKernel() {
-    if (coefficients != NULL)
-        delete coefficients;
-    if (mapPositions != NULL)
-        delete mapPositions;
-    if (torsionMaps != NULL)
-        delete torsionMaps;
-    if (torsionIndices != NULL)
-        delete torsionIndices;
-}
-
-void CudaCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAPTorsionForce& force) {
-    numTorsions = force.getNumTorsions();
-    if (numTorsions == 0)
-        return;
-    int numMaps = force.getNumMaps();
-    vector<float4> coeffVec;
-    vector<int2> mapPositionsVec(numMaps);
-    vector<double> energy;
-    vector<vector<double> > c;
-    int currentPosition = 0;
-    mapPositions = new CUDAStream<int2>(numMaps, 1, "cmapTorsionMapPositions");
-    for (int i = 0; i < numMaps; i++) {
-        int size;
-        force.getMapParameters(i, size, energy);
-        CMAPTorsionForceImpl::calcMapDerivatives(size, energy, c);
-        (*mapPositions)[i] = make_int2(currentPosition, size);
-        currentPosition += 4*size*size;
-        for (int j = 0; j < size*size; j++) {
-            coeffVec.push_back(make_float4(c[j][0], c[j][1], c[j][2], c[j][3]));
-            coeffVec.push_back(make_float4(c[j][4], c[j][5], c[j][6], c[j][7]));
-            coeffVec.push_back(make_float4(c[j][8], c[j][9], c[j][10], c[j][11]));
-            coeffVec.push_back(make_float4(c[j][12], c[j][13], c[j][14], c[j][15]));
-        }
-    }
-    coefficients = new CUDAStream<float4>((int) coeffVec.size(), 1, "cmapTorsionCoefficients");;
-    for (int i = 0; i < (int) coeffVec.size(); i++)
-        (*coefficients)[i] = coeffVec[i];
-    torsionMaps = new CUDAStream<int>(numTorsions, 1, "cmapTorsionMaps");
-    torsionIndices = new CUDAStream<int4>(4*numTorsions, 1, "cmapTorsionIndices");
-    vector<int> forceBufferCounter(system.getNumParticles(), 0);
-    for (int i = 0; i < numTorsions; i++) {
-        int map, a1, a2, a3, a4, b1, b2, b3, b4;
-        force.getTorsionParameters(i, map, a1, a2, a3, a4, b1, b2, b3, b4);
-        (*torsionMaps)[i] = map;
-        (*torsionIndices)[i*4] = make_int4(a1, a2, a3, a4);
-        (*torsionIndices)[i*4+1] = make_int4(b1, b2, b3, b4);
-        (*torsionIndices)[i*4+2] = make_int4(forceBufferCounter[a1]++, forceBufferCounter[a2]++, forceBufferCounter[a3]++, forceBufferCounter[a4]++);
-        (*torsionIndices)[i*4+3] = make_int4(forceBufferCounter[b1]++, forceBufferCounter[b2]++, forceBufferCounter[b3]++, forceBufferCounter[b4]++);
-    }
-    coefficients->Upload();
-    mapPositions->Upload();
-    torsionMaps->Upload();
-    torsionIndices->Upload();
-    int maxBuffers = 1;
-    for (int i = 0; i < (int) forceBufferCounter.size(); i++)
-        maxBuffers = max(maxBuffers, forceBufferCounter[i]);
-    if (maxBuffers > data.gpu->sim.outputBuffers)
-        data.gpu->sim.outputBuffers = maxBuffers;
-    data.gpu->forces.push_back(new ForceInfo(force));
-}
-
-double CudaCalcCMAPTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    if( numTorsions )
-        kCalculateCMAPTorsionForces(data.gpu, *coefficients, *mapPositions, *torsionIndices, *torsionMaps);
-    return 0.0;
-}
-
-class CudaCalcCustomTorsionForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const CustomTorsionForce& force) : force(force) {
-    }
-    int getNumParticleGroups() {
-        return force.getNumTorsions();
-    }
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int particle1, particle2, particle3, particle4;
-        vector<double> parameters;
-        force.getTorsionParameters(index, particle1, particle2, particle3, particle4, parameters);
-        particles.resize(4);
-        particles[0] = particle1;
-        particles[1] = particle2;
-        particles[2] = particle3;
-        particles[3] = particle4;
-    }
-    bool areGroupsIdentical(int group1, int group2) {
-        int particle1, particle2, particle3, particle4;
-        vector<double> parameters1, parameters2;
-        force.getTorsionParameters(group1, particle1, particle2, particle3, particle4, parameters1);
-        force.getTorsionParameters(group2, particle1, particle2, particle3, particle4, parameters2);
-        for (int i = 0; i < (int) parameters1.size(); i++)
-            if (parameters1[i] != parameters2[i])
-                return false;
-        return true;
-    }
-private:
-    const CustomTorsionForce& force;
-};
-
-CudaCalcCustomTorsionForceKernel::~CudaCalcCustomTorsionForceKernel() {
-}
-
-void CudaCalcCustomTorsionForceKernel::initialize(const System& system, const CustomTorsionForce& force) {
-    numTorsions = force.getNumTorsions();
-    vector<int> particle1(numTorsions);
-    vector<int> particle2(numTorsions);
-    vector<int> particle3(numTorsions);
-    vector<int> particle4(numTorsions);
-    vector<vector<double> > params(numTorsions);
-    for (int i = 0; i < numTorsions; i++)
-        force.getTorsionParameters(i, particle1[i], particle2[i], particle3[i], particle4[i], params[i]);
-    vector<string> paramNames;
-    for (int i = 0; i < force.getNumPerTorsionParameters(); i++)
-        paramNames.push_back(force.getPerTorsionParameterName(i));
-    globalParamNames.resize(force.getNumGlobalParameters());
-    globalParamValues.resize(force.getNumGlobalParameters());
-    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
-        globalParamNames[i] = force.getGlobalParameterName(i);
-        globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
-    }
-    gpuSetCustomTorsionParameters(data.gpu, particle1, particle2, particle3, particle4, params, force.getEnergyFunction(), paramNames, globalParamNames);
-    if (globalParamValues.size() > 0)
-        SetCustomTorsionGlobalParams(globalParamValues);
-    data.gpu->forces.push_back(new ForceInfo(force));
-}
-
-double CudaCalcCustomTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    updateGlobalParams(context);
-    kCalculateCustomTorsionForces(data.gpu);
-    return 0.0;
-}
-
-void CudaCalcCustomTorsionForceKernel::updateGlobalParams(ContextImpl& context) {
-    bool changed = false;
-    for (int i = 0; i < (int) globalParamNames.size(); i++) {
-        float value = (float) context.getParameter(globalParamNames[i]);
-        if (value != globalParamValues[i])
-            changed = true;
-        globalParamValues[i] = value;
-    }
-    if (changed)
-        SetCustomTorsionGlobalParams(globalParamValues);
-}
-
-void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force) {
-    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
-}
-
-class CudaCalcNonbondedForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const NonbondedForce& force) : force(force) {
-    }
-    bool areParticlesIdentical(int particle1, int particle2) {
-        double charge1, charge2, sigma1, sigma2, epsilon1, epsilon2;
-        force.getParticleParameters(particle1, charge1, sigma1, epsilon1);
-        force.getParticleParameters(particle2, charge2, sigma2, epsilon2);
-        return (charge1 == charge2 && sigma1 == sigma2 && epsilon1 == epsilon2);
-    }
-    int getNumParticleGroups() {
-        return force.getNumExceptions();
-    }
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int particle1, particle2;
-        double chargeProd, sigma, epsilon;
-        force.getExceptionParameters(index, particle1, particle2, chargeProd, sigma, epsilon);
-        particles.resize(2);
-        particles[0] = particle1;
-        particles[1] = particle2;
-    }
-    bool areGroupsIdentical(int group1, int group2) {
-        int particle1, particle2;
-        double chargeProd1, chargeProd2, sigma1, sigma2, epsilon1, epsilon2;
-        force.getExceptionParameters(group1, particle1, particle2, chargeProd1, sigma1, epsilon1);
-        force.getExceptionParameters(group2, particle1, particle2, chargeProd2, sigma2, epsilon2);
-        return (chargeProd1 == chargeProd2 && sigma1 == sigma2 && epsilon1 == epsilon2);
-    }
-private:
-    const NonbondedForce& force;
-};
-
-CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
-}
-
-void CudaCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
-    data.hasNonbonded = true;
-    numParticles = force.getNumParticles();
-    _gpuContext* gpu = data.gpu;
-
-    // Identify which exceptions are 1-4 interactions.
-
-    vector<pair<int, int> > exclusions;
-    vector<int> exceptions;
-    for (int i = 0; i < force.getNumExceptions(); i++) {
-        int particle1, particle2;
-        double chargeProd, sigma, epsilon;
-        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
-        exclusions.push_back(pair<int, int>(particle1, particle2));
-        if (chargeProd != 0.0 || epsilon != 0.0)
-            exceptions.push_back(i);
-    }
-
-    // Initialize nonbonded interactions.
-    
-    {
-        vector<int> particle(numParticles);
-        vector<float> c6(numParticles);
-        vector<float> c12(numParticles);
-        vector<float> q(numParticles);
-        vector<char> symbol;
-        vector<vector<int> > exclusionList(numParticles);
-        for (int i = 0; i < numParticles; i++) {
-            double charge, radius, depth;
-            force.getParticleParameters(i, charge, radius, depth);
-            particle[i] = i;
-            q[i] = (float) charge;
-            c6[i] = (float) (4*depth*pow(radius, 6.0));
-            c12[i] = (float) (4*depth*pow(radius, 12.0));
-            exclusionList[i].push_back(i);
-        }
-        for (int i = 0; i < (int)exclusions.size(); i++) {
-            exclusionList[exclusions[i].first].push_back(exclusions[i].second);
-            exclusionList[exclusions[i].second].push_back(exclusions[i].first);
-        }
-        CudaNonbondedMethod method = NO_CUTOFF;
-        if (force.getNonbondedMethod() != NonbondedForce::NoCutoff) {
-            gpuSetNonbondedCutoff(gpu, (float) force.getCutoffDistance(), (float) force.getReactionFieldDielectric());
-            method = CUTOFF;
-        }
-        if (force.getNonbondedMethod() == NonbondedForce::CutoffPeriodic) {
-            method = PERIODIC;
-        }
-        if (force.getNonbondedMethod() == NonbondedForce::Ewald || force.getNonbondedMethod() == NonbondedForce::PME) {
-            if (force.getReciprocalSpaceForceGroup() > 0)
-                throw OpenMMException("CudaPlatform does not support force groups");
-            if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
-                double alpha;
-                int kmaxx, kmaxy, kmaxz;
-                NonbondedForceImpl::calcEwaldParameters(system, force, alpha, kmaxx, kmaxy, kmaxz);
-                gpuSetEwaldParameters(gpu, (float) alpha, kmaxx, kmaxy, kmaxz);
-                method = EWALD;
-            }
-            else {
-                double alpha;
-                int gridSizeX, gridSizeY, gridSizeZ;
-                NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ);
-                gpuSetPMEParameters(gpu, (float) alpha, gridSizeX, gridSizeY, gridSizeZ);
-                method = PARTICLE_MESH_EWALD;
-            }
-        }
-        data.nonbondedMethod = method;
-        gpuSetCoulombParameters(gpu, (float) ONE_4PI_EPS0, particle, c6, c12, q, symbol, exclusionList, method);
-
-        // Compute the Ewald self energy.
-
-        data.ewaldSelfEnergy = 0.0;
-        if (force.getNonbondedMethod() == NonbondedForce::Ewald || force.getNonbondedMethod() == NonbondedForce::PME) {
-            double selfEnergyScale = gpu->sim.epsfac*gpu->sim.alphaEwald/std::sqrt(PI);
-                for (int i = 0; i < numParticles; i++)
-                    data.ewaldSelfEnergy -= selfEnergyScale*q[i]*q[i];
-        }
-
-        // Compute the long range dispersion correction.
-
-        if (force.getUseDispersionCorrection())
-            data.dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
-        else
-            data.dispersionCoefficient = 0.0;
-    }
-
-    // Initialize 1-4 nonbonded interactions.
-    
-    {
-        int numExceptions = exceptions.size();
-        vector<int> particle1(numExceptions);
-        vector<int> particle2(numExceptions);
-        vector<float> c6(numExceptions);
-        vector<float> c12(numExceptions);
-        vector<float> q1(numExceptions);
-        vector<float> q2(numExceptions);
-        for (int i = 0; i < numExceptions; i++) {
-            double charge, sig, eps;
-            force.getExceptionParameters(exceptions[i], particle1[i], particle2[i], charge, sig, eps);
-            c6[i] = (float) (4*eps*pow(sig, 6.0));
-            c12[i] = (float) (4*eps*pow(sig, 12.0));
-            q1[i] = (float) charge;
-            q2[i] = 1.0f;
-        }
-        gpuSetLJ14Parameters(gpu, (float) ONE_4PI_EPS0, 1.0f, particle1, particle2, c6, c12, q1, q2);
-    }
-    data.gpu->forces.push_back(new ForceInfo(force));
-}
-
-double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
-    return 0.0;
-}
-
-void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
-    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
-}
-
-class CudaCalcCustomNonbondedForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const CustomNonbondedForce& force) : force(force) {
-    }
-    bool areParticlesIdentical(int particle1, int particle2) {
-        vector<double> params1;
-        vector<double> params2;
-        force.getParticleParameters(particle1, params1);
-        force.getParticleParameters(particle2, params2);
-        for (int i = 0; i < (int) params1.size(); i++)
-            if (params1[i] != params2[i])
-                return false;
-        return true;
-    }
-    int getNumParticleGroups() {
-        return force.getNumExclusions();
-    }
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int particle1, particle2;
-        force.getExclusionParticles(index, particle1, particle2);
-        particles.resize(2);
-        particles[0] = particle1;
-        particles[1] = particle2;
-    }
-    bool areGroupsIdentical(int group1, int group2) {
-        return true;
-    }
-private:
-    const CustomNonbondedForce& force;
-};
-
-CudaCalcCustomNonbondedForceKernel::~CudaCalcCustomNonbondedForceKernel() {
-}
-
-void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
-    data.hasCustomNonbonded = true;
-    numParticles = force.getNumParticles();
-    _gpuContext* gpu = data.gpu;
-
-    // Initialize nonbonded interactions.
-
-    vector<int> particle(numParticles);
-    vector<vector<double> > parameters(numParticles);
-    vector<vector<int> > exclusionList(numParticles);
-    for (int i = 0; i < numParticles; i++) {
-        force.getParticleParameters(i, parameters[i]);
-        particle[i] = i;
-        exclusionList[i].push_back(i);
-    }
-    for (int i = 0; i < force.getNumExclusions(); i++) {
-        int particle1, particle2;
-        force.getExclusionParticles(i, particle1, particle2);
-        exclusionList[particle1].push_back(particle2);
-        exclusionList[particle2].push_back(particle1);
-    }
-    CudaNonbondedMethod method = NO_CUTOFF;
-    if (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff)
-        method = CUTOFF;
-    if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic) {
-        method = PERIODIC;
-    }
-    data.customNonbondedMethod = method;
-
-    // Record the tabulated functions.
-
-    for (int i = 0; i < force.getNumFunctions(); i++) {
-        string name;
-        vector<double> values;
-        double min, max;
-        force.getFunctionParameters(i, name, values, min, max);
-        gpuSetTabulatedFunction(gpu, i, name, values, min, max);
-    }
-
-    // Record information for the expressions.
-
-    vector<string> paramNames;
-    for (int i = 0; i < force.getNumPerParticleParameters(); i++)
-        paramNames.push_back(force.getPerParticleParameterName(i));
-    globalParamNames.resize(force.getNumGlobalParameters());
-    globalParamValues.resize(force.getNumGlobalParameters());
-    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
-        globalParamNames[i] = force.getGlobalParameterName(i);
-        globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
-    }
-    gpuSetCustomNonbondedParameters(gpu, parameters, exclusionList, method, (float) force.getCutoffDistance(), force.getEnergyFunction(), paramNames, globalParamNames);
-    if (globalParamValues.size() > 0)
-        SetCustomNonbondedGlobalParams(globalParamValues);
-    data.gpu->forces.push_back(new ForceInfo(force));
-}
-
-double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    updateGlobalParams(context);
-    return 0.0;
-}
-
-void CudaCalcCustomNonbondedForceKernel::updateGlobalParams(ContextImpl& context) {
-    bool changed = false;
-    for (int i = 0; i < (int) globalParamNames.size(); i++) {
-        float value = (float) context.getParameter(globalParamNames[i]);
-        if (value != globalParamValues[i])
-            changed = true;
-        globalParamValues[i] = value;
-    }
-    if (changed)
-        SetCustomNonbondedGlobalParams(globalParamValues);
-}
-
-void CudaCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force) {
-    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
-}
-
-class CudaCalcGBSAOBCForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const GBSAOBCForce& force) : force(force) {
-    }
-    bool areParticlesIdentical(int particle1, int particle2) {
-        double charge1, charge2, radius1, radius2, scale1, scale2;
-        force.getParticleParameters(particle1, charge1, radius1, scale1);
-        force.getParticleParameters(particle2, charge2, radius2, scale2);
-        return (charge1 == charge2 && radius1 == radius2 && scale1 == scale2);
-    }
-private:
-    const GBSAOBCForce& force;
-};
-
-CudaCalcGBSAOBCForceKernel::~CudaCalcGBSAOBCForceKernel() {
-}
-
-void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
-
-    int numParticles = system.getNumParticles();
-    _gpuContext* gpu = data.gpu;
-    vector<float> radius(numParticles);
-    vector<float> scale(numParticles);
-    vector<float> charge(numParticles);
-    for (int i = 0; i < numParticles; i++) {
-        double particleCharge, particleRadius, scalingFactor;
-        force.getParticleParameters(i, particleCharge, particleRadius, scalingFactor);
-        radius[i] = (float) particleRadius;
-        scale[i] = (float) scalingFactor;
-        charge[i] = (float) particleCharge;
-    }
-    gpuSetObcParameters(gpu, (float) force.getSoluteDielectric(), (float) force.getSolventDielectric(), radius, scale, charge);
-    data.gpu->forces.push_back(new ForceInfo(force));
-}
-
-double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-	return 0.0;
-}
-
-void CudaCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force) {
-    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
-}
-
-class CudaCalcGBVIForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const GBVIForce& force) : force(force) {
-    }
-    bool areParticlesIdentical(int particle1, int particle2) {
-        double charge1, charge2, radius1, radius2, gamma1, gamma2;
-        force.getParticleParameters(particle1, charge1, radius1, gamma1);
-        force.getParticleParameters(particle2, charge2, radius2, gamma2);
-        return (charge1 == charge2 && radius1 == radius2 && gamma1 == gamma2);
-    }
-private:
-    const GBVIForce& force;
-};
-
-CudaCalcGBVIForceKernel::~CudaCalcGBVIForceKernel() {
-}
-
-void CudaCalcGBVIForceKernel::initialize(const System& system, const GBVIForce& force, const std::vector<double> & inputScaledRadii) {
-
-    int numParticles = system.getNumParticles();
-    _gpuContext* gpu = data.gpu;
-
-    vector<int> particle(numParticles);
-    vector<float> radius(numParticles);
-    vector<float> scaledRadii(numParticles);
-    vector<float> gammas(numParticles);
-
-    for (int i = 0; i < numParticles; i++) {
-        double charge, particleRadius, gamma;
-        force.getParticleParameters(i, charge, particleRadius, gamma );
-        particle[i]                  = i;
-        radius[i]                    = (float) particleRadius;
-        gammas[i]                    = (float) gamma;
-        scaledRadii[i]               = (float) inputScaledRadii[i];
-    }
-
-    int gbviBornRadiusScalingMethod;
-    if( force.getBornRadiusScalingMethod() ==  GBVIForce::QuinticSpline ){
-        gbviBornRadiusScalingMethod = 1;
-    } else {
-        gbviBornRadiusScalingMethod = 2;
-    }
-    gpuSetGBVIParameters(gpu, (float) force.getSoluteDielectric(), (float) force.getSolventDielectric(), particle,
-                         radius, gammas, scaledRadii, gbviBornRadiusScalingMethod,
-                         static_cast<float>(force.getQuinticLowerLimitFactor()),
-                         static_cast<float>(force.getQuinticUpperBornRadiusLimit()) );
-
-    data.gpu->forces.push_back(new ForceInfo(force));
-}
-
-double CudaCalcGBVIForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    return 0.0;
-}
-
-class CudaCalcCustomExternalForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const CustomExternalForce& force, int numParticles) : force(force), indices(numParticles, -1) {
-        vector<double> params;
-        for (int i = 0; i < force.getNumParticles(); i++) {
-            int particle;
-            force.getParticleParameters(i, particle, params);
-            indices[particle] = i;
-        }
-    }
-    bool areParticlesIdentical(int particle1, int particle2) {
-        particle1 = indices[particle1];
-        particle2 = indices[particle2];
-        if (particle1 == -1 && particle2 == -1)
-            return true;
-        if (particle1 == -1 || particle2 == -1)
-            return false;
-        int temp;
-        vector<double> params1;
-        vector<double> params2;
-        force.getParticleParameters(particle1, temp, params1);
-        force.getParticleParameters(particle2, temp, params2);
-        for (int i = 0; i < (int) params1.size(); i++)
-            if (params1[i] != params2[i])
-                return false;
-        return true;
-    }
-private:
-    const CustomExternalForce& force;
-    vector<int> indices;
-};
-
-CudaCalcCustomExternalForceKernel::~CudaCalcCustomExternalForceKernel() {
-}
-
-void CudaCalcCustomExternalForceKernel::initialize(const System& system, const CustomExternalForce& force) {
-    numParticles = force.getNumParticles();
-    vector<int> particle(numParticles);
-    vector<vector<double> > params(numParticles);
-    for (int i = 0; i < numParticles; i++)
-        force.getParticleParameters(i, particle[i], params[i]);
-    vector<string> paramNames;
-    for (int i = 0; i < force.getNumPerParticleParameters(); i++)
-        paramNames.push_back(force.getPerParticleParameterName(i));
-    globalParamNames.resize(force.getNumGlobalParameters());
-    globalParamValues.resize(force.getNumGlobalParameters());
-    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
-        globalParamNames[i] = force.getGlobalParameterName(i);
-        globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
-    }
-    gpuSetCustomExternalParameters(data.gpu, particle, params, force.getEnergyFunction(), paramNames, globalParamNames);
-    if (globalParamValues.size() > 0)
-        SetCustomExternalGlobalParams(globalParamValues);
-    data.gpu->forces.push_back(new ForceInfo(force, system.getNumParticles()));
-}
-
-double CudaCalcCustomExternalForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    updateGlobalParams(context);
-    kCalculateCustomExternalForces(data.gpu);
-    return 0.0;
-}
-
-void CudaCalcCustomExternalForceKernel::updateGlobalParams(ContextImpl& context) {
-    bool changed = false;
-    for (int i = 0; i < (int) globalParamNames.size(); i++) {
-        float value = (float) context.getParameter(globalParamNames[i]);
-        if (value != globalParamValues[i])
-            changed = true;
-        globalParamValues[i] = value;
-    }
-    if (changed)
-        SetCustomExternalGlobalParams(globalParamValues);
-}
-
-void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& context, const CustomExternalForce& force) {
-    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
-}
-
-void OPENMMCUDA_EXPORT OpenMM::cudaOpenMMInitializeIntegration(const System& system, CudaPlatform::PlatformData& data, const Integrator& integrator) {
-
-    // Initialize any terms that haven't already been handled by a Force.
-
-    _gpuContext* gpu = data.gpu;
-    if (!data.hasBonds)
-        gpuSetBondParameters(gpu, vector<int>(), vector<int>(), vector<float>(), vector<float>());
-    if (!data.hasAngles)
-        gpuSetBondAngleParameters(gpu, vector<int>(), vector<int>(), vector<int>(), vector<float>(), vector<float>());
-    if (!data.hasPeriodicTorsions)
-        gpuSetDihedralParameters(gpu, vector<int>(), vector<int>(), vector<int>(), vector<int>(), vector<float>(), vector<float>(), vector<int>());
-    if (!data.hasRB)
-        gpuSetRbDihedralParameters(gpu, vector<int>(), vector<int>(), vector<int>(), vector<int>(), vector<float>(), vector<float>(),
-                vector<float>(), vector<float>(), vector<float>(), vector<float>());
-    if (!data.hasNonbonded) {
-        gpuSetCoulombParameters(gpu, (float) ONE_4PI_EPS0, vector<int>(), vector<float>(), vector<float>(), vector<float>(), vector<char>(), vector<vector<int> >(), NO_CUTOFF);
-        gpuSetLJ14Parameters(gpu, (float) ONE_4PI_EPS0, 1.0f, vector<int>(), vector<int>(), vector<float>(), vector<float>(), vector<float>(), vector<float>());
-        if (gpu->bIncludeGBSA || gpu->bIncludeGBVI)
-            throw OpenMMException("CudaPlatform requires GBSAOBCForce and GBVIForce to be used with a NonbondedForce");
-    }
-    
-    // Set masses.
-    
-    int numParticles = system.getNumParticles();
-    vector<float> mass(numParticles);
-    for (int i = 0; i < numParticles; i++)
-        mass[i] = (float) system.getParticleMass(i);
-    gpuSetMass(gpu, mass);
-    
-    // Set constraints.
-    
-    int numConstraints = system.getNumConstraints();
-    vector<int> particle1(numConstraints);
-    vector<int> particle2(numConstraints);
-    vector<float> distance(numConstraints);
-    vector<float> invMass1(numConstraints);
-    vector<float> invMass2(numConstraints);
-    for (int i = 0; i < numConstraints; i++) {
-        int particle1Index, particle2Index;
-        double constraintDistance;
-        system.getConstraintParameters(i, particle1Index, particle2Index, constraintDistance);
-        particle1[i] = particle1Index;
-        particle2[i] = particle2Index;
-        distance[i] = (float) constraintDistance;
-        invMass1[i] = 1.0f/mass[particle1Index];
-        invMass2[i] = 1.0f/mass[particle2Index];
-    }
-    gpuSetConstraintParameters(gpu, particle1, particle2, distance, invMass1, invMass2, (float)integrator.getConstraintTolerance());
-    
-    // Finish initialization.
-
-    gpuBuildThreadBlockWorkList(gpu);
-    gpuBuildExclusionList(gpu);
-    gpuBuildOutputBuffers(gpu);
-    gpuSetConstants(gpu);
-    if (gpu->bIncludeGBSA || gpu->bIncludeGBVI)
-        kClearBornSumAndForces(gpu);
-    else
-        kClearForces(gpu);
-    cudaThreadSynchronize();
-}
-
-CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() {
-}
-
-void CudaIntegrateVerletStepKernel::initialize(const System& system, const VerletIntegrator& integrator) {
-    cudaOpenMMInitializeIntegration(system, data, integrator);
-    prevStepSize = -1.0;
-}
-
-void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIntegrator& integrator) {
-    _gpuContext* gpu = data.gpu;
-    double stepSize = integrator.getStepSize();
-    if (stepSize != prevStepSize) {
-        // Initialize the GPU parameters.
-        
-        gpuSetVerletIntegrationParameters(gpu, (float) stepSize, 0.0f);
-        gpuSetConstants(gpu);
-        prevStepSize = stepSize;
-    }
-    kVerletUpdatePart1(gpu);
-    kApplyShake(gpu);
-    kApplySettle(gpu);
-    kApplyCCMA(gpu);
-    if (data.removeCM)
-        if (data.stepCount%data.cmMotionFrequency == 0)
-            gpu->bCalculateCM = true;
-    kVerletUpdatePart2(gpu);
-    data.time += stepSize;
-    data.stepCount++;
-}
-
-CudaIntegrateLangevinStepKernel::~CudaIntegrateLangevinStepKernel() {
-}
-
-void CudaIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
-    cudaOpenMMInitializeIntegration(system, data, integrator);
-    _gpuContext* gpu = data.gpu;
-    gpu->seed = (unsigned long) integrator.getRandomNumberSeed();
-    gpuInitializeRandoms(gpu);
-    prevTemp = -1.0;
-    prevFriction = -1.0;
-    prevStepSize = -1.0;
-}
-
-void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const LangevinIntegrator& integrator) {
-    _gpuContext* gpu = data.gpu;
-    double temperature = integrator.getTemperature();
-    double friction = integrator.getFriction();
-    double stepSize = integrator.getStepSize();
-    if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
-        // Initialize the GPU parameters.
-        
-        double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
-        gpuSetLangevinIntegrationParameters(gpu, (float) tau, (float) stepSize, (float) temperature, 0.0f);
-        gpuSetConstants(gpu);
-        kGenerateRandoms(gpu);
-        prevTemp = temperature;
-        prevFriction = friction;
-        prevStepSize = stepSize;
-    }
-    kLangevinUpdatePart1(gpu);
-    if (data.removeCM)
-        if (data.stepCount%data.cmMotionFrequency == 0)
-            gpu->bCalculateCM = true;
-    kLangevinUpdatePart2(gpu);
-    kApplyShake(gpu);
-    kApplySettle(gpu);
-    kApplyCCMA(gpu);
-    kSetVelocitiesFromPositions(gpu);
-    data.time += stepSize;
-    data.stepCount++;
-}
-
-CudaIntegrateBrownianStepKernel::~CudaIntegrateBrownianStepKernel() {
-}
-
-void CudaIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) {
-    cudaOpenMMInitializeIntegration(system, data, integrator);
-    _gpuContext* gpu = data.gpu;
-    gpu->seed = (unsigned long) integrator.getRandomNumberSeed();
-    gpuInitializeRandoms(gpu);
-    prevTemp = -1.0;
-    prevFriction = -1.0;
-    prevStepSize = -1.0;
-}
-
-void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const BrownianIntegrator& integrator) {
-    _gpuContext* gpu = data.gpu;
-    double temperature = integrator.getTemperature();
-    double friction = integrator.getFriction();
-    double stepSize = integrator.getStepSize();
-    if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
-        // Initialize the GPU parameters.
-        
-        double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
-        gpuSetBrownianIntegrationParameters(gpu, (float) tau, (float) stepSize, (float) temperature);
-        gpuSetConstants(gpu);
-        kGenerateRandoms(gpu);
-        prevTemp = temperature;
-        prevFriction = friction;
-        prevStepSize = stepSize;
-    }
-    kBrownianUpdatePart1(gpu);
-    kApplyShake(gpu);
-    kApplySettle(gpu);
-    kApplyCCMA(gpu);
-    if (data.removeCM)
-        if (data.stepCount%data.cmMotionFrequency == 0)
-            gpu->bCalculateCM = true;
-    kBrownianUpdatePart2(gpu);
-    data.time += stepSize;
-    data.stepCount++;
-}
-
-CudaIntegrateVariableVerletStepKernel::~CudaIntegrateVariableVerletStepKernel() {
-}
-
-void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, const VariableVerletIntegrator& integrator) {
-    cudaOpenMMInitializeIntegration(system, data, integrator);
-    prevErrorTol = -1.0;
-}
-
-double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) {
-    _gpuContext* gpu = data.gpu;
-    double errorTol = integrator.getErrorTolerance();
-    if (errorTol != prevErrorTol) {
-        // Initialize the GPU parameters.
-
-        gpuSetVerletIntegrationParameters(gpu, 0.0f, (float) errorTol);
-        gpuSetConstants(gpu);
-        prevErrorTol = errorTol;
-    }
-    float maxStepSize = (float)(maxTime-data.time);
-    kSelectVerletStepSize(gpu, maxStepSize);
-    kVerletUpdatePart1(gpu);
-    kApplyShake(gpu);
-    kApplySettle(gpu);
-    kApplyCCMA(gpu);
-    if (data.removeCM)
-        if (data.stepCount%data.cmMotionFrequency == 0)
-            gpu->bCalculateCM = true;
-    kVerletUpdatePart2(gpu);
-    gpu->psStepSize->Download();
-    data.time += (*gpu->psStepSize)[0].y;
-    if ((*gpu->psStepSize)[0].y == maxStepSize)
-        data.time = maxTime; // Avoid round-off error
-    data.stepCount++;
-    return (*gpu->psStepSize)[0].y;
-}
-
-CudaIntegrateVariableLangevinStepKernel::~CudaIntegrateVariableLangevinStepKernel() {
-}
-
-void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, const VariableLangevinIntegrator& integrator) {
-    cudaOpenMMInitializeIntegration(system, data, integrator);
-    _gpuContext* gpu = data.gpu;
-    gpu->seed = (unsigned long) integrator.getRandomNumberSeed();
-    gpuInitializeRandoms(gpu);
-    prevTemp = -1.0;
-    prevFriction = -1.0;
-    prevErrorTol = -1.0;
-}
-
-double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {
-    _gpuContext* gpu = data.gpu;
-    double temperature = integrator.getTemperature();
-    double friction = integrator.getFriction();
-    double errorTol = integrator.getErrorTolerance();
-    if (temperature != prevTemp || friction != prevFriction || errorTol != prevErrorTol) {
-        // Initialize the GPU parameters.
-
-        double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
-        gpuSetLangevinIntegrationParameters(gpu, (float) tau, 0.0f, (float) temperature, (float) errorTol);
-        gpuSetConstants(gpu);
-        kGenerateRandoms(gpu);
-        prevTemp = temperature;
-        prevFriction = friction;
-        prevErrorTol = errorTol;
-    }
-    float maxStepSize = (float)(maxTime-data.time);
-    kSelectLangevinStepSize(gpu, maxStepSize);
-    kLangevinUpdatePart1(gpu);
-    if (data.removeCM)
-        if (data.stepCount%data.cmMotionFrequency == 0)
-            gpu->bCalculateCM = true;
-    kLangevinUpdatePart2(gpu);
-    kApplyShake(gpu);
-    kApplySettle(gpu);
-    kApplyCCMA(gpu);
-    kSetVelocitiesFromPositions(gpu);
-    gpu->psStepSize->Download();
-    data.time += (*gpu->psStepSize)[0].y;
-    if ((*gpu->psStepSize)[0].y == maxStepSize)
-        data.time = maxTime; // Avoid round-off error
-    data.stepCount++;
-    return (*gpu->psStepSize)[0].y;
-}
-
-CudaApplyAndersenThermostatKernel::~CudaApplyAndersenThermostatKernel() {
-    if (atomGroups != NULL)
-        delete atomGroups;
-}
-
-void CudaApplyAndersenThermostatKernel::initialize(const System& system, const AndersenThermostat& thermostat) {
-    _gpuContext* gpu = data.gpu;
-    gpu->seed = (unsigned long) thermostat.getRandomNumberSeed();
-    gpuInitializeRandoms(gpu);
-    prevTemp = -1.0;
-    prevFrequency = -1.0;
-    prevStepSize = -1.0;
-
-    // Create the arrays with the group definitions.
-
-    vector<vector<int> > groups = AndersenThermostatImpl::calcParticleGroups(system);
-    atomGroups = new CUDAStream<int>(system.getNumParticles(), 1, "atomGroups");
-    for (int i = 0; i < (int) groups.size(); i++) {
-        for (int j = 0; j < (int) groups[i].size(); j++)
-            (*atomGroups)[groups[i][j]] = i;
-    }
-    atomGroups->Upload();
-}
-
-void CudaApplyAndersenThermostatKernel::execute(ContextImpl& context) {
-    _gpuContext* gpu = data.gpu;
-    double temperature = context.getParameter(AndersenThermostat::Temperature());
-    double frequency = context.getParameter(AndersenThermostat::CollisionFrequency());
-    double stepSize = context.getIntegrator().getStepSize();
-    if (temperature != prevTemp || frequency != prevFrequency || stepSize != prevStepSize) {
-        // Initialize the GPU parameters.
-        
-        gpuSetAndersenThermostatParameters(gpu, (float) temperature, (float) frequency);
-        gpuSetConstants(gpu);
-        kGenerateRandoms(gpu);
-        prevTemp = temperature;
-        prevFrequency = frequency;
-        prevStepSize = stepSize;
-    }
-    kCalculateAndersenThermostat(gpu, *atomGroups);
-}
-
-CudaApplyMonteCarloBarostatKernel::~CudaApplyMonteCarloBarostatKernel() {
-    if (moleculeAtoms != NULL)
-        delete moleculeAtoms;
-    if (moleculeStartIndex != NULL)
-        delete moleculeStartIndex;
-}
-
-void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const MonteCarloBarostat& thermostat) {
-}
-
-void CudaApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, double scale) {
-    if (!hasInitializedMolecules) {
-        hasInitializedMolecules = true;
-
-        // Create the arrays with the molecule definitions.
-
-        vector<vector<int> > molecules = context.getMolecules();
-        numMolecules = molecules.size();
-        moleculeAtoms = new CUDAStream<int>(context.getSystem().getNumParticles(), 1, "moleculeAtoms");
-        moleculeStartIndex = new CUDAStream<int>(numMolecules+1, 1, "moleculeStartIndex");
-        int index = 0;
-        for (int i = 0; i < numMolecules; i++) {
-            (*moleculeStartIndex)[i] = index;
-            for (int j = 0; j < (int) molecules[i].size(); j++)
-                (*moleculeAtoms)[index++] = molecules[i][j];
-        }
-        (*moleculeStartIndex)[numMolecules] = index;
-        moleculeAtoms->Upload();
-        moleculeStartIndex->Upload();
-    }
-    _gpuContext* gpu = data.gpu;
-    gpu->psPosqP4->CopyFrom(*gpu->psPosq4);
-    kScaleAtomCoordinates(gpu, scale, *moleculeAtoms, *moleculeStartIndex);
-    for (int i = 0; i < (int) gpu->posCellOffsets.size(); i++)
-        gpu->posCellOffsets[i] = make_int3(0, 0, 0);
-}
-
-void CudaApplyMonteCarloBarostatKernel::restoreCoordinates(ContextImpl& context) {
-    _gpuContext* gpu = data.gpu;
-    gpu->psPosq4->CopyFrom(*gpu->psPosqP4);
-}
-
-void CudaCalcKineticEnergyKernel::initialize(const System& system) {
-    int numParticles = system.getNumParticles();
-    masses.resize(numParticles);
-    for (int i = 0; i < numParticles; ++i)
-        masses[i] = system.getParticleMass(i);
-}
-
-double CudaCalcKineticEnergyKernel::execute(ContextImpl& context) {
-    // We don't currently have a GPU kernel to do this, so we retrieve the velocities and calculate the energy
-    // on the CPU.
-    
-    _gpuContext* gpu = data.gpu;
-    gpu->psVelm4->Download();
-    double energy = 0.0;
-    for (int i = 0; i < (int) masses.size(); ++i) {
-        float4 v = (*gpu->psVelm4)[i];
-        energy += masses[i]*(v.x*v.x+v.y*v.y+v.z*v.z);
-    }
-    return 0.5*energy;
-}
-
-void CudaRemoveCMMotionKernel::initialize(const System& system, const CMMotionRemover& force) {
-    data.removeCM = true;
-    data.cmMotionFrequency = force.getFrequency();
-}
-
-void CudaRemoveCMMotionKernel::execute(ContextImpl& context) {
-}
--- a/platforms/cuda-old/src/CudaKernels.h
+++ b/platforms/cuda-old/src/CudaKernels.h
-#ifndef OPENMM_CUDAKERNELS_H_
-#define OPENMM_CUDAKERNELS_H_
-
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "CudaPlatform.h"
-#include "openmm/kernels.h"
-#include "kernels/gputypes.h"
-#include "openmm/System.h"
-
-class CudaAndersenThermostat;
-class CudaBrownianDynamics;
-class CudaStochasticDynamics;
-class CudaShakeAlgorithm;
-class CudaVerletDynamics;
-
-namespace OpenMM {
-
-// Export internal cudaOpenMMInitializeIntegration() method so it can be used by NML plugin
-void OPENMMCUDA_EXPORT cudaOpenMMInitializeIntegration(const System& system, CudaPlatform::PlatformData& data, const Integrator& integrator);
-
-/**
- * This kernel is invoked at the beginning and end of force and energy computations.  It gives the
- * Platform a chance to clear buffers and do other initialization at the beginning, and to do any
- * necessary work at the end to determine the final results.
- */
-class CudaCalcForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
-public:
-    CudaCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : CalcForcesAndEnergyKernel(name, platform), data(data) {
-    }
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     */
-    void initialize(const System& system);
-    /**
-     * This is called at the beginning of each force/energy computation, before calcForcesAndEnergy() has been called on
-     * any ForceImpl.
-     *
-     * @param context       the context in which to execute this kernel
-     * @param includeForce  true if forces should be computed
-     * @param includeEnergy true if potential energy should be computed
-     * @param groups        a set of bit flags for which force groups to include
-     */
-    void beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
-    /**
-     * This is called at the end of each force/energy computation, after calcForcesAndEnergy() has been called on
-     * every ForceImpl.
-     *
-     * @param context       the context in which to execute this kernel
-     * @param includeForce  true if forces should be computed
-     * @param includeEnergy true if potential energy should be computed
-     * @param groups        a set of bit flags for which force groups to include
-     * @return the potential energy of the system.  This value is added to all values returned by ForceImpls'
-     * calcForcesAndEnergy() methods.  That is, each force kernel may <i>either</i> return its contribution to the
-     * energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
-     */
-    double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
-private:
-    CudaPlatform::PlatformData& data;
-};
-
-/**
- * This kernel provides methods for setting and retrieving various state data: time, positions,
- * velocities, and forces.
- */
-class CudaUpdateStateDataKernel : public UpdateStateDataKernel {
-public:
-    CudaUpdateStateDataKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : UpdateStateDataKernel(name, platform), data(data) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     */
-    void initialize(const System& system);
-    /**
-     * Get the current time (in picoseconds).
-     *
-     * @param context    the context in which to execute this kernel
-     */
-    double getTime(const ContextImpl& context) const;
-    /**
-     * Set the current time (in picoseconds).
-     *
-     * @param context    the context in which to execute this kernel
-     */
-    void setTime(ContextImpl& context, double time);
-    /**
-     * Get the positions of all particles.
-     *
-     * @param positions  on exit, this contains the particle positions
-     */
-    void getPositions(ContextImpl& context, std::vector<Vec3>& positions);
-    /**
-     * Set the positions of all particles.
-     *
-     * @param positions  a vector containg the particle positions
-     */
-    void setPositions(ContextImpl& context, const std::vector<Vec3>& positions);
-    /**
-     * Get the velocities of all particles.
-     *
-     * @param velocities  on exit, this contains the particle velocities
-     */
-    void getVelocities(ContextImpl& context, std::vector<Vec3>& velocities);
-    /**
-     * Set the velocities of all particles.
-     *
-     * @param velocities  a vector containg the particle velocities
-     */
-    void setVelocities(ContextImpl& context, const std::vector<Vec3>& velocities);
-    /**
-     * Get the current forces on all particles.
-     *
-     * @param forces  on exit, this contains the forces
-     */
-    void getForces(ContextImpl& context, std::vector<Vec3>& forces);
-    /**
-     * Get the current periodic box vectors.
-     *
-     * @param a      on exit, this contains the vector defining the first edge of the periodic box
-     * @param b      on exit, this contains the vector defining the second edge of the periodic box
-     * @param c      on exit, this contains the vector defining the third edge of the periodic box
-     */
-    void getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const;
-    /**
-     * Set the current periodic box vectors.
-     *
-     * @param a      the vector defining the first edge of the periodic box
-     * @param b      the vector defining the second edge of the periodic box
-     * @param c      the vector defining the third edge of the periodic box
-     */
-    void setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) const;
-    /**
-     * Create a checkpoint recording the current state of the Context.
-     * 
-     * @param stream    an output stream the checkpoint data should be written to
-     */
-    void createCheckpoint(ContextImpl& context, std::ostream& stream);
-    /**
-     * Load a checkpoint that was written by createCheckpoint().
-     * 
-     * @param stream    an input stream the checkpoint data should be read from
-     */
-    void loadCheckpoint(ContextImpl& context, std::istream& stream);
-private:
-    CudaPlatform::PlatformData& data;
-};
-
-/**
- * This kernel modifies the positions of particles to enforce distance constraints.
- */
-class CudaApplyConstraintsKernel : public ApplyConstraintsKernel {
-public:
-    CudaApplyConstraintsKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : ApplyConstraintsKernel(name, platform), data(data) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     */
-    void initialize(const System& system);
-    /**
-     * Update particle positions to enforce constraints.
-     *
-     * @param context    the context in which to execute this kernel
-     * @param tol        the distance tolerance within which constraints must be satisfied.
-     */
-    void apply(ContextImpl& context, double tol);
-private:
-    CudaPlatform::PlatformData& data;
-};
-
-/**
- * This kernel recomputes the positions of virtual sites.
- */
-class CudaVirtualSitesKernel : public VirtualSitesKernel {
-public:
-    CudaVirtualSitesKernel(std::string name, const Platform& platform) : VirtualSitesKernel(name, platform) {
-    }
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     */
-    void initialize(const System& system);
-    /**
-     * Compute the virtual site locations.
-     *
-     * @param context    the context in which to execute this kernel
-     */
-    void computePositions(ContextImpl& context);
-};
-
-/**
- * This kernel is invoked by HarmonicBondForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcHarmonicBondForceKernel : public CalcHarmonicBondForceKernel {
-public:
-    CudaCalcHarmonicBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcHarmonicBondForceKernel(name, platform), data(data), system(system) {
-    }
-    ~CudaCalcHarmonicBondForceKernel();
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param force      the HarmonicBondForce this kernel will be used for
-     */
-    void initialize(const System& system, const HarmonicBondForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the HarmonicBondForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force);
-private:
-    class ForceInfo;
-    int numBonds;
-    CudaPlatform::PlatformData& data;
-    System& system;
-};
-
-/**
- * This kernel is invoked by CustomBondForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcCustomBondForceKernel : public CalcCustomBondForceKernel {
-public:
-    CudaCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcCustomBondForceKernel(name, platform),
-            data(data), system(system) {
-    }
-    ~CudaCalcCustomBondForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomBondForce this kernel will be used for
-     */
-    void initialize(const System& system, const CustomBondForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomBondForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomBondForce& force);
-private:
-    class ForceInfo;
-    void updateGlobalParams(ContextImpl& context);
-    int numBonds;
-    CudaPlatform::PlatformData& data;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-    System& system;
-};
-
-/**
- * This kernel is invoked by HarmonicAngleForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcHarmonicAngleForceKernel : public CalcHarmonicAngleForceKernel {
-public:
-    CudaCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcHarmonicAngleForceKernel(name, platform), data(data), system(system) {
-    }
-    ~CudaCalcHarmonicAngleForceKernel();
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param force      the HarmonicAngleForce this kernel will be used for
-     */
-    void initialize(const System& system, const HarmonicAngleForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the HarmonicAngleForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force);
-private:
-    class ForceInfo;
-    int numAngles;
-    CudaPlatform::PlatformData& data;
-    System& system;
-};
-
-/**
- * This kernel is invoked by CustomAngleForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcCustomAngleForceKernel : public CalcCustomAngleForceKernel {
-public:
-    CudaCalcCustomAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcCustomAngleForceKernel(name, platform),
-            data(data), system(system) {
-    }
-    ~CudaCalcCustomAngleForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomAngleForce this kernel will be used for
-     */
-    void initialize(const System& system, const CustomAngleForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomAngleForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomAngleForce& force);
-private:
-    class ForceInfo;
-    void updateGlobalParams(ContextImpl& context);
-    int numAngles;
-    CudaPlatform::PlatformData& data;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-    System& system;
-};
-
-/**
- * This kernel is invoked by PeriodicTorsionForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcPeriodicTorsionForceKernel : public CalcPeriodicTorsionForceKernel {
-public:
-    CudaCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcPeriodicTorsionForceKernel(name, platform), data(data), system(system) {
-    }
-    ~CudaCalcPeriodicTorsionForceKernel();
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param force      the PeriodicTorsionForce this kernel will be used for
-     */
-    void initialize(const System& system, const PeriodicTorsionForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the PeriodicTorsionForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force);
-private:
-    class ForceInfo;
-    int numTorsions;
-    CudaPlatform::PlatformData& data;
-    System& system;
-};
-
-/**
- * This kernel is invoked by RBTorsionForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcRBTorsionForceKernel : public CalcRBTorsionForceKernel {
-public:
-    CudaCalcRBTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcRBTorsionForceKernel(name, platform), data(data), system(system) {
-    }
-    ~CudaCalcRBTorsionForceKernel();
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param force      the RBTorsionForce this kernel will be used for
-     */
-    void initialize(const System& system, const RBTorsionForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the RBTorsionForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const RBTorsionForce& force);
-private:
-    class ForceInfo;
-    int numTorsions;
-    CudaPlatform::PlatformData& data;
-    System& system;
-};
-
-/**
- * This kernel is invoked by CMAPTorsionForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcCMAPTorsionForceKernel : public CalcCMAPTorsionForceKernel {
-public:
-    CudaCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
-            CalcCMAPTorsionForceKernel(name, platform), data(data), system(system), coefficients(NULL), mapPositions(NULL),
-            torsionIndices(NULL), torsionMaps(NULL) {
-    }
-    ~CudaCalcCMAPTorsionForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CMAPTorsionForce this kernel will be used for
-     */
-    void initialize(const System& system, const CMAPTorsionForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-private:
-    class ForceInfo;
-    CudaPlatform::PlatformData& data;
-    System& system;
-    int numTorsions;
-    CUDAStream<float4>* coefficients;
-    CUDAStream<int2>* mapPositions;
-    CUDAStream<int4>* torsionIndices;
-    CUDAStream<int>* torsionMaps;
-};
-
-/**
- * This kernel is invoked by CustomTorsionForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcCustomTorsionForceKernel : public CalcCustomTorsionForceKernel {
-public:
-    CudaCalcCustomTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcCustomTorsionForceKernel(name, platform),
-            data(data), system(system) {
-    }
-    ~CudaCalcCustomTorsionForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomTorsionForce this kernel will be used for
-     */
-    void initialize(const System& system, const CustomTorsionForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomTorsionForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force);
-private:
-    class ForceInfo;
-    void updateGlobalParams(ContextImpl& context);
-    int numTorsions;
-    CudaPlatform::PlatformData& data;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-    System& system;
-};
-
-/**
- * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
- */
-class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
-public:
-    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcNonbondedForceKernel(name, platform), data(data), system(system) {
-    }
-    ~CudaCalcNonbondedForceKernel();
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param force      the NonbondedForce this kernel will be used for
-     */
-    void initialize(const System& system, const NonbondedForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @param includeReciprocal  true if reciprocal space interactions should be included
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the NonbondedForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
-private:
-    class ForceInfo;
-    CudaPlatform::PlatformData& data;
-    int numParticles;
-    System& system;
-};
-
-/**
- * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
- */
-class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
-public:
-    CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcCustomNonbondedForceKernel(name, platform), data(data), system(system) {
-    }
-    ~CudaCalcCustomNonbondedForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomNonbondedForce this kernel will be used for
-     */
-    void initialize(const System& system, const CustomNonbondedForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomNonbondedForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
-private:
-    class ForceInfo;
-    void updateGlobalParams(ContextImpl& context);
-    CudaPlatform::PlatformData& data;
-    int numParticles;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-    System& system;
-};
-
-/**
- * This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
- */
-class CudaCalcGBSAOBCForceKernel : public CalcGBSAOBCForceKernel {
-public:
-    CudaCalcGBSAOBCForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : CalcGBSAOBCForceKernel(name, platform), data(data) {
-    }
-    ~CudaCalcGBSAOBCForceKernel();
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param force      the GBSAOBCForce this kernel will be used for
-     */
-    void initialize(const System& system, const GBSAOBCForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the GBSAOBCForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
-private:
-    class ForceInfo;
-    CudaPlatform::PlatformData& data;
-};
-
-/**
- * This kernel is invoked by GBVIForce to calculate the forces acting on the system.
- */
-class CudaCalcGBVIForceKernel : public CalcGBVIForceKernel {
-public:
-    CudaCalcGBVIForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : CalcGBVIForceKernel(name, platform), data(data) {
-    }
-    ~CudaCalcGBVIForceKernel();
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system      the System this kernel will be applied to
-     * @param force       the GBVIForce this kernel will be used for
-     * @param scaledRadii the scaled radii (Eq. 5 of Labute paper)
-     */
-    void initialize(const System& system, const GBVIForce& force, const std::vector<double> & scaledRadii);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-private:
-    class ForceInfo;
-    CudaPlatform::PlatformData& data;
-};
-
-/**
- * This kernel is invoked by CustomExternalForce to calculate the forces acting on the system and the energy of the system.
- */
-class CudaCalcCustomExternalForceKernel : public CalcCustomExternalForceKernel {
-public:
-    CudaCalcCustomExternalForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcCustomExternalForceKernel(name, platform),
-            data(data), system(system) {
-    }
-    ~CudaCalcCustomExternalForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the CustomExternalForce this kernel will be used for
-     */
-    void initialize(const System& system, const CustomExternalForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Copy changed parameters over to a context.
-     *
-     * @param context    the context to copy parameters to
-     * @param force      the CustomNonbondedForce to copy the parameters from
-     */
-    void copyParametersToContext(ContextImpl& context, const CustomExternalForce& force);
-private:
-    class ForceInfo;
-    void updateGlobalParams(ContextImpl& context);
-    int numParticles;
-    CudaPlatform::PlatformData& data;
-    std::vector<std::string> globalParamNames;
-    std::vector<float> globalParamValues;
-    System& system;
-};
-
-/**
- * This kernel is invoked by VerletIntegrator to take one time step.
- */
-class CudaIntegrateVerletStepKernel : public IntegrateVerletStepKernel {
-public:
-    CudaIntegrateVerletStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateVerletStepKernel(name, platform), data(data) {
-    }
-    ~CudaIntegrateVerletStepKernel();
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param integrator the VerletIntegrator this kernel will be used for
-     */
-    void initialize(const System& system, const VerletIntegrator& integrator);
-    /**
-     * Execute the kernel.
-     * 
-     * @param context    the context in which to execute this kernel
-     * @param integrator the VerletIntegrator this kernel is being used for
-     */
-    void execute(ContextImpl& context, const VerletIntegrator& integrator);
-private:
-    CudaPlatform::PlatformData& data;
-    double prevStepSize;
-};
-
-/**
- * This kernel is invoked by LangevinIntegrator to take one time step.
- */
-class CudaIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
-public:
-    CudaIntegrateLangevinStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateLangevinStepKernel(name, platform), data(data) {
-    }
-    ~CudaIntegrateLangevinStepKernel();
-    /**
-     * Initialize the kernel, setting up the particle masses.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param integrator the LangevinIntegrator this kernel will be used for
-     */
-    void initialize(const System& system, const LangevinIntegrator& integrator);
-    /**
-     * Execute the kernel.
-     * 
-     * @param context    the context in which to execute this kernel
-     * @param integrator the LangevinIntegrator this kernel is being used for
-     */
-    void execute(ContextImpl& context, const LangevinIntegrator& integrator);
-private:
-    CudaPlatform::PlatformData& data;
-    double prevTemp, prevFriction, prevStepSize;
-};
-
-/**
- * This kernel is invoked by BrownianIntegrator to take one time step.
- */
-class CudaIntegrateBrownianStepKernel : public IntegrateBrownianStepKernel {
-public:
-    CudaIntegrateBrownianStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateBrownianStepKernel(name, platform), data(data) {
-    }
-    ~CudaIntegrateBrownianStepKernel();
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param integrator the BrownianIntegrator this kernel will be used for
-     */
-    void initialize(const System& system, const BrownianIntegrator& integrator);
-    /**
-     * Execute the kernel.
-     * 
-     * @param context    the context in which to execute this kernel
-     * @param integrator the BrownianIntegrator this kernel is being used for
-     */
-    void execute(ContextImpl& context, const BrownianIntegrator& integrator);
-private:
-    CudaPlatform::PlatformData& data;
-    double prevTemp, prevFriction, prevStepSize;
-};
-
-/**
- * This kernel is invoked by VariableVerletIntegrator to take one time step.
- */
-class CudaIntegrateVariableVerletStepKernel : public IntegrateVariableVerletStepKernel {
-public:
-    CudaIntegrateVariableVerletStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateVariableVerletStepKernel(name, platform), data(data) {
-    }
-    ~CudaIntegrateVariableVerletStepKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param integrator the VerletIntegrator this kernel will be used for
-     */
-    void initialize(const System& system, const VariableVerletIntegrator& integrator);
-    /**
-     * Execute the kernel.
-     *
-     * @param context    the context in which to execute this kernel
-     * @param integrator the VerletIntegrator this kernel is being used for
-     * @param maxTime    the maximum time beyond which the simulation should not be advanced
-     * @return the size of the step that was taken
-     */
-    double execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime);
-private:
-    CudaPlatform::PlatformData& data;
-    double prevErrorTol;
-};
-
-/**
- * This kernel is invoked by VariableLangevinIntegrator to take one time step.
- */
-class CudaIntegrateVariableLangevinStepKernel : public IntegrateVariableLangevinStepKernel {
-public:
-    CudaIntegrateVariableLangevinStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateVariableLangevinStepKernel(name, platform), data(data) {
-    }
-    ~CudaIntegrateVariableLangevinStepKernel();
-    /**
-     * Initialize the kernel, setting up the particle masses.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param integrator the VariableLangevinIntegrator this kernel will be used for
-     */
-    void initialize(const System& system, const VariableLangevinIntegrator& integrator);
-    /**
-     * Execute the kernel.
-     *
-     * @param context    the context in which to execute this kernel
-     * @param integrator the VariableLangevinIntegrator this kernel is being used for
-     * @param maxTime    the maximum time beyond which the simulation should not be advanced
-     * @return the size of the step that was taken
-     */
-    double execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime);
-private:
-    CudaPlatform::PlatformData& data;
-    double prevTemp, prevFriction, prevErrorTol;
-};
-
-/**
- * This kernel is invoked by AndersenThermostat at the start of each time step to adjust the particle velocities.
- */
-class CudaApplyAndersenThermostatKernel : public ApplyAndersenThermostatKernel {
-public:
-    CudaApplyAndersenThermostatKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : ApplyAndersenThermostatKernel(name, platform),
-            data(data), atomGroups(NULL) {
-    }
-    ~CudaApplyAndersenThermostatKernel();
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param thermostat the AndersenThermostat this kernel will be used for
-     */
-    void initialize(const System& system, const AndersenThermostat& thermostat);
-    /**
-     * Execute the kernel.
-     * 
-     * @param context    the context in which to execute this kernel
-     */
-    void execute(ContextImpl& context);
-private:
-    CudaPlatform::PlatformData& data;
-    double prevTemp, prevFrequency, prevStepSize;
-    CUDAStream<int>* atomGroups;
-};
-
-/**
- * This kernel is invoked by MonteCarloBarostat to adjust the periodic box volume
- */
-class CudaApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
-public:
-    CudaApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : ApplyMonteCarloBarostatKernel(name, platform), data(data),
-            hasInitializedMolecules(false), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
-    }
-    ~CudaApplyMonteCarloBarostatKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param barostat   the MonteCarloBarostat this kernel will be used for
-     */
-    void initialize(const System& system, const MonteCarloBarostat& barostat);
-    /**
-     * Attempt a Monte Carlo step, scaling particle positions (or cluster centers) by a specified value.
-     * This is called BEFORE the periodic box size is modified.  It should begin by translating each particle
-     * or cluster into the first periodic box, so that coordinates will still be correct after the box size
-     * is changed.
-     *
-     * @param context    the context in which to execute this kernel
-     * @param scale      the scale factor by which to multiply particle positions
-     */
-    void scaleCoordinates(ContextImpl& context, double scale);
-    /**
-     * Reject the most recent Monte Carlo step, restoring the particle positions to where they were before
-     * scaleCoordinates() was last called.
-     *
-     * @param context    the context in which to execute this kernel
-     */
-    void restoreCoordinates(ContextImpl& context);
-private:
-    CudaPlatform::PlatformData& data;
-    bool hasInitializedMolecules;
-    int numMolecules;
-    CUDAStream<int>* moleculeAtoms;
-    CUDAStream<int>* moleculeStartIndex;
-};
-
-/**
- * This kernel is invoked to calculate the kinetic energy of the system.
- */
-class CudaCalcKineticEnergyKernel : public CalcKineticEnergyKernel {
-public:
-    CudaCalcKineticEnergyKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : CalcKineticEnergyKernel(name, platform), data(data) {
-    }
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     */
-    void initialize(const System& system);
-    /**
-     * Execute the kernel.
-     * 
-     * @param context    the context in which to execute this kernel
-     */
-    double execute(ContextImpl& context);
-private:
-    CudaPlatform::PlatformData& data;
-    std::vector<double> masses;
-};
-
-/**
- * This kernel is invoked to remove center of mass motion from the system.
- */
-class CudaRemoveCMMotionKernel : public RemoveCMMotionKernel {
-public:
-    CudaRemoveCMMotionKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : RemoveCMMotionKernel(name, platform), data(data) {
-    }
-    /**
-     * Initialize the kernel, setting up the particle masses.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param force      the CMMotionRemover this kernel will be used for
-     */
-    void initialize(const System& system, const CMMotionRemover& force);
-    /**
-     * Execute the kernel.
-     * 
-     * @param context    the context in which to execute this kernel
-     */
-    void execute(ContextImpl& context);
-private:
-    CudaPlatform::PlatformData& data;
-};
-
-} // namespace OpenMM
-
-#endif /*OPENMM_CUDAKERNELS_H_*/
--- a/platforms/cuda-old/src/CudaPlatform.cpp
+++ b/platforms/cuda-old/src/CudaPlatform.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "CudaPlatform.h"
-#include "CudaKernelFactory.h"
-#include "CudaKernels.h"
-#include "openmm/internal/ContextImpl.h"
-#include "kernels/gputypes.h"
-#include "openmm/Context.h"
-#include "openmm/OpenMMException.h"
-#include "openmm/System.h"
-#include <sstream>
-
-using namespace OpenMM;
-using std::map;
-using std::string;
-using std::stringstream;
-
-extern "C" OPENMMCUDA_EXPORT void registerPlatforms() {
-    if (gpuIsAvailable())
-        Platform::registerPlatform(new CudaPlatform());
-}
-
-CudaPlatform::CudaPlatform() {
-    CudaKernelFactory* factory = new CudaKernelFactory();
-    registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
-    registerKernelFactory(UpdateStateDataKernel::Name(), factory);
-    registerKernelFactory(ApplyConstraintsKernel::Name(), factory);
-    registerKernelFactory(VirtualSitesKernel::Name(), factory);
-    registerKernelFactory(CalcHarmonicBondForceKernel::Name(), factory);
-    registerKernelFactory(CalcCustomBondForceKernel::Name(), factory);
-    registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
-    registerKernelFactory(CalcCustomAngleForceKernel::Name(), factory);
-    registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
-    registerKernelFactory(CalcRBTorsionForceKernel::Name(), factory);
-    registerKernelFactory(CalcCMAPTorsionForceKernel::Name(), factory);
-    registerKernelFactory(CalcCustomTorsionForceKernel::Name(), factory);
-    registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
-    registerKernelFactory(CalcCustomNonbondedForceKernel::Name(), factory);
-    registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
-    registerKernelFactory(CalcGBVIForceKernel::Name(), factory);
-    registerKernelFactory(CalcCustomExternalForceKernel::Name(), factory);
-    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
-    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
-    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
-    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
-    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
-    registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
-    registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
-    registerKernelFactory(CalcKineticEnergyKernel::Name(), factory);
-    registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
-    platformProperties.push_back(CudaDevice());
-    platformProperties.push_back(CudaUseBlockingSync());
-    setPropertyDefaultValue(CudaDevice(), "0");
-    setPropertyDefaultValue(CudaUseBlockingSync(), "true");
-}
-
-bool CudaPlatform::supportsDoublePrecision() const {
-    return false;
-}
-
-const string& CudaPlatform::getPropertyValue(const Context& context, const string& property) const {
-    const ContextImpl& impl = getContextImpl(context);
-    const PlatformData* data = reinterpret_cast<const PlatformData*>(impl.getPlatformData());
-    map<string, string>::const_iterator value = data->propertyValues.find(property);
-    if (value != data->propertyValues.end())
-        return value->second;
-    return Platform::getPropertyValue(context, property);
-}
-
-void CudaPlatform::setPropertyValue(Context& context, const string& property, const string& value) const {
-}
-
-void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
-    System& system = context.getSystem();
-    for (int i = 0; i < system.getNumParticles(); i++)
-        if (system.isVirtualSite(i))
-            throw OpenMMException("CudaPlatform does not support virtual sites");
-    for (int i = 0; i < system.getNumForces(); i++)
-        if (system.getForce(i).getForceGroup() != 0)
-            throw OpenMMException("CudaPlatform does not support force groups");
-    unsigned int device = 0;
-    const string& devicePropValue = (properties.find(CudaDevice()) == properties.end() ?
-            getPropertyDefaultValue(CudaDevice()) : properties.find(CudaDevice())->second);
-    if (devicePropValue.length() > 0)
-        stringstream(devicePropValue) >> device;
-    int numParticles = context.getSystem().getNumParticles();
-    const string& blockingSync = (properties.find(CudaUseBlockingSync()) == properties.end() ?
-            getPropertyDefaultValue(CudaUseBlockingSync()) : properties.find(CudaUseBlockingSync())->second);
-    _gpuContext* gpu = (_gpuContext*) gpuInit(numParticles, device, blockingSync == "true");
-    context.setPlatformData(new PlatformData(gpu));
-}
-
-void CudaPlatform::contextDestroyed(ContextImpl& context) const {
-    PlatformData* data = reinterpret_cast<PlatformData*>(context.getPlatformData());
-    gpuShutDown(data->gpu);
-    delete data;
-}
-
-CudaPlatform::PlatformData::PlatformData(_gpuContext* gpu) : gpu(gpu), removeCM(false), nonbondedMethod(0), customNonbondedMethod(0), hasBonds(false), hasAngles(false),
-        hasPeriodicTorsions(false), hasRB(false), hasNonbonded(false), hasCustomNonbonded(false), stepCount(0), computeForceCount(0), time(0.0),
-        ewaldSelfEnergy(0.0), dispersionCoefficient(0.0) {
-    stringstream device;
-    device << gpu->device;
-    propertyValues[CudaPlatform::CudaDevice()] = device.str();
-    propertyValues[CudaPlatform::CudaUseBlockingSync()] = (gpu->useBlockingSync ? "true" : "false");
-}
--- a/platforms/cuda-old/src/kernels/bbsort.cu
+++ b/platforms/cuda-old/src/kernels/bbsort.cu
-/*
- * Authored by: Chen, Shifu
- * 
- * Email: chen@gmtk.org
- *
- * Website: http://www.gmtk.org/gsort
- *
- * The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
- * 
- */
-
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "vector_types.h"
-#include "bbsort.h"
-#include "bbsort_kernel.cu"
-
-
-int getValue(int2 v){
-	return v.y;
-}
-
-template <typename T>
-T getValue(T v){
-	return v;
-}
-
-#  define CUDA_SAFE_CALL_NO_SYNC( call) {                                    \
-    cudaError err = call;                                                    \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
-                __FILE__, __LINE__, cudaGetErrorString( err) );              \
-        exit(EXIT_FAILURE);                                                  \
-    } }
-
-#  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call);
-
-bool assignSliceToBuckets(unsigned int* sliceCount,int sliceSize,unsigned int* bucketOffset,unsigned int* bucketOfSlice,unsigned int* bucketSizes,unsigned int* sliceOffsetInBucket,int& bucketsCount,float step)
-{
-	int i=0;
-
-	bool overflow=false;
-
-	int tmpSum=0;
-
-	bucketOffset[0]=0;
-
-	for(i=0;i<sliceSize; i++){
-		if(sliceCount[i] >BLOCK_SIZE)
-		{
-			overflow=true;
-		}
-
-		tmpSum += sliceCount[i];
-		bucketOfSlice[i]=bucketsCount;
-		bucketSizes[bucketsCount] = tmpSum;
-		sliceOffsetInBucket[i]=tmpSum -sliceCount[i];
-		if(tmpSum > BLOCK_SIZE )
-		{	
-			if(i != 0)
-			{
-				bucketOfSlice[i]=bucketsCount+1;
-				bucketSizes[bucketsCount] -= sliceCount[i];
-				sliceOffsetInBucket[i]=0;
-				bucketOffset[bucketsCount+1]=bucketOffset[bucketsCount] + tmpSum -  sliceCount[i];
-
-				bucketsCount++;
-				tmpSum=sliceCount[i];
-				bucketSizes[bucketsCount] = tmpSum;
-			}
-			else 
-			{
-				bucketOffset[bucketsCount+1]=bucketOffset[bucketsCount] + tmpSum ;
-				sliceOffsetInBucket[i]=0;
-				tmpSum=0;
-				bucketsCount++;
-			}
-		}
-
-	}
-	bucketsCount++;
-
-	return overflow;
-
-}
-
-template <typename T>
-void reduceMinMax(T* dData,int size,float& result,bool isMax)
-{
-
-	int step;
-	step=(size%2==0)?
-		(size/2):(size/2 +1);
-	int blockSize=BLOCK_SIZE;
-	int blockCount;
-	int length=size;
-	T originalResult;
-	while(step > 0)
-	{
-		if(step%BLOCK_SIZE==0)
-			blockCount=step/BLOCK_SIZE;
-		else 
-			blockCount=step/BLOCK_SIZE+1;
-
-		if(isMax)
-			reduceMaxD<<<blockCount,blockSize>>>(dData,step,length);
-		else 
-			reduceMinD<<<blockCount,blockSize>>>(dData,step,length);
-
-		length=step;
-
-		step=(step%2==0 || step==1)?(step/2):(step/2 +1);
-	}
-
-	CUDA_SAFE_CALL(cudaMemcpy(&originalResult, dData, sizeof(T), cudaMemcpyDeviceToHost));
-
-	result=(int)getValue(originalResult);
-}
-
-template <typename T>
-void evaluateDisorder(T* dData,int size,float maxValue, float minValue, int& listOrder)
-{
-	int blockCount;
-
-	if((size-1) % BLOCK_SIZE ==0)blockCount=size/BLOCK_SIZE;
-	else blockCount=size/BLOCK_SIZE+1;
-
-	float* dDiffData;
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dDiffData, sizeof(float) * size));
-
-	calDifferenceD<<<blockCount,BLOCK_SIZE,(BLOCK_SIZE)*sizeof(T)>>>(dData,dDiffData,size);
-
-	float sum=0;
-
-	int step;
-	step=(size%2==0)?
-		(size/2):(size/2 +1);
-
-	int blockSize=BLOCK_SIZE;
-
-	int length=size;
-
-	while(step > 0)
-	{
-
-		if(step%BLOCK_SIZE==0)
-			blockCount=step/BLOCK_SIZE;
-		else 
-			blockCount=step/BLOCK_SIZE+1;
-
-		reduceSumD<<<blockCount,blockSize>>>(dDiffData,step,length);
-
-		length=step;
-
-		step=(step%2==0 || step==1)?(step/2):(step/2 +1);
-	}
-
-	CUDA_SAFE_CALL(cudaMemcpy(&sum, dDiffData, sizeof(float), cudaMemcpyDeviceToHost));
-
-	if( sum < (maxValue - minValue) * size / 10)
-		listOrder=NEARLY_SORTED;
-	else 
-		listOrder=DISORDERLY;
-
-	CUDA_SAFE_CALL(cudaFree(dDiffData));
-}
-
-template <typename T>
-void bbSortBody(T* dData,int size,int listOrder/*,float sliceStep,int sliceSize, T* dTmpData, float minValue,float maxValue*/)
-{
-	float minValue,maxValue;
-	T*  dTmpData;
-
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dTmpData, sizeof(T) * size));
-	CUDA_SAFE_CALL(cudaMemcpy(dTmpData, dData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
-	reduceMinMax(dTmpData,size,maxValue,true);
-	CUDA_SAFE_CALL(cudaMemcpy(dTmpData, dData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
-	reduceMinMax(dTmpData,size,minValue,false);
-
-	if(minValue == maxValue)
-	{
-		CUDA_SAFE_CALL(cudaFree(dTmpData));
-		return ;
-	}
-
-	if(listOrder == AUTO_EVALUATE )
-	{
-		evaluateDisorder(dData,size,maxValue,minValue,listOrder);
-	}
-	
-	float sliceStep = (float) (50.0*((double)(maxValue-minValue)/(double)size));
-	int sliceSize = (int) ((maxValue-minValue)/sliceStep + 10);
-
-	int blockCount;
-
-	if(size%BLOCK_SIZE==0)blockCount=size/BLOCK_SIZE;
-	else blockCount=size/BLOCK_SIZE+1;
-
-	unsigned int* dSliceCounts;
-	unsigned int* dOffsetInSlice;
-
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dOffsetInSlice, sizeof(unsigned int) * size));
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dSliceCounts, sizeof(unsigned int) * sliceSize));
-	CUDA_SAFE_CALL(cudaMemset(dSliceCounts,0, sizeof(int) * sliceSize));
-
-	if(listOrder == NEARLY_SORTED)
-	{
-		assignElementToSlicesNearlySortedD<<<blockCount, BLOCK_SIZE>>>(dData,size,dSliceCounts,dOffsetInSlice,minValue,sliceStep,sliceSize,blockCount);
-	}
-	else 
-		assignElementToSlicesD<<<blockCount, BLOCK_SIZE>>>(dData,size,dSliceCounts,dOffsetInSlice,minValue,sliceStep,sliceSize);
-	unsigned int* hSliceCounts=new unsigned int[sliceSize];
-	CUDA_SAFE_CALL(cudaMemcpy(hSliceCounts, dSliceCounts, sizeof(unsigned int) * sliceSize, cudaMemcpyDeviceToHost));
-
-	int looseBucketSize=size/100;
-
-	unsigned int* hBucketOffsets=new unsigned int[looseBucketSize];
-	unsigned int* hBucketSizes=new unsigned int[looseBucketSize];
-	unsigned int* hBucketOfSlices=new unsigned int[sliceSize];
-	unsigned int* hSliceOffsetInBucket=new unsigned int[sliceSize];
-	int bucketsCount=0;
-
-	memset(hBucketSizes,0,sizeof(int) * looseBucketSize);
-	memset(hSliceOffsetInBucket,0,sizeof(unsigned int) * sliceSize);
-
-	bool overflow;
-
-	overflow = assignSliceToBuckets(hSliceCounts,sliceSize,hBucketOffsets,hBucketOfSlices,hBucketSizes,hSliceOffsetInBucket,bucketsCount,sliceStep);
-
-	unsigned int* dBucketOffsets;
-	unsigned int* dBucketSizes;
-
-	unsigned int* dBucketOfSlices;
-	unsigned int* dSliceOffsetInBucket;
-
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketOfSlices, sizeof(unsigned int) * sliceSize));
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dSliceOffsetInBucket, sizeof(unsigned int) * sliceSize));
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketOffsets, sizeof(unsigned int) * bucketsCount));
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketSizes, sizeof(unsigned int) * bucketsCount));
-
-
-	CUDA_SAFE_CALL(cudaMemcpy(dBucketOfSlices, hBucketOfSlices, sizeof(unsigned int) * sliceSize, cudaMemcpyHostToDevice));
-	CUDA_SAFE_CALL(cudaMemcpy(dSliceOffsetInBucket, hSliceOffsetInBucket, sizeof(unsigned int) * sliceSize, cudaMemcpyHostToDevice));
-	CUDA_SAFE_CALL(cudaMemcpy(dBucketOffsets, hBucketOffsets, sizeof(unsigned int) * bucketsCount, cudaMemcpyHostToDevice));
-	CUDA_SAFE_CALL(cudaMemcpy(dBucketSizes, hBucketSizes, sizeof(unsigned int) * bucketsCount, cudaMemcpyHostToDevice));
-
-	cudaBindTexture(0,tBucketOffsets,dBucketOffsets);
-	cudaBindTexture(0,tBucketSizes,dBucketSizes);
-	cudaBindTexture(0,tBucketOfSlices,dBucketOfSlices);
-	cudaBindTexture(0,tSliceOffsetInBucket,dSliceOffsetInBucket);
-
-	assignElementToBucketD<<<blockCount, BLOCK_SIZE>>>(dData,dTmpData,size,dOffsetInSlice,minValue,sliceStep);
-
-	CUDA_SAFE_CALL( cudaThreadSynchronize() );
-
-	bitonicSortD<<<bucketsCount, BLOCK_SIZE, sizeof(T) * BLOCK_SIZE>>>(dTmpData);
-
-    CUDA_SAFE_CALL(cudaMemcpy(dData, dTmpData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
-	
-	if(overflow){
-		for(int i=0;i<bucketsCount;i++)
-		{
-			if(hBucketSizes[i] > BLOCK_SIZE)
-			{
-				bbSort(dData + hBucketOffsets[i],hBucketSizes[i],listOrder);
-			}
-		}
-	}
-	
-	delete hBucketOffsets;
-	delete hBucketOfSlices;
-	delete hSliceCounts;
-	delete hBucketSizes;
-	delete hSliceOffsetInBucket;
-
-	CUDA_SAFE_CALL(cudaFree(dOffsetInSlice));
-	CUDA_SAFE_CALL(cudaFree(dSliceCounts));
-	CUDA_SAFE_CALL(cudaFree(dTmpData));
-
-	cudaUnbindTexture( tBucketSizes );
-	CUDA_SAFE_CALL(cudaFree(dBucketSizes));
-
-	cudaUnbindTexture( tBucketOffsets );
-	CUDA_SAFE_CALL(cudaFree(dBucketOffsets));
-
-	cudaUnbindTexture( tBucketOfSlices );
-	CUDA_SAFE_CALL(cudaFree(dBucketOfSlices));
-
-	cudaUnbindTexture( tSliceOffsetInBucket );
-	CUDA_SAFE_CALL(cudaFree(dSliceOffsetInBucket));
-}
-
-/************************************************************************************
-
-Uncomment your desired function definition here
-
-Please note that, only one type of bbsort() can be used in a program, due to NVCC compiler doesn't support overriding kernel function
-
-float, double, int, uint, short, and ushort are originally supported, if you want to use bbsort() in double
-
-please follow the readme.txt
-
-Also note that you need to use 1.3 capbility (use arch=sm_13 in your compile command) to sort doubles
-
-*************************************************************************************/
-
-template<>
-void OPENMMCUDA_EXPORT bbSort(int2* dData,int size,int listOrder)
-{
-
-	bbSortBody(dData,size,listOrder);
-}
-
-//void bbSort(float* dData,int size,int listOrder)
-//{
-//
-//	bbSortBody(dData,size,listOrder);
-//}
-
-//void bbSort(int* dData,int size,int listOrder)
-//{
-//
-//	bbSortBody(dData,size,listOrder);
-//}
-//
-//void bbSort(unsigned int* dData,int size,int listOrder)
-//{
-//
-//	bbSortBody(dData,size,listOrder);
-//}
-//
-//void bbSort(double* dData,int size,int listOrder)
-//{
-//
-//	bbSortBody(dData,size,listOrder);
-//}
--- a/platforms/cuda-old/src/kernels/bbsort.h
+++ b/platforms/cuda-old/src/kernels/bbsort.h
-/*
- * Authored by: Chen, Shifu
- * 
- * Email: chen@gmtk.org
- *
- * Website: http://www.gmtk.org/gsort
- *
- * The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
- * 
- */
-#ifndef _BBSORT_H_
-#define _BBSORT_H_
-#include "windowsExportCuda.h"
-
-#define BLOCK_SIZE 512
-
-#define DISORDERLY 0
-#define NEARLY_SORTED 1
-#define AUTO_EVALUATE 2
-
-template <typename T>
-void OPENMMCUDA_EXPORT bbSort(T* dData,int number,int listOrder=AUTO_EVALUATE);
-
-#endif // _BBSORT_H_
--- a/platforms/cuda-old/src/kernels/bbsort_kernel.cu
+++ b/platforms/cuda-old/src/kernels/bbsort_kernel.cu
-/*
- * Authored by: Chen, Shifu
- * 
- * Email: chen@gmtk.org
- *
- * Website: http://www.gmtk.org/gsort
- *
- * The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
- * 
- */
-#ifndef _BBSORT_KERNEL_H_
-#define _BBSORT_KERNEL_H_
-
-#include "bbsort.h"
-#include "math_constants.h"
-
-texture<unsigned int, 1, cudaReadModeElementType> tBucketSizes;
-texture<unsigned int, 1, cudaReadModeElementType> tBucketOffsets;
-texture<unsigned int, 1, cudaReadModeElementType> tBucketOfSlices;
-texture<unsigned int, 1, cudaReadModeElementType> tSliceOffsetInBucket;
-
-static __device__ int dGetValue(int2 v){
-	return v.y;
-}
-
-template <typename T>
-static __device__ T dGetValue(T v){
-	return v;
-}
-
-
-static __device__ void dPad(int2& v){
-	v.x=0x3fffffff;
-	v.y=0x4fffffff;
-}
-
-template <typename T>
-static __device__ void dPad(T & v){
-	v=0x7fffffff;
-}
-
-template <typename T>
-__global__ static void reduceMaxD(T * dData,int step,int length)
-{
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-
-	if(index + step >=length)
-		return ;
-	dData[index] = dGetValue(dData[index])>dGetValue(dData[index+step])?dData[index]:dData[index+step];
-}
-
-template <typename T>
-__global__ static void reduceMinD(T * dData,int step,int length)
-{
-	
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-
-	if(index + step >=length)
-		return ;
-
-	dData[index] = dGetValue(dData[index])<dGetValue(dData[index+step])?dData[index]:dData[index+step];
-}
-
-__global__ static void reduceSumD(float * dDiffData,int step,int length)
-{
-
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-
-	if(index + step >=length)
-		return ;
-
-	dDiffData[index] += dDiffData[index+step];
-}
-
-template <typename T>
-__global__ static void calDifferenceD(T * dData,float * dDiffData,int size)
-{
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-
-	if(index > size-1)
-		return ;
-
-    const unsigned int tid = threadIdx.x;
-	
-	extern __shared__ T sData[];
-
-	sData[tid]=dData[index];
-
-	__syncthreads();
-	
-	if(tid < blockDim.x -1)
-		dDiffData[index] = abs(dGetValue(sData[tid+1]) - dGetValue(sData[tid]));
-	else 
-		dDiffData[index] =0;
-	
-}
-
-template <typename T>
-__device__ inline void dSwap(T & a, T & b)
-{
-    T tmp = a;
-    a = b;
-    b = tmp;
-}
-
-
-template <typename T>
-__global__ static void bitonicSortD(T * datas)
-{
-    extern __shared__ T shared[];
-
-	const unsigned int bid=blockIdx.x;
-
-    const unsigned int tid = threadIdx.x;
-
-	__shared__ unsigned int count;
-	__shared__ unsigned int offset;
-
-	if(tid == 0)
-	{
-		count=tex1Dfetch(tBucketSizes,bid);
-		offset=tex1Dfetch(tBucketOffsets,bid);
-	}
-
-	__syncthreads();
-
-    if(tid < count)
-		shared[tid] = datas[tid+offset];
-	else 
-	{
-		dPad(shared[tid]);
-	}
-
-    __syncthreads();
-
-    for (unsigned int k = 2; k <= BLOCK_SIZE; k *= 2)
-    {
-        for (unsigned int j = k / 2; j>0; j /= 2)
-        {
-            unsigned int ixj = tid ^ j;
-            
-
-            if (ixj > tid)
-            {
-                if ((tid & k) == 0)
-                {
-                    if (dGetValue(shared[tid]) > dGetValue(shared[ixj]))
-                    {
-                        dSwap(shared[tid], shared[ixj]);
-                    }
-                }
-                else
-                {
-                    if (dGetValue(shared[tid]) < dGetValue(shared[ixj]))
-                    {
-                        dSwap(shared[tid], shared[ixj]);
-                    }
-                }
-            }
-            
-            __syncthreads();
-        }
-    }
-    if(tid < count)
-		datas[tid+offset] = shared[tid];
-}
-
-template <typename T>
-
-__global__ void assignElementToSlicesD(T* dDatas,int number,unsigned int* dSliceCounts,unsigned int* dOffsetInSlice,float minValue,float step,int sliceSize)
-{
-	unsigned int index= __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-
-	if(index > number-1)
-		return ;
-
-	unsigned int s=((dGetValue(dDatas[index]) - minValue)/ step);
-
-	unsigned int offset=atomicInc(dSliceCounts + s,0xFFFFFFF);
-
-	dOffsetInSlice[index] = offset;
-
-}
-
-template <typename T>
-__global__ void assignElementToSlicesNearlySortedD(T* dDatas,int number,unsigned int* dSliceCounts,unsigned int* dOffsetInSlice,float minValue,float step,int sliceSize,int blockCount)
-{
-	unsigned int index= blockIdx.x + blockCount * threadIdx.x;
-
-	if(index > number-1)
-		return ;
-
-	unsigned int s=((dGetValue(dDatas[index]) - minValue)/ step);
-
-	unsigned int offset=atomicInc(dSliceCounts + s,0xFFFFFFF);
-
-	dOffsetInSlice[index] = offset;
-
-}
-
-template <typename T>
-__global__ void assignElementToBucketD(T* dDatas,T*  dNewDatas,int number,unsigned int* dOffsetInSlice,float minValue,float step)
-{
-
-	unsigned int index= __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-
-	if(index > number-1)
-		return ;
-
-	unsigned int s=((dGetValue(dDatas[index]) - minValue)/ step);
-
-	unsigned int b=tex1Dfetch(tBucketOfSlices,s);
-
-	unsigned int offset =tex1Dfetch(tBucketOffsets,b) + tex1Dfetch(tSliceOffsetInBucket,s) + dOffsetInSlice[index];
-
-	dNewDatas[offset] =dDatas[index];
-
-}
-
-#endif // _BBSORT_KERNEL_H_
--- a/platforms/cuda-old/src/kernels/cudaCompact.cu
+++ b/platforms/cuda-old/src/kernels/cudaCompact.cu
-
-/* Code for CUDA stream compaction. Roughly based on:
-    Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
-        High Performance Graphics 2009.
-
-    Notes:
-        - paper recommends 128 threads/block, so this is hard coded.
-        - I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
-          complicated and performs poorly on current hardware
-        - I only implement the scattered- and staged-write variant of phase III as it they have reasonable
-          performance across most of the tested workloads in the paper. The selective variant is not
-          implemented.
-        - The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
-          manner. It is, however, done in a very easy to program manner, and integrated into the top of
-          phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
-          it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
-          a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
-          dgBlockCounts in stages.
-
-  Date:         23 Aug 2009
-  Author:       Imran Haque (ihaque@cs.stanford.edu)
-  Affiliation:  Stanford University
-  License:      Public Domain
-*/
-
-#include "cudaCompact.h"
-
-typedef unsigned int T;
-
-// Phase 1: Count valid elements per thread block
-// Hard-code 128 thd/blk
-__device__ unsigned int sumReduce128(volatile unsigned int* arr) {
-    // Parallel reduce element counts
-    // Assumes 128 thd/block
-    if (threadIdx.x < 64) arr[threadIdx.x] += arr[threadIdx.x+64];
-    __syncthreads();
-    if (threadIdx.x < 32) {
-        arr[threadIdx.x] += arr[threadIdx.x+32];
-        if (threadIdx.x < 16) arr[threadIdx.x] += arr[threadIdx.x+16];
-        if (threadIdx.x < 8) arr[threadIdx.x] += arr[threadIdx.x+8];
-        if (threadIdx.x < 4) arr[threadIdx.x] += arr[threadIdx.x+4];
-        if (threadIdx.x < 2) arr[threadIdx.x] += arr[threadIdx.x+2];
-        if (threadIdx.x < 1) arr[threadIdx.x] += arr[threadIdx.x+1];
-    }
-    __syncthreads();
-    return arr[0];
-}
-
-__global__ void countElts(unsigned int* dgBlockCounts,const unsigned int* dgValid,const size_t eltsPerBlock,const size_t len) {
-    __shared__ volatile unsigned int dsCount[128];
-    dsCount[threadIdx.x] = 0;
-    size_t ub;
-    ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
-    for (int base = blockIdx.x * eltsPerBlock; base < (blockIdx.x+1)*eltsPerBlock; base += blockDim.x) {
-        if ((base + threadIdx.x) < ub && dgValid[base+threadIdx.x])
-            dsCount[threadIdx.x]++;
-    }
-    __syncthreads();
-    unsigned int blockCount = sumReduce128(dsCount);
-    if (threadIdx.x == 0) dgBlockCounts[blockIdx.x] = blockCount;
-    return;
-}
-
-// Phase 2/3: Move valid elements using SIMD compaction (phase 2 is done implicitly at top of __global__ method)
-// Exclusive prefix scan over 128 elements
-// Assumes 128 threads
-// Taken from cuda SDK "scan" sample for naive scan, with small modifications
-__device__ int exclusivePrescan128(const unsigned int* in,unsigned int* outAndTemp) {
-    const int n=128;
-    //TODO: this temp storage could be reduced since we write to shared memory in out anyway, and n is hardcoded
-    //__shared__ int temp[2*n];
-    unsigned int* temp = outAndTemp;
-    int pout = 1, pin = 0;
-
-    // load input into temp
-    // This is exclusive scan, so shift right by one and set first elt to 0
-    temp[pout*n + threadIdx.x] = (threadIdx.x > 0) ? in[threadIdx.x-1] : 0;
-    __syncthreads();
-
-    for (int offset = 1; offset < n; offset *= 2)
-    {
-        pout = 1 - pout; // swap double buffer indices
-        pin  = 1 - pout;
-        __syncthreads();
-        temp[pout*n+threadIdx.x] = temp[pin*n+threadIdx.x];
-        if (threadIdx.x >= offset)
-            temp[pout*n+threadIdx.x] += temp[pin*n+threadIdx.x - offset];
-    }
-
-    //out[threadIdx.x] = temp[pout*n+threadIdx.x]; // write output
-    __syncthreads();
-    return outAndTemp[127]+in[127]; // Return sum of all elements
-}
-__device__ int compactSIMDPrefixSum(const T* dsData,const unsigned int* dsValid,T* dsCompact) {
-    __shared__ unsigned int dsLocalIndex[256];
-    int numValid = exclusivePrescan128(dsValid,dsLocalIndex);
-    if (dsValid[threadIdx.x]) dsCompact[dsLocalIndex[threadIdx.x]] = dsData[threadIdx.x];
-    return numValid;
-}
-
-__global__ void moveValidElementsStaged(const T* dgData,T* dgCompact,const unsigned int* dgValid,const unsigned int* dgBlockCounts,size_t eltsPerBlock,size_t len,size_t* dNumValidElements) {
-    __shared__ T inBlock[128];
-    __shared__ unsigned int validBlock[128];
-    __shared__ T compactBlock[128];
-    int blockOutOffset=0;
-    // Sum up the blockCounts before us to find our offset
-    // This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
-    // Paper implements this as a prefix sum kernel in phase II
-    // May still be faster than an extra kernel invocation?
-    for (int base = 0; base < blockIdx.x; base += blockDim.x) {
-        // Load up the count of valid elements for each block before us in batches of 128
-        if ((base + threadIdx.x) < blockIdx.x) {
-            validBlock[threadIdx.x] = dgBlockCounts[base+threadIdx.x];
-        } else {
-            validBlock[threadIdx.x] = 0;
-        }
-        __syncthreads();
-        // Parallel reduce these counts
-        // Accumulate in the final offset variable
-        blockOutOffset += sumReduce128(validBlock);
-    }
-
-    size_t ub;
-    ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
-    for (int base = blockIdx.x * eltsPerBlock; base < (blockIdx.x+1)*eltsPerBlock; base += blockDim.x) {
-        if ((base + threadIdx.x) < ub) {
-            validBlock[threadIdx.x] = dgValid[base+threadIdx.x];
-            inBlock[threadIdx.x] = dgData[base+threadIdx.x];
-        } else {
-            validBlock[threadIdx.x] = 0;
-        }
-        __syncthreads();
-        int numValidBlock = compactSIMDPrefixSum(inBlock,validBlock,compactBlock);
-        __syncthreads();
-        if (threadIdx.x < numValidBlock) {
-            dgCompact[blockOutOffset + threadIdx.x] = compactBlock[threadIdx.x];
-        }
-        blockOutOffset += numValidBlock;
-    }
-    if (blockIdx.x == (gridDim.x-1) && threadIdx.x == 0) {
-        *dNumValidElements = blockOutOffset;
-    }
-}
-
-__global__ void moveValidElementsScattered(const T* dgData,T* dgCompact,const unsigned int* dgValid,const unsigned int* dgBlockCounts,size_t eltsPerBlock,size_t len,size_t* dNumValidElements) {
-    __shared__ T inBlock[128];
-    __shared__ unsigned int validBlock[128];
-    T* compactBlock=dgCompact;
-    size_t blockOutOffset = 0;
-    // Sum up the blockCounts before us to find our offset
-    // This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
-    // Paper implements this as a prefix sum kernel in phase II
-    // May still be faster than an extra kernel invocation?
-    for (int base = 0; base < blockIdx.x; base += blockDim.x) {
-        // Load up the count of valid elements for each block before us in batches of 128
-        if ((base + threadIdx.x) < blockIdx.x) {
-            validBlock[threadIdx.x] = dgBlockCounts[base+threadIdx.x];
-        } else {
-            validBlock[threadIdx.x] = 0;
-        }
-        __syncthreads();
-        // Parallel reduce these counts
-        // Accumulate in the final offset variable
-        blockOutOffset += sumReduce128(validBlock);
-    }
-    compactBlock += blockOutOffset;
-    size_t ub;
-    ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
-    for (int base = blockIdx.x * eltsPerBlock; base < (blockIdx.x+1)*eltsPerBlock; base += blockDim.x) {
-        if ((base + threadIdx.x) < ub) {
-            validBlock[threadIdx.x] = dgValid[base+threadIdx.x];
-            inBlock[threadIdx.x] = dgData[base+threadIdx.x];
-        } else {
-            validBlock[threadIdx.x] = 0;
-        }
-        __syncthreads();
-        int numValidBlock = compactSIMDPrefixSum(inBlock,validBlock,compactBlock);
-        blockOutOffset += numValidBlock;
-        compactBlock += numValidBlock;
-    }
-    if (blockIdx.x == (gridDim.x-1) && threadIdx.x == 0) {
-        *dNumValidElements = blockOutOffset;
-    }
-}
-
-void OPENMMCUDA_EXPORT planCompaction(compactionPlan& d,bool stageOutput) {
-    int device;
-    cudaGetDevice(&device);
-    cudaDeviceProp deviceProp;
-    cudaGetDeviceProperties(&deviceProp, device);
-    d.nThreadBlocks = 16*deviceProp.multiProcessorCount;
-    cudaMalloc((void**)&(d.dgBlockCounts), d.nThreadBlocks*sizeof(unsigned int));
-    d.stageOutput = stageOutput;
-    // TODO: make sure allocation worked
-    d.valid = true;
-}
-
-void OPENMMCUDA_EXPORT destroyCompactionPlan(compactionPlan& d) {
-    if (d.valid) cudaFree(d.dgBlockCounts);
-}
-
-int OPENMMCUDA_EXPORT compactStream(const compactionPlan& d,T* dOut,const T* dIn,const unsigned int* dValid,size_t len,size_t* dNumValid) {
-    if (!d.valid) {
-        return -1;
-    }
-    // Figure out # elements per block
-    unsigned int numBlocks = d.nThreadBlocks;
-    if (numBlocks*128 > len)
-        numBlocks = (len+127)/128;
-    const size_t eltsPerBlock = len/numBlocks + ((len % numBlocks) ? 1 : 0);
-
-    // TODO: implement loop over blocks of 10M
-    // Phase 1: Calculate number of valid elements per thread block
-    countElts<<<numBlocks,128>>>(d.dgBlockCounts,dValid,eltsPerBlock,len);
-
-    // Phase 2/3: Move valid elements using SIMD compaction
-    if (d.stageOutput) {
-        moveValidElementsStaged<<<numBlocks,128>>>(dIn,dOut,dValid,d.dgBlockCounts,eltsPerBlock,len,dNumValid);
-    } else {
-        moveValidElementsScattered<<<numBlocks,128>>>(dIn,dOut,dValid,d.dgBlockCounts,eltsPerBlock,len,dNumValid);
-    }
-    return 0;
-}
--- a/platforms/cuda-old/src/kernels/cudaCompact.h
+++ b/platforms/cuda-old/src/kernels/cudaCompact.h
-#ifndef __OPENMM_CUDACOMPACT_H__
-#define __OPENMM_CUDACOMPACT_H__
-
-/* Code for CUDA stream compaction. Roughly based on:
-    Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
-        High Performance Graphics 2009.
-
-    Notes:
-        - paper recommends 128 threads/block, so this is hard coded.
-        - I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
-          complicated and performs poorly on current hardware
-        - I only implement the scattered- and staged-write variant of phase III as it they have reasonable
-          performance across most of the tested workloads in the paper. The selective variant is not
-          implemented.
-        - The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
-          manner. It is, however, done in a very easy to program manner, and integrated into the top of
-          phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
-          it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
-          a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
-          dgBlockCounts in stages.
-
-  Date:         23 Aug 2009
-  Author:       Imran Haque (ihaque@cs.stanford.edu)
-  Affiliation:  Stanford University
-  License:      Public Domain
-*/
-
-#include "windowsExportCuda.h"
-
-struct compactionPlan {
-    bool valid;
-    unsigned int* dgBlockCounts;
-    unsigned int nThreadBlocks;
-    bool stageOutput;
-};
-
-extern "C"
-void OPENMMCUDA_EXPORT planCompaction(compactionPlan& d,bool stageOutput=true);
-
-extern "C"
-void OPENMMCUDA_EXPORT destroyCompactionPlan(compactionPlan& d);
-
-extern "C"
-int OPENMMCUDA_EXPORT compactStream(const compactionPlan& d,unsigned int* dOut,const unsigned int* dIn,const unsigned int* dValid,size_t len,size_t* dNumValid);
-
-#endif // __OPENMM_CUDACOMPACT_H__
--- a/platforms/cuda-old/src/kernels/cudaKernels.h
+++ b/platforms/cuda-old/src/kernels/cudaKernels.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "gputypes.h"
-
-// Initialization
-extern void OPENMMCUDA_EXPORT kClearForces(gpuContext gpu);
-extern void kClearEnergy(gpuContext gpu);
-extern void kClearBornSumAndForces(gpuContext gpu);
-extern void kClearObcGbsaBornSum(gpuContext gpu);
-extern void OPENMMCUDA_EXPORT kCalculateObcGbsaBornSum(gpuContext gpu);
-extern void OPENMMCUDA_EXPORT kReduceObcGbsaBornSum(gpuContext gpu);
-extern void kCalculateGBVIBornSum(gpuContext gpu);
-extern void kReduceGBVIBornSum(gpuContext gpu);
-extern void kClearGBVIBornSum( gpuContext gpu );
-extern void kGenerateRandoms(gpuContext gpu);
-
-// Main loop
-extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
-extern void kCalculateCDLJGBVIForces1(gpuContext gpu);
-extern void kCalculateCDLJForces(gpuContext gpu);
-extern void kCalculateCMAPTorsionForces(gpuContext gpu, CUDAStream<float4>& coefficients, CUDAStream<int2>& mapPositions, CUDAStream<int4>& torsionIndices, CUDAStream<int>& torsionMaps);
-extern void kCalculateCustomBondForces(gpuContext gpu);
-extern void kCalculateCustomAngleForces(gpuContext gpu);
-extern void kCalculateCustomTorsionForces(gpuContext gpu);
-extern void kCalculateCustomExternalForces(gpuContext gpu);
-extern void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid);
-extern void kReduceObcGbsaBornForces(gpuContext gpu);
-extern void OPENMMCUDA_EXPORT kCalculateObcGbsaForces2(gpuContext gpu);
-extern void kCalculateGBVIForces2(gpuContext gpu);
-extern void kCalculateLocalForces(gpuContext gpu);
-extern void kCalculateAndersenThermostat(gpuContext gpu, CUDAStream<int>& atomGroups);
-extern void kReduceBornSumAndForces(gpuContext gpu);
-extern void kApplyShake(gpuContext gpu);
-extern void kApplyCCMA(gpuContext gpu);
-extern void kApplySettle(gpuContext gpu);
-extern void kLangevinUpdatePart1(gpuContext gpu);
-extern void kLangevinUpdatePart2(gpuContext gpu);
-extern void kSelectLangevinStepSize(gpuContext gpu, float maxTimeStep);
-extern void kSetVelocitiesFromPositions(gpuContext gpu);
-extern void kVerletUpdatePart1(gpuContext gpu);
-extern void kVerletUpdatePart2(gpuContext gpu);
-extern void kSelectVerletStepSize(gpuContext gpu, float maxTimeStep);
-extern void kBrownianUpdatePart1(gpuContext gpu);
-extern void kBrownianUpdatePart2(gpuContext gpu);
-extern void kScaleAtomCoordinates(gpuContext gpu, float scale, CUDAStream<int>& moleculeAtoms, CUDAStream<int>& moleculeStartIndex);
-extern void kApplyConstraints(gpuContext gpu);
-
-// Extras
-extern void OPENMMCUDA_EXPORT kReduceForces(gpuContext gpu);
-extern double kReduceEnergy(gpuContext gpu);
-
-// Initializers
-extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
-extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
-extern void SetCalculateCDLJForcesSim(gpuContext gpu);
-extern void GetCalculateCDLJForcesSim(gpuContext gpu);
-extern void SetCalculateCustomBondForcesSim(gpuContext gpu);
-extern void GetCalculateCustomBondForcesSim(gpuContext gpu);
-extern void SetCalculateCustomAngleForcesSim(gpuContext gpu);
-extern void GetCalculateCustomAngleForcesSim(gpuContext gpu);
-extern void SetCalculateCustomTorsionForcesSim(gpuContext gpu);
-extern void GetCalculateCustomTorsionForcesSim(gpuContext gpu);
-extern void SetCalculateCustomExternalForcesSim(gpuContext gpu);
-extern void GetCalculateCustomExternalForcesSim(gpuContext gpu);
-extern void SetCalculateCustomNonbondedForcesSim(gpuContext gpu);
-extern void GetCalculateCustomNonbondedForcesSim(gpuContext gpu);
-extern void SetCalculateLocalForcesSim(gpuContext gpu);
-extern void GetCalculateLocalForcesSim(gpuContext gpu);
-extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
-extern void GetCalculateObcGbsaBornSumSim(gpuContext gpu);
-extern void SetCalculateGBVIBornSumSim(gpuContext gpu);
-extern void GetCalculateGBVIBornSumSim(gpuContext gpu);
-extern void OPENMMCUDA_EXPORT SetCalculateObcGbsaForces2Sim(gpuContext gpu);
-extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
-extern void SetCalculateGBVIForces2Sim(gpuContext gpu);
-extern void GetCalculateGBVIForces2Sim(gpuContext gpu);
-extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
-extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
-extern void SetCalculatePMESim(gpuContext gpu);
-extern void GetCalculatePMESim(gpuContext gpu);
-extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);
-extern void GetForcesSim(gpuContext gpu);
-extern void SetShakeHSim(gpuContext gpu);
-extern void GetShakeHSim(gpuContext gpu);
-extern void SetLangevinUpdateSim(gpuContext gpu);
-extern void GetLangevinUpdateSim(gpuContext gpu);
-extern void SetSettleSim(gpuContext gpu);
-extern void GetSettleSim(gpuContext gpu);
-extern void SetCCMASim(gpuContext gpu);
-extern void GetCCMASim(gpuContext gpu);
-extern void SetVerletUpdateSim(gpuContext gpu);
-extern void GetVerletUpdateSim(gpuContext gpu);
-extern void SetBrownianUpdateSim(gpuContext gpu);
-extern void GetBrownianUpdateSim(gpuContext gpu);
-extern void SetRandomSim(gpuContext gpu);
-extern void GetRandomSim(gpuContext gpu);
-extern void SetCustomBondForceExpression(const Expression<256>& expression);
-extern void SetCustomBondEnergyExpression(const Expression<256>& expression);
-extern void SetCustomBondGlobalParams(const std::vector<float>&  paramValues);
-extern void SetCustomAngleForceExpression(const Expression<256>& expression);
-extern void SetCustomAngleEnergyExpression(const Expression<256>& expression);
-extern void SetCustomAngleGlobalParams(const std::vector<float>&  paramValues);
-extern void SetCustomTorsionForceExpression(const Expression<256>& expression);
-extern void SetCustomTorsionEnergyExpression(const Expression<256>& expression);
-extern void SetCustomTorsionGlobalParams(const std::vector<float>&  paramValues);
-extern void SetCustomExternalForceExpressions(const Expression<256>& expressionX, const Expression<256>& expressionY, const Expression<256>& expressionZ);
-extern void SetCustomExternalEnergyExpression(const Expression<256>& expression);
-extern void SetCustomExternalGlobalParams(const std::vector<float>& paramValues);
-extern void SetCustomNonbondedForceExpression(const Expression<256>& expression);
-extern void SetCustomNonbondedEnergyExpression(const Expression<256>& expression);
-extern void SetCustomNonbondedGlobalParams(const std::vector<float>& paramValues);
-
-extern void kPrintGBVI( gpuContext gpu, std::string callId, int call, FILE* log);
-extern void kPrintObc( gpuContext gpu, std::string callId, int call, FILE* log);
-
--- a/platforms/cuda-old/src/kernels/cudatypes.h
+++ b/platforms/cuda-old/src/kernels/cudatypes.h
-#ifndef CUDATYPES_H
-#define CUDATYPES_H
-
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdarg.h>
-#include <limits>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string>
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#include <cufft.h>
-#include <builtin_types.h>
-#include <vector_functions.h>
-#include "openmm/OpenMMException.h"
-
-#define RTERROR(status, s) \
-    if (status != cudaSuccess) { \
-        throw OpenMM::OpenMMException(std::string(s) + " " + cudaGetErrorString(status)); \
-    }
-
-#define LAUNCHERROR(s) \
-    { \
-        cudaError_t status = cudaGetLastError(); \
-        if (status != cudaSuccess) { \
-            throw OpenMM::OpenMMException(std::string("Error: ") + cudaGetErrorString(status) + " launching kernel " + s); \
-        } \
-    }
-
-// Pure virtual class to define an interface for objects resident both on GPU and CPU
-struct SoADeviceObject {
-    virtual void Allocate() = 0;
-    virtual void Deallocate() = 0;
-    virtual void Upload() = 0;
-    virtual void Download() = 0;
-};
-
-template <typename T>
-struct CUDAStream : public SoADeviceObject
-{
-    unsigned int    _length;
-    unsigned int    _subStreams;
-    unsigned int    _stride;
-    T**             _pSysStream;
-    T**             _pDevStream;
-    T*              _pSysData;
-    T*              _pDevData;
-    std::string     _name;
-    CUDAStream(int length, int subStreams = 1, std::string name="");
-    CUDAStream(unsigned int length, unsigned int subStreams = 1, std::string name="");
-    CUDAStream(unsigned int length, int subStreams = 1, std::string name="");
-    CUDAStream(int length, unsigned int subStreams = 1, std::string name="");
-    virtual ~CUDAStream();
-    void Allocate();
-    void Deallocate();
-    void Upload();
-    void Download();
-    void CopyFrom(const CUDAStream<T>& src);
-    void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1);
-    T& operator[](int index);
-};
-
-float CompareStreams(CUDAStream<float>& s1, CUDAStream<float>& s2, float tolerance, unsigned int maxindex = 0);
-
-template <typename T>
-CUDAStream<T>::CUDAStream(int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
-{
-    Allocate();   
-}
-
-template <typename T>
-CUDAStream<T>::CUDAStream(unsigned int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
-{
-    Allocate();   
-}
-
-template <typename T>
-CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
-{
-    Allocate();   
-}
-
-template <typename T>
-CUDAStream<T>::CUDAStream(int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
-{
-    Allocate();   
-}
-
-template <typename T>
-CUDAStream<T>::~CUDAStream()
-{
-    Deallocate();
-}
-
-template <typename T>
-void CUDAStream<T>::Allocate()
-{
-    cudaError_t status;
-    _pSysStream =   new T*[_subStreams];
-    _pDevStream =   new T*[_subStreams];
-    _pSysData =     new T[_subStreams * _stride];
-
-    status = cudaMalloc((void **) &_pDevData, _stride * _subStreams * sizeof(T));
-    RTERROR(status, (_name+": cudaMalloc in CUDAStream::Allocate failed").c_str());
-
-    for (unsigned int i = 0; i < _subStreams; i++)
-    {
-        _pSysStream[i] = _pSysData + i * _stride;
-        _pDevStream[i] = _pDevData + i * _stride;
-    }
-}
-
-template <typename T>
-void CUDAStream<T>::Deallocate()
-{
-    cudaError_t status;
-    delete[] _pSysStream;
-    _pSysStream = NULL;
-    delete[] _pDevStream;
-    _pDevStream = NULL;
-    delete[] _pSysData;
-    _pSysData = NULL;
-    status = cudaFree(_pDevData);
-    RTERROR(status, (_name+": cudaFree in CUDAStream::Deallocate failed").c_str());
-}
-
-template <typename T>
-void CUDAStream<T>::Upload()
-{
-    cudaError_t status;
-    status = cudaMemcpy(_pDevData, _pSysData, _stride * _subStreams * sizeof(T), cudaMemcpyHostToDevice);
-    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Upload failed").c_str());
-}
-
-template <typename T>
-void CUDAStream<T>::Download()
-{
-    cudaError_t status;
-    status = cudaMemcpy(_pSysData, _pDevData, _stride * _subStreams * sizeof(T), cudaMemcpyDeviceToHost);
-    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Download failed").c_str());
-}
-
-template <typename T>
-void CUDAStream<T>::CopyFrom(const CUDAStream<T>& src)
-{
-    cudaError_t status;
-    status = cudaMemcpy(_pDevData, src._pDevData, _stride * _subStreams * sizeof(T), cudaMemcpyDeviceToDevice);
-    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Copy failed").c_str());
-}
-
-template <typename T>
-void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
-{
-    T* pTemp = new T[_subStreams * _stride];
-    unsigned int stream = 0;
-    unsigned int pos = 0;
-    unsigned int newstride = _stride * _subStreams / newstreams;
-    unsigned int newlength = _length * _subStreams / newstreams;
-
-    // Copy data into new format
-    for (unsigned int i = 0; i < _length; i++)
-    {
-        for (unsigned int j = 0; j < _subStreams; j++)
-        {
-            pTemp[stream * newstride + pos] = _pSysStream[j][i];
-            stream++;
-            if (stream == newstreams)
-            {
-                stream = 0;
-                pos++;
-            }
-        }
-    }
-
-    // Remap stream pointers;
-    for (unsigned int i = 0; i < newstreams; i++)
-    {
-        _pSysStream[i] = _pSysData + i * newstride;
-        _pDevStream[i] = _pDevData + i * newstride;
-    }
-
-    // Copy data back intro original stream
-    for (unsigned int i = 0; i < newlength; i++)
-        for (unsigned int j = 0; j < newstreams; j++)
-            _pSysStream[j][i] = pTemp[j * newstride + i];
-    
-    _stride = newstride;
-    _length = newlength;
-    _subStreams = newstreams;
-    delete[] pTemp;
-}
-
-template <typename T>
-T& CUDAStream<T>::operator[](int index)
-{
-    return _pSysData[index];
-}
-
-static const unsigned int GRID = 32;
-static const unsigned int GRIDBITS = 5;
-static const int G8X_BLOCKS_PER_SM                      = 1;
-static const int GT2XX_BLOCKS_PER_SM                    = 1;
-static const int GF1XX_BLOCKS_PER_SM                    = 1;
-
-static const int G8X_NONBOND_THREADS_PER_BLOCK          = 256;
-static const int GT2XX_NONBOND_THREADS_PER_BLOCK        = 320;
-static const int GF1XX_NONBOND_THREADS_PER_BLOCK        = 768;
-//static const int GF1XX_NONBOND_THREADS_PER_BLOCK        = 768;
-
-static const int G8X_BORNFORCE2_THREADS_PER_BLOCK       = 256;
-static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK     = 320;
-static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK     = 768;
-//static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK     = 768;
-
-static const int G8X_SHAKE_THREADS_PER_BLOCK            = 128;
-static const int GT2XX_SHAKE_THREADS_PER_BLOCK          = 256;
-static const int GF1XX_SHAKE_THREADS_PER_BLOCK          = 512;
-
-static const int G8X_UPDATE_THREADS_PER_BLOCK           = 192;
-static const int GT2XX_UPDATE_THREADS_PER_BLOCK         = 384;
-static const int GF1XX_UPDATE_THREADS_PER_BLOCK         = 768;
-
-static const int G8X_LOCALFORCES_THREADS_PER_BLOCK      = 192;
-static const int GT2XX_LOCALFORCES_THREADS_PER_BLOCK    = 384;
-static const int GF1XX_LOCALFORCES_THREADS_PER_BLOCK    = 768;
-
-static const int G8X_THREADS_PER_BLOCK                  = 256;
-static const int GT2XX_THREADS_PER_BLOCK                = 256;
-static const int GF1XX_THREADS_PER_BLOCK                = 512;
-
-static const int G8X_RANDOM_THREADS_PER_BLOCK           = 256;
-static const int GT2XX_RANDOM_THREADS_PER_BLOCK         = 384;
-static const int GF1XX_RANDOM_THREADS_PER_BLOCK         = 768;
-
-static const int G8X_NONBOND_WORKUNITS_PER_SM           = 220;
-static const int GT2XX_NONBOND_WORKUNITS_PER_SM         = 256;
-static const int GF1XX_NONBOND_WORKUNITS_PER_SM         = 768;
-
-static const unsigned int MAX_STACK_SIZE = 8;
-static const unsigned int MAX_TABULATED_FUNCTIONS = 4;
-
-static const float PI = 3.14159265358979323846f;
-
-static const int PME_ORDER = 5;
-
-enum CudaNonbondedMethod
-{
-    NO_CUTOFF,
-    CUTOFF,
-    PERIODIC,
-    EWALD,
-    PARTICLE_MESH_EWALD
-};
-
-enum ExpressionOp {
-    VARIABLE0 = 0, VARIABLE1, VARIABLE2, VARIABLE3, VARIABLE4, VARIABLE5, VARIABLE6, VARIABLE7, VARIABLE8, MULTIPLY, DIVIDE, ADD, SUBTRACT, POWER, MULTIPLY_CONSTANT, POWER_CONSTANT, ADD_CONSTANT,
-        GLOBAL, CONSTANT, CUSTOM, CUSTOM_DERIV, NEGATE, RECIPROCAL, SQRT, EXP, LOG, SQUARE, CUBE, STEP, SIN, COS, SEC, CSC, TAN, COT, ASIN, ACOS, ATAN, SINH, COSH, TANH, ERF, ERFC,
-        MIN, MAX, ABS
-};
-
-template<int SIZE>
-struct Expression {
-    int op[SIZE];
-    float arg[SIZE];
-    int length, stackSize;
-};
-
-struct cudaGmxSimulation {
-    // Constants
-    unsigned int    atoms;                          // Number of atoms
-    unsigned int    paddedNumberOfAtoms;            // Padded number of atoms
-    unsigned int    blocks;                         // Number of blocks to launch across linear kernels
-    unsigned int    blocksPerSM;                    // Number of blocks per share memory
-    unsigned int    nonbond_blocks;                 // Number of blocks to launch across CDLJ and Born Force Part1
-    unsigned int    bornForce2_blocks;              // Number of blocks to launch across Born Force 2
-    unsigned int    interaction_blocks;             // Number of blocks to launch when identifying interacting tiles
-    unsigned int    threads_per_block;              // Threads per block to launch
-    unsigned int    nonbond_threads_per_block;      // Threads per block in nonbond kernel calls
-    unsigned int    bornForce2_threads_per_block;   // Threads per block in nonbond kernel calls
-    unsigned int    max_update_threads_per_block;   // Maximum threads per block in update kernel calls
-    unsigned int    update_threads_per_block;       // Threads per block in update kernel calls
-    unsigned int    bf_reduce_threads_per_block;    // Threads per block in Born Force reduction calls
-    unsigned int    bsf_reduce_threads_per_block;   // Threads per block in Born Sum And Forces reduction calls
-    unsigned int    max_shake_threads_per_block;    // Maximum threads per block in shake kernel calls
-    unsigned int    shake_threads_per_block;        // Threads per block in shake kernel calls
-    unsigned int    settle_threads_per_block;       // Threads per block in SETTLE kernel calls
-    unsigned int    ccma_threads_per_block;         // Threads per block in CCMA kernel calls
-    unsigned int    max_localForces_threads_per_block;  // Threads per block in local forces kernel calls
-    unsigned int    localForces_threads_per_block;  // Threads per block in local forces kernel calls
-    unsigned int    random_threads_per_block;       // Threads per block in RNG kernel calls
-    unsigned int    interaction_threads_per_block;  // Threads per block when identifying interacting tiles
-    unsigned int    custom_exception_threads_per_block; // Threads per block in custom nonbonded exception kernel calls
-    unsigned int    customExpressionStackSize;      // Stack size for evaluating custom nonbonded forces
-    unsigned int    workUnits;                      // Number of work units
-    unsigned int*   pWorkUnit;                      // Pointer to work units
-    unsigned int*   pInteractingWorkUnit;           // Pointer to work units that have interactions
-    unsigned int*   pInteractionFlag;               // Flags for which work units have interactions
-    float2*         pStepSize;                      // The size of the previous and current time steps
-    float*          pLangevinParameters;            // Parameters used for Langevin integration
-    float           errorTol;                       // Error tolerance for selecting the step size
-    size_t*         pInteractionCount;              // A count of the number of work units which have interactions
-    unsigned int    nonbond_workBlock;              // Number of work units running simultaneously per block in CDLJ and Born Force Part 1
-    unsigned int    bornForce2_workBlock;           // Number of work units running second half of Born Forces calculation
-    unsigned int    workUnitsPerSM;                 // Number of workblocks per SM
-    unsigned int    nbWorkUnitsPerBlock;            // Number of work units assigned to each nonbond block
-    unsigned int    nbWorkUnitsPerBlockRemainder;   // Remainder of work units to assign across lower numbered nonbond blocks
-    unsigned int    bf2WorkUnitsPerBlock;           // Number of work units assigned to each bornForce2 block
-    unsigned int    bf2WorkUnitsPerBlockRemainder;  // Remainder of work units to assign across lower numbered bornForce2 blocks
-
-
-    unsigned int    stride;                         // Atomic attributes stride
-    unsigned int    stride2;                        // Atomic attributes stride x 2
-    unsigned int    stride3;                        // Atomic attributes stride x 3
-    unsigned int    stride4;                        // Atomic attributes stride x 4
-    unsigned int    nonbondOutputBuffers;           // Nonbond output buffers per nonbond call
-    unsigned int    outputBuffers;                  // Number of output buffers
-    unsigned int    energyOutputBuffers;            // Number of energy output buffers
-    float           bigFloat;                       // Floating point value used as a flag for Shaken atoms 
-    float           epsfac;                         // Epsilon factor for CDLJ calculations
-    CudaNonbondedMethod nonbondedMethod;            // How to handle nonbonded interactions
-    CudaNonbondedMethod customNonbondedMethod;      // How to handle custom nonbonded interactions
-    float           nonbondedCutoff;                // Cutoff distance for nonbonded interactions
-    float           nonbondedCutoffSqr;             // Square of the cutoff distance for nonbonded interactions
-    float           periodicBoxSizeX;               // The X dimension of the periodic box
-    float           periodicBoxSizeY;               // The Y dimension of the periodic box
-    float           periodicBoxSizeZ;               // The Z dimension of the periodic box
-    float           invPeriodicBoxSizeX;            // The 1 over the X dimension of the periodic box
-    float           invPeriodicBoxSizeY;            // The 1 over the Y dimension of the periodic box
-    float           invPeriodicBoxSizeZ;            // The 1 over the Z dimension of the periodic box
-    float           recipBoxSizeX;                  // The X dimension of the reciprocal box for Ewald summation
-    float           recipBoxSizeY;                  // The Y dimension of the reciprocal box for Ewald summation
-    float           recipBoxSizeZ;                  // The Z dimension of the reciprocal box for Ewald summation
-    float           cellVolume;                     // Ewald parameter alpha (a.k.a. kappa)
-    float           alphaEwald;                     // Ewald parameter alpha (a.k.a. kappa)
-    float           factorEwald;                    // - 1 ( 4 * alphaEwald * alphaEwald)
-    int             kmaxX;                          // Maximum number of reciprocal vectors in the X direction
-    int             kmaxY;                          // Maximum number of reciprocal vectors in the Y direction
-    int             kmaxZ;                          // Maximum number of reciprocal vectors in the Z direction
-    float           reactionFieldK;                 // Constant for reaction field correction
-    float           reactionFieldC;                 // Constant for reaction field correction
-    float           probeRadius;                    // SASA probe radius
-    float           surfaceAreaFactor;              // ACE approximation surface area factor
-    float           electricConstant;               // ACE approximation electric constant
-    float           forceConversionFactor;          // kJ to kcal force conversion factor
-    float           preFactor;                      // Born electrostatic pre-factor
-    float           dielectricOffset;               // Born dielectric offset
-    float           alphaOBC;                       // OBC alpha factor
-    float           betaOBC;                        // OBC beta factor
-    float           gammaOBC;                       // OBC gamma factor
-    float           deltaT;                         // Molecular dynamics deltaT constant
-    float           oneOverDeltaT;                  // 1/deltaT
-    float           T;                              // Temperature
-    float           kT;                             // Boltzmann's constant times T
-    float           noiseAmplitude;                 // The magnitude of the noise for Brownian dynamics
-    float           tau;                            // Inverse friction for Langevin or Brownian dynamics
-    float           tauDeltaT;                      // tau*deltaT
-    float           collisionFrequency;             // Collision frequency for Andersen thermostat
-    float2*         pObcData;                       // Pointer to fixed Born data
-    int             gbviBornRadiusScalingMethod;    // scaling method for GB/VI Born radii
-    float           gbviQuinticLowerLimitFactor;    // Lower limit factor for scaing of GB/VI Born radii using quintic spline
-    float           gbviQuinticUpperBornRadiusLimit;// Upper limit for GB/VI Born radii 
-    float4*         pGBVIData;                      // Pointer to fixed Born data for GB/VI algorithm
-    float*          pGBVISwitchDerivative;          // Pointer to GB/VI Born switch derivatives
-    float2*         pAttr;                          // Pointer to additional atom attributes (sig, eps)
-    float4*         pCustomParams;                  // Pointer to atom parameters for custom nonbonded force
-    unsigned int    customExceptions;               // Number of custom nonbonded exceptions
-    unsigned int    customParameters;               // Number of parameters for custom nonbonded interactions
-    int4*           pCustomBondID;                  // Atom indices for custom bonds
-    float4*         pCustomBondParams;              // Parameters for custom bonds
-    unsigned int    customBonds;                    // Number of custom bonds
-    unsigned int    customBondParameters;           // Number of parameters for custom bonds
-    int4*           pCustomAngleID1;                // Atom indices for custom angles
-    int2*           pCustomAngleID2;                // Atom indices for custom angles
-    float4*         pCustomAngleParams;             // Parameters for custom angles
-    unsigned int    customAngles;                   // Number of custom angles
-    unsigned int    customAngleParameters;          // Number of parameters for custom angles
-    int4*           pCustomTorsionID1;              // Atom indices for custom torsions
-    int4*           pCustomTorsionID2;              // Atom indices for custom torsions
-    float4*         pCustomTorsionParams;           // Parameters for custom torsions
-    unsigned int    customTorsions;                 // Number of custom torsions
-    unsigned int    customTorsionParameters;        // Number of parameters for custom torsions
-    int*            pCustomExternalID;              // Atom indices for custom external force
-    float4*         pCustomExternalParams;          // Parameters for custom external force
-    unsigned int    customExternals;                // Number of particles for custom external force
-    unsigned int    customExternalParameters;       // Number of parameters for custom external force
-    float4*         pTabulatedFunctionCoefficients[MAX_TABULATED_FUNCTIONS]; // The spline coefficients for each tabulated function
-    float4*         pTabulatedFunctionParams;       // The min, max, and spacing for each tabulated function
-    float2*         pEwaldCosSinSum;                // Pointer to the cos/sin sums (ewald)
-    float*          pTabulatedErfc;                 // Tabulated values for erfc()
-    int             tabulatedErfcSize;              // The number of tabulated values for erfc()
-    float           tabulatedErfcScale;             // Scale factor for the argument to erfc()
-    int3            pmeGridSize;                    // The dimensions of the grid for particle mesh Ewald
-    int3            pmeGroupSize;                   // The dimensions of the groups used in charge spreading for PME
-    cufftComplex*   pPmeGrid;                       // Grid points for particle mesh Ewald
-    float*          pPmeBsplineModuli[3];
-    float4*         pPmeBsplineTheta;
-    float4*         pPmeBsplineDtheta;
-    int*            pPmeAtomRange;                  // The range of sorted atoms at each grid point
-    int2*           pPmeAtomGridIndex;              // The grid point each atom is at
-    unsigned int    bonds;                          // Number of bonds
-    int4*           pBondID;                        // Bond atom and output buffer IDs
-    float2*         pBondParameter;                 // Bond parameters
-    unsigned int    bond_angles;                    // Number of bond angles
-    int4*           pBondAngleID1;                  // Bond angle atom and first output buffer IDs
-    int2*           pBondAngleID2;                  // Bond angle output buffer IDs
-    float2*         pBondAngleParameter;            // Bond angle parameters
-    unsigned int    dihedrals;                      // Number of dihedrals
-    int4*           pDihedralID1;                   // Dihedral IDs
-    int4*           pDihedralID2;                   // Dihedral output buffer IDs
-    float4*         pDihedralParameter;             // Dihedral parameters
-    unsigned int    rb_dihedrals;                   // Number of Ryckaert Bellemans dihedrals
-    int4*           pRbDihedralID1;                 // Ryckaert Bellemans Dihedral IDs
-    int4*           pRbDihedralID2;                 // Ryckaert Bellemans Dihedral output buffer IDs
-    float4*         pRbDihedralParameter1;          // Ryckaert Bellemans Dihedral parameters
-    float2*         pRbDihedralParameter2;          // Ryckaert Bellemans Dihedral parameters
-    unsigned int    LJ14s;                          // Number of Lennard Jones 1-4 interactions
-    int4*           pLJ14ID;                        // Lennard Jones 1-4 atom and output buffer IDs
-    float4*         pLJ14Parameter;                 // Lennard Jones 1-4 parameters
-    float           inverseTotalMass;               // Used in linear momentum removal
-    unsigned int    ShakeConstraints;               // Total number of Shake constraints
-    unsigned int    settleConstraints;              // Total number of Settle constraints
-    unsigned int    ccmaConstraints;                // Total number of CCMA constraints.
-    unsigned int    rigidClusters;                  // Total number of rigid clusters
-    unsigned int    maxRigidClusterSize;            // The size of the largest rigid cluster
-    unsigned int    clusterShakeBlockSize;          // The number of threads to process each rigid cluster
-    unsigned int    maxShakeIterations;             // Maximum shake iterations
-    unsigned int    degreesOfFreedom;               // Number of degrees of freedom in system
-    float           shakeTolerance;                 // Shake tolerance
-    float           InvMassJ;                       // Shake inverse mass for hydrogens
-    int*            pNonShakeID;                    // Not Shaking atoms
-    int4*           pShakeID;                       // Shake atoms and phase
-    float4*         pShakeParameter;                // Shake parameters
-    int4*           pSettleID;                      // Settle atoms
-    float2*         pSettleParameter;               // Settle parameters
-    unsigned int*   pExclusion;                     // Nonbond exclusion data
-    unsigned int*   pExclusionIndex;                // Index of exclusion data for each work unit
-    unsigned int    bond_offset;                    // Offset to end of bonds
-    unsigned int    bond_angle_offset;              // Offset to end of bond angles
-    unsigned int    dihedral_offset;                // Offset to end of dihedrals
-    unsigned int    rb_dihedral_offset;             // Offset to end of Ryckaert Bellemans dihedrals
-    unsigned int    LJ14_offset;                    // Offset to end of Lennard Jones 1-4 parameters
-    int*            pAtomIndex;                     // The original index of each atom
-    float4*         pGridBoundingBox;               // The size of each grid cell
-    float4*         pGridCenter;                    // The center of each grid cell
-    int2*           pCcmaAtoms;                     // The atoms connected by each CCMA constraint
-    float4*         pCcmaDistance;                  // The displacement vector (x, y, z) and constraint distance (w) for each CCMA constraint
-    float*          pCcmaDelta1;                    // Workspace for CCMA
-    float*          pCcmaDelta2;                    // Workspace for CCMA
-    int*            pCcmaAtomConstraints;           // The indices of constraints involving each atom
-    int*            pCcmaNumAtomConstraints;        // The number of constraints involving each atom
-    int*            ccmaConvergedDeviceMarker;      // Device memory used to communicate that CCMA has converged
-    float*          pCcmaReducedMass;               // The reduced mass for each CCMA constraint
-    unsigned int*   pConstraintMatrixColumn;        // The column of each element in the constraint matrix.
-    float*          pConstraintMatrixValue;         // The value of each element in the constraint matrix.
-
-    // Mutable stuff
-    float4*         pPosq;                          // Pointer to atom positions and charges
-    float4*         pPosqP;                         // Pointer to mid-integration atom positions
-    float4*         pOldPosq;                       // Pointer to old atom positions
-    float4*         pVelm4;                         // Pointer to atom velocity and inverse mass
-    float4*         pForce4;                        // Pointer to force data
-    float*          pEnergy;                        // Pointer to energy output buffer
-    float*          pBornForce;                     // Pointer to Born force data
-    float*	    pBornSum;                       // Pointer to Born Radii calculation output buffers
-    float*	    pBornRadii;                     // Pointer to Born Radii
-    float*          pObcChain;                      // Pointer to OBC chain data
-    float4*         pLinearMomentum;                // Pointer to linear momentum
-    
-    // Random numbers
-    float4*         pRandom4;                       // Pointer to 4 random numbers
-    float2*         pRandom2;                       // Pointer to 2 random numbers
-    uint4*          pRandomSeed;                    // Pointer to random seeds
-    int*            pRandomPosition;                // Pointer to random number positions
-    unsigned int    randoms;                        // Number of randoms
-    unsigned int    totalRandoms;                   // Number of randoms plus overflow.
-    unsigned int    randomIterations;               // Number of iterations before regenerating randoms
-    unsigned int    randomFrames;                   // Number of frames of random numbers
-};
-
-struct Vectors {
-    float3 v0;
-    float3 v1;
-    float3 v2;
-};
-
-#endif
--- a/platforms/cuda-old/src/kernels/gpu.cpp
+++ b/platforms/cuda-old/src/kernels/gpu.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <string.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <cmath>
-#include <map>
-#include <set>
-#include <algorithm>
-#ifdef WIN32
-  #define _USE_MATH_DEFINES /* M_PI */
-  #include <math.h>
-  #include <windows.h>
-#else
-  #include <stdint.h>
-#endif
-using namespace std;
-
-#include "gputypes.h"
-#include "cudaKernels.h"
-#include "hilbert.h"
-#include "openmm/OpenMMException.h"
-#include "openmm/internal/SplineFitter.h"
-#include "quern.h"
-#include "Lepton.h"
-#include "rng.h"
-#include "../CudaForceInfo.h"
-
-// In case we're using some primitive version of Visual Studio this will
-// make sure that erf() and erfc() are defined.
-#include "openmm/internal/MSVC_erfc.h"
-#include "openmm/internal/windowsExport.h"
-
-using OpenMM::OpenMMException;
-using Lepton::Operation;
-
-struct ShakeCluster {
-    int centralID;
-    int peripheralID[3];
-    int size;
-    bool valid;
-    float distance;
-    float centralInvMass, peripheralInvMass;
-    ShakeCluster() : valid(true) {
-    }
-    ShakeCluster(int centralID, float invMass) : centralID(centralID), centralInvMass(invMass), size(0), valid(true) {
-    }
-    void addAtom(int id, float dist, float invMass) {
-        if (size == 3 || (size > 0 && dist != distance) || (size > 0 && invMass != peripheralInvMass))
-            valid = false;
-        else {
-            peripheralID[size++] = id;
-            distance = dist;
-            peripheralInvMass = invMass;
-        }
-    }
-};
-
-struct Constraint
-{
-    Constraint(int atom1, int atom2, float distance2) : atom1(atom1), atom2(atom2), distance2(distance2) {
-    }
-    int atom1, atom2;
-    float distance2;
-};
-
-struct ConstraintOrderer : public binary_function<int, int, bool> {
-    const vector<int>& atom1;
-    const vector<int>& atom2;
-    const vector<int>& constraints;
-    ConstraintOrderer(const vector<int>& atom1, const vector<int>& atom2, const vector<int>& constraints) : atom1(atom1), atom2(atom2), constraints(constraints) {
-    }
-    bool operator()(int x, int y) {
-        int ix = constraints[x];
-        int iy = constraints[y];
-        if (atom1[ix] != atom1[iy])
-            return atom1[ix] < atom1[iy];
-        return atom2[ix] < atom2[iy];
-    }
-};
-
-struct Molecule {
-    vector<int> atoms;
-    vector<int> constraints;
-    vector<vector<int> > groups;
-};
-
-static const float dielectricOffset         =    0.009f;
-static const float probeRadius              =    0.14f;
-static const float forceConversionFactor    =    0.4184f;
-
-//static const float surfaceAreaFactor        =   -6.0f * 0.06786f * forceConversionFactor * 1000.0f;  // PI * 4.0f * 0.0049f * 1000.0f;
-//static const float surfaceAreaFactor        =   -6.0f * PI * 4.0f * 0.0049f * 1000.0f;
-static const float surfaceAreaFactor        = -6.0f*PI*0.0216f*1000.0f*0.4184f;
-//static const float surfaceAreaFactor        = -1.7035573959e+001;
-//static const float surfaceAreaFactor        = -166.03185f;
-//static const float surfaceAreaFactor        = 1.0f;
-
-static const float alphaOBC                 =    1.0f;
-static const float betaOBC                  =    0.8f;
-static const float gammaOBC                 =    4.85f;
-static const float kcalMolTokJNM            =   -0.4184f;
-static const float electricConstant         = -166.03185f;
-static const float defaultInnerDielectric   =    1.0f;
-static const float defaultSolventDielectric =   78.3f;
-static const float KILO                     =    1e3;                      // Thousand
-static const float BOLTZMANN                =    1.380658e-23f;            // (J/K)    
-static const float AVOGADRO                 =    6.0221367e23f;            // ()        
-static const float RGAS                     =    BOLTZMANN * AVOGADRO;     // (J/(mol K))
-static const float BOLTZ                    =    (RGAS / KILO);            // (kJ/(mol K)) 
-
-#define DUMP_PARAMETERS 0
-
-template <int SIZE>
-static Expression<SIZE> createExpression(gpuContext gpu, const string& expression, const Lepton::ExpressionProgram& program, const vector<string>& variables,
-        const vector<string>& globalParamNames, unsigned int& maxStackSize) {
-    Expression<SIZE> exp;
-    if (program.getNumOperations() > SIZE)
-        throw OpenMMException("Expression contains too many operations: "+expression);
-    exp.length = program.getNumOperations();
-    exp.stackSize = program.getStackSize();
-    if (exp.stackSize > (int) maxStackSize)
-        maxStackSize = exp.stackSize;
-    for (int i = 0; i < program.getNumOperations(); i++) {
-        const Operation& op = program.getOperation(i);
-        switch (op.getId()) {
-            case Operation::CONSTANT:
-                exp.op[i] = CONSTANT;
-                exp.arg[i] = (float) dynamic_cast<const Operation::Constant*>(&op)->getValue();
-                break;
-            case Operation::VARIABLE:
-                if (variables.size() > 0 && op.getName() == variables[0])
-                    exp.op[i] = VARIABLE0;
-                else if (variables.size() > 1 && op.getName() == variables[1])
-                    exp.op[i] = VARIABLE1;
-                else if (variables.size() > 2 && op.getName() == variables[2])
-                    exp.op[i] = VARIABLE2;
-                else if (variables.size() > 3 && op.getName() == variables[3])
-                    exp.op[i] = VARIABLE3;
-                else if (variables.size() > 4 && op.getName() == variables[4])
-                    exp.op[i] = VARIABLE4;
-                else if (variables.size() > 5 && op.getName() == variables[5])
-                    exp.op[i] = VARIABLE5;
-                else if (variables.size() > 6 && op.getName() == variables[6])
-                    exp.op[i] = VARIABLE6;
-                else if (variables.size() > 7 && op.getName() == variables[7])
-                    exp.op[i] = VARIABLE7;
-                else if (variables.size() > 8 && op.getName() == variables[8])
-                    exp.op[i] = VARIABLE8;
-                else {
-                    int j;
-                    for (j = 0; j < (int) globalParamNames.size() && op.getName() != globalParamNames[j]; j++);
-                    if (j == globalParamNames.size())
-                        throw OpenMMException("Unknown variable '"+op.getName()+"' in expression: "+expression);
-                    exp.op[i] = GLOBAL;
-                    exp.arg[i] = (float) j;
-                }
-                break;
-            case Operation::CUSTOM:
-                exp.op[i] = dynamic_cast<const Operation::Custom*>(&op)->getDerivOrder()[0] == 0 ? CUSTOM : CUSTOM_DERIV;
-                for (int j = 0; j < MAX_TABULATED_FUNCTIONS; j++)
-                    if (op.getName() == gpu->tabulatedFunctions[j].name) {
-                        exp.arg[i] = (float) j;
-                        break;
-                    }
-                break;
-            case Operation::ADD:
-                exp.op[i] = ADD;
-                break;
-            case Operation::SUBTRACT:
-                exp.op[i] = SUBTRACT;
-                break;
-            case Operation::MULTIPLY:
-                exp.op[i] = MULTIPLY;
-                break;
-            case Operation::DIVIDE:
-                exp.op[i] = DIVIDE;
-                break;
-            case Operation::POWER:
-                exp.op[i] = POWER;
-                break;
-            case Operation::NEGATE:
-                exp.op[i] = NEGATE;
-                break;
-            case Operation::SQRT:
-                exp.op[i] = SQRT;
-                break;
-            case Operation::EXP:
-                exp.op[i] = EXP;
-                break;
-            case Operation::LOG:
-                exp.op[i] = LOG;
-                break;
-            case Operation::SIN:
-                exp.op[i] = SIN;
-                break;
-            case Operation::COS:
-                exp.op[i] = COS;
-                break;
-            case Operation::SEC:
-                exp.op[i] = SEC;
-                break;
-            case Operation::CSC:
-                exp.op[i] = CSC;
-                break;
-            case Operation::TAN:
-                exp.op[i] = TAN;
-                break;
-            case Operation::COT:
-                exp.op[i] = COT;
-                break;
-            case Operation::ASIN:
-                exp.op[i] = ASIN;
-                break;
-            case Operation::ACOS:
-                exp.op[i] = ACOS;
-                break;
-            case Operation::ATAN:
-                exp.op[i] = ATAN;
-                break;
-            case Operation::SINH:
-                exp.op[i] = SINH;
-                break;
-            case Operation::COSH:
-                exp.op[i] = COSH;
-                break;
-            case Operation::TANH:
-                exp.op[i] = TANH;
-                break;
-            case Operation::ERF:
-                exp.op[i] = ERF;
-                break;
-            case Operation::ERFC:
-                exp.op[i] = ERFC;
-                break;
-            case Operation::STEP:
-                exp.op[i] = STEP;
-                break;
-            case Operation::SQUARE:
-                exp.op[i] = SQUARE;
-                break;
-            case Operation::CUBE:
-                exp.op[i] = CUBE;
-                break;
-            case Operation::RECIPROCAL:
-                exp.op[i] = RECIPROCAL;
-                break;
-            case Operation::ADD_CONSTANT:
-                exp.op[i] = ADD_CONSTANT;
-                exp.arg[i] = (float) dynamic_cast<const Operation::AddConstant*>(&op)->getValue();
-                break;
-            case Operation::MULTIPLY_CONSTANT:
-                exp.op[i] = MULTIPLY_CONSTANT;
-                exp.arg[i] = (float) dynamic_cast<const Operation::MultiplyConstant*>(&op)->getValue();
-                break;
-            case Operation::POWER_CONSTANT:
-                exp.op[i] = POWER_CONSTANT;
-                exp.arg[i] = (float) dynamic_cast<const Operation::PowerConstant*>(&op)->getValue();
-                break;
-            case Operation::MIN:
-                exp.op[i] = MIN;
-                break;
-            case Operation::MAX:
-                exp.op[i] = MAX;
-                break;
-            case Operation::ABS:
-                exp.op[i] = ABS;
-                break;
-        }
-    }
-    return exp;
-}
-
-extern "C"
-void gpuSetBondParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& length, const vector<float>& k)
-{
-    int bonds = atom1.size();
-    gpu->sim.bonds                              = bonds;
-    CUDAStream<int4>* psBondID                  = new CUDAStream<int4>(bonds, 1, "BondID");
-    gpu->psBondID                               = psBondID;
-    gpu->sim.pBondID                            = psBondID->_pDevStream[0];
-    CUDAStream<float2>* psBondParameter         = new CUDAStream<float2>(bonds, 1, "BondParameter");
-    gpu->psBondParameter                        = psBondParameter;
-    gpu->sim.pBondParameter                     = psBondParameter->_pDevStream[0];
-    for (int i = 0; i < bonds; i++)
-    {
-        (*psBondID)[i].x = atom1[i];
-        (*psBondID)[i].y = atom2[i];
-        (*psBondParameter)[i].x = length[i];
-        (*psBondParameter)[i].y = k[i];
-        psBondID->_pSysData[i].z = gpu->pOutputBufferCounter[psBondID->_pSysData[i].x]++;
-        psBondID->_pSysData[i].w = gpu->pOutputBufferCounter[psBondID->_pSysData[i].y]++;
-#if (DUMP_PARAMETERS == 1)                
-        cout << 
-            i << " " << 
-            (*psBondID)[i].x << " " <<
-            (*psBondID)[i].y << " " <<
-            (*psBondID)[i].z << " " <<
-            (*psBondID)[i].w << " " <<
-            (*psBondParameter)[i].x << " " <<
-            (*psBondParameter)[i].y <<
-            endl;
-#endif
-    }
-    psBondID->Upload();
-    psBondParameter->Upload();
-}
-
-extern "C"
-void gpuSetBondAngleParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3,
-        const vector<float>& angle, const vector<float>& k)
-{
-    int bond_angles = atom1.size();
-    gpu->sim.bond_angles                        = bond_angles;
-    CUDAStream<int4>* psBondAngleID1            = new CUDAStream<int4>(bond_angles, 1, "BondAngleID1");
-    gpu->psBondAngleID1                         = psBondAngleID1;
-    gpu->sim.pBondAngleID1                      = psBondAngleID1->_pDevStream[0];
-    CUDAStream<int2>* psBondAngleID2            = new CUDAStream<int2>(bond_angles, 1, "BondAngleID2");
-    gpu->psBondAngleID2                         = psBondAngleID2;
-    gpu->sim.pBondAngleID2                      = psBondAngleID2->_pDevStream[0];
-    CUDAStream<float2>* psBondAngleParameter    = new CUDAStream<float2>(bond_angles, 1, "BondAngleParameter");
-    gpu->psBondAngleParameter                   = psBondAngleParameter;
-    gpu->sim.pBondAngleParameter                = psBondAngleParameter->_pDevStream[0];        
-
-    for (int i = 0; i < bond_angles; i++)
-    {
-        (*psBondAngleID1)[i].x = atom1[i];
-        (*psBondAngleID1)[i].y = atom2[i];
-        (*psBondAngleID1)[i].z = atom3[i];
-        (*psBondAngleParameter)[i].x = angle[i];
-        (*psBondAngleParameter)[i].y = k[i];
-        psBondAngleID1->_pSysData[i].w = gpu->pOutputBufferCounter[psBondAngleID1->_pSysData[i].x]++;
-        psBondAngleID2->_pSysData[i].x = gpu->pOutputBufferCounter[psBondAngleID1->_pSysData[i].y]++;
-        psBondAngleID2->_pSysData[i].y = gpu->pOutputBufferCounter[psBondAngleID1->_pSysData[i].z]++;
-#if (DUMP_PARAMETERS == 1)
-         cout << 
-            i << " " << 
-            (*psBondAngleID1)[i].x << " " <<
-            (*psBondAngleID1)[i].y << " " <<
-            (*psBondAngleID1)[i].z << " " <<
-            (*psBondAngleID1)[i].w << " " <<
-            (*psBondAngleID2)[i].x << " " <<
-            (*psBondAngleID2)[i].y << " " <<
-            (*psBondAngleParameter)[i].x << " " <<
-            (*psBondAngleParameter)[i].y <<
-            endl;
-#endif
-    }
-    psBondAngleID1->Upload();
-    psBondAngleID2->Upload();
-    psBondAngleParameter->Upload();
-}
-
-extern "C"
-void gpuSetDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4,
-        const vector<float>& k, const vector<float>& phase, const vector<int>& periodicity)
-{
-        int dihedrals = atom1.size();
-        gpu->sim.dihedrals = dihedrals;
-        CUDAStream<int4>* psDihedralID1             = new CUDAStream<int4>(dihedrals, 1, "DihedralID1");
-        gpu->psDihedralID1                          = psDihedralID1;
-        gpu->sim.pDihedralID1                       = psDihedralID1->_pDevStream[0];
-        CUDAStream<int4>* psDihedralID2             = new CUDAStream<int4>(dihedrals, 1, "DihedralID2");
-        gpu->psDihedralID2                          = psDihedralID2;
-        gpu->sim.pDihedralID2                       = psDihedralID2->_pDevStream[0];
-        CUDAStream<float4>* psDihedralParameter     = new CUDAStream<float4>(dihedrals, 1, "DihedralParameter");
-        gpu->psDihedralParameter                    = psDihedralParameter;
-        gpu->sim.pDihedralParameter                 = psDihedralParameter->_pDevStream[0];
-        for (int i = 0; i < dihedrals; i++)
-        {
-            (*psDihedralID1)[i].x = atom1[i];
-            (*psDihedralID1)[i].y = atom2[i];
-            (*psDihedralID1)[i].z = atom3[i];
-            (*psDihedralID1)[i].w = atom4[i];
-            (*psDihedralParameter)[i].x = k[i];
-            (*psDihedralParameter)[i].y = phase[i];
-            (*psDihedralParameter)[i].z = (float) periodicity[i];
-            psDihedralID2->_pSysData[i].x = gpu->pOutputBufferCounter[psDihedralID1->_pSysData[i].x]++;
-            psDihedralID2->_pSysData[i].y = gpu->pOutputBufferCounter[psDihedralID1->_pSysData[i].y]++;
-            psDihedralID2->_pSysData[i].z = gpu->pOutputBufferCounter[psDihedralID1->_pSysData[i].z]++;
-            psDihedralID2->_pSysData[i].w = gpu->pOutputBufferCounter[psDihedralID1->_pSysData[i].w]++;
-#if (DUMP_PARAMETERS == 1)
-            cout << 
-                i << " " << 
-                (*psDihedralID1)[i].x << " " <<
-                (*psDihedralID1)[i].y << " " <<
-                (*psDihedralID1)[i].z << " " <<
-                (*psDihedralID1)[i].w << " " <<
-                (*psDihedralID2)[i].x << " " <<
-                (*psDihedralID2)[i].y << " " <<
-                (*psDihedralID2)[i].z << " " <<
-                (*psDihedralID2)[i].w << " " <<
-                (*psDihedralParameter)[i].x << " " <<
-                (*psDihedralParameter)[i].y << " " <<
-                (*psDihedralParameter)[i].z << endl;
-#endif
-        }
-        psDihedralID1->Upload();
-        psDihedralID2->Upload();
-        psDihedralParameter->Upload();
-}
-
-extern "C"
-void gpuSetRbDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4,
-        const vector<float>& c0, const vector<float>& c1, const vector<float>& c2, const vector<float>& c3, const vector<float>& c4, const vector<float>& c5)
-{
-    int rb_dihedrals = atom1.size();
-    gpu->sim.rb_dihedrals = rb_dihedrals;
-    CUDAStream<int4>* psRbDihedralID1           = new CUDAStream<int4>(rb_dihedrals, 1, "RbDihedralID1");
-    gpu->psRbDihedralID1                        = psRbDihedralID1;
-    gpu->sim.pRbDihedralID1                     = psRbDihedralID1->_pDevStream[0];
-    CUDAStream<int4>* psRbDihedralID2           = new CUDAStream<int4>(rb_dihedrals, 1, "RbDihedralID2");
-    gpu->psRbDihedralID2                        = psRbDihedralID2;
-    gpu->sim.pRbDihedralID2                     = psRbDihedralID2->_pDevStream[0];
-    CUDAStream<float4>* psRbDihedralParameter1  = new CUDAStream<float4>(rb_dihedrals, 1, "RbDihedralParameter1");
-    gpu->psRbDihedralParameter1                 = psRbDihedralParameter1;
-    gpu->sim.pRbDihedralParameter1              = psRbDihedralParameter1->_pDevStream[0];
-    CUDAStream<float2>* psRbDihedralParameter2  = new CUDAStream<float2>(rb_dihedrals, 1, "RbDihedralParameter2");
-    gpu->psRbDihedralParameter2                 = psRbDihedralParameter2;
-    gpu->sim.pRbDihedralParameter2              = psRbDihedralParameter2->_pDevStream[0];
-
-    for (int i = 0; i < rb_dihedrals; i++)
-    {
-        (*psRbDihedralID1)[i].x = atom1[i];
-        (*psRbDihedralID1)[i].y = atom2[i];
-        (*psRbDihedralID1)[i].z = atom3[i];
-        (*psRbDihedralID1)[i].w = atom4[i];
-        (*psRbDihedralParameter1)[i].x = c0[i];
-        (*psRbDihedralParameter1)[i].y = c1[i];
-        (*psRbDihedralParameter1)[i].z = c2[i];
-        (*psRbDihedralParameter1)[i].w = c3[i];
-        (*psRbDihedralParameter2)[i].x = c4[i];
-        (*psRbDihedralParameter2)[i].y = c5[i];
-        psRbDihedralID2->_pSysData[i].x = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysData[i].x]++;
-        psRbDihedralID2->_pSysData[i].y = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysData[i].y]++;
-        psRbDihedralID2->_pSysData[i].z = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysData[i].z]++;
-        psRbDihedralID2->_pSysData[i].w = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysData[i].w]++;
-#if (DUMP_PARAMETERS == 1)
-        cout << 
-            i << " " << 
-            (*psRbDihedralID1)[i].x << " " <<
-            (*psRbDihedralID1)[i].y << " " <<
-            (*psRbDihedralID1)[i].z << " " <<
-            (*psRbDihedralID1)[i].w <<" " <<
-            (*psRbDihedralID2)[i].x << " " <<
-            (*psRbDihedralID2)[i].y << " " <<
-            (*psRbDihedralID2)[i].z << " " <<
-            (*psRbDihedralID2)[i].w <<" " <<
-            (*psRbDihedralParameter1)[i].x << " " <<
-            (*psRbDihedralParameter1)[i].y << " " <<
-            (*psRbDihedralParameter1)[i].z << " " <<
-            (*psRbDihedralParameter1)[i].w << " " <<
-            (*psRbDihedralParameter2)[i].x << " " <<
-            (*psRbDihedralParameter2)[i].y <<
-            endl;
-#endif
-    }
-    psRbDihedralID1->Upload();
-    psRbDihedralID2->Upload();
-    psRbDihedralParameter1->Upload();
-    psRbDihedralParameter2->Upload();
-}
-
-extern "C"
-void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const vector<int>& atom1, const vector<int>& atom2,
-        const vector<float>& c6, const vector<float>& c12, const vector<float>& q1, const vector<float>& q2)
-{
-    int LJ14s = atom1.size();
-    float scale = epsfac * fudge;
-
-    gpu->sim.LJ14s                              = LJ14s;
-    CUDAStream<int4>* psLJ14ID                  = new CUDAStream<int4>(LJ14s, 1, "LJ14ID");
-    gpu->psLJ14ID                               = psLJ14ID;
-    gpu->sim.pLJ14ID                            = psLJ14ID->_pDevStream[0];
-    CUDAStream<float4>* psLJ14Parameter         = new CUDAStream<float4>(LJ14s, 1, "LJ14Parameter");
-    gpu->psLJ14Parameter                        = psLJ14Parameter;
-    gpu->sim.pLJ14Parameter                     = psLJ14Parameter->_pDevStream[0];
-
-    for (int i = 0; i < LJ14s; i++)
-    {
-        (*psLJ14ID)[i].x = atom1[i];
-        (*psLJ14ID)[i].y = atom2[i];
-        psLJ14ID->_pSysData[i].z = gpu->pOutputBufferCounter[psLJ14ID->_pSysData[i].x]++;
-        psLJ14ID->_pSysData[i].w = gpu->pOutputBufferCounter[psLJ14ID->_pSysData[i].y]++;
-        float p0, p1, p2;
-        if (c12[i] == 0.0f)
-        {
-            p0 = 0.0f;
-            p1 = 1.0f;
-        }
-        else
-        {
-            p0 = c6[i] * c6[i] / c12[i];
-            p1 = pow(c12[i] / c6[i], 1.0f / 6.0f);
-        }
-        p2 = scale * q1[i] * q2[i];
-        (*psLJ14Parameter)[i].x = p0;
-        (*psLJ14Parameter)[i].y = p1;
-        (*psLJ14Parameter)[i].z = p2;
-    }
-#if (DUMP_PARAMETERS == 1)
-        cout << 
-            i << " " <<
-            (*psLJ14ID)[i].x << " " <<
-            (*psLJ14ID)[i].y << " " <<
-            (*psLJ14ID)[i].z << " " <<
-            (*psLJ14ID)[i].w << " " <<
-            (*psLJ14Parameter)[i].x << " " <<
-            (*psLJ14Parameter)[i].y << " " <<
-            (*psLJ14Parameter)[i].z << " " <<
-            p0 << " " << 
-            p1 << " " << 
-            p2 << " " << 
-            endl;
-#endif
-    psLJ14ID->Upload();
-    psLJ14Parameter->Upload();
-}
-
-extern "C" void OPENMMCUDA_EXPORT setExclusions(gpuContext gpu, const vector<vector<int> >& exclusions) {
-    if (gpu->exclusions.size() > 0) {
-        bool ok = (exclusions.size() == gpu->exclusions.size());
-        for (int i = 0; i < (int) exclusions.size() && ok; i++) {
-            if (exclusions[i].size() != gpu->exclusions[i].size())
-                ok = false;
-            else {
-                for (int j = 0; j < (int) exclusions[i].size(); j++)
-                    if (find(gpu->exclusions[i].begin(), gpu->exclusions[i].end(), exclusions[i][j]) == gpu->exclusions[i].end())
-                        ok = false;
-            }
-        }
-        if (!ok)
-            throw OpenMMException("All nonbonded forces must have identical sets of exceptions");
-    }
-    gpu->exclusions = exclusions;
-}
-
-extern "C"
-void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& atom, const vector<float>& c6, const vector<float>& c12, const vector<float>& q,
-        const vector<char>& symbol, const vector<vector<int> >& exclusions, CudaNonbondedMethod method)
-{
-    unsigned int coulombs = c6.size();
-    gpu->sim.epsfac = epsfac;
-    gpu->sim.nonbondedMethod = method;
-    if (coulombs > 0)
-        setExclusions(gpu, exclusions);
-    
-    for (unsigned int i = 0; i < coulombs; i++)
-    {
-            float p0 = q[i];
-            float p1 = 0.5f, p2 = 0.0f;               
-            if ((c6[i] > 0.0f) && (c12[i] > 0.0f))
-            {
-                p1 = 0.5f * pow(c12[i] / c6[i], 1.0f / 6.0f);
-                p2 = c6[i] * sqrt(1.0f / c12[i]);
-            }
-            if (symbol.size() > 0)
-                gpu->pAtomSymbol[i] = symbol[i];
-            (*gpu->psPosq4)[i].w = p0;
-            (*gpu->psSigEps2)[i].x = p1;
-            (*gpu->psSigEps2)[i].y = p2;
-    }
-
-    // Dummy out extra atom data
-
-    for (unsigned int i = gpu->natoms; i < gpu->sim.paddedNumberOfAtoms; i++)
-    {
-        (*gpu->psPosq4)[i].x       = 100000.0f + i * 10.0f;
-        (*gpu->psPosq4)[i].y       = 100000.0f + i * 10.0f;
-        (*gpu->psPosq4)[i].z       = 100000.0f + i * 10.0f;
-        (*gpu->psPosq4)[i].w       = 0.0f;
-        (*gpu->psSigEps2)[i].x     = 0.0f;
-        (*gpu->psSigEps2)[i].y     = 0.0f;
-    }
-    gpu->psPosq4->Upload();
-    gpu->psSigEps2->Upload();
-}
-
-extern "C"
-void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric)
-{
-    if (gpu->sim.nonbondedCutoff != 0.0f && gpu->sim.nonbondedCutoff != cutoffDistance)
-        throw OpenMMException("All nonbonded forces must use the same cutoff");
-    gpu->sim.nonbondedCutoff = cutoffDistance;
-    gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance;
-    gpu->sim.reactionFieldK = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/(2.0f*solventDielectric+1.0f);
-    gpu->sim.reactionFieldC = (1.0f / cutoffDistance)*(3.0f*solventDielectric)/(2.0f*solventDielectric+1.0f);
-}
-
-extern "C"
-void gpuSetTabulatedFunction(gpuContext gpu, int index, const string& name, const vector<double>& values, double min, double max)
-{
-    if (index < 0 || index >= MAX_TABULATED_FUNCTIONS) {
-        stringstream str;
-        str << "Only " << MAX_TABULATED_FUNCTIONS << " tabulated functions are supported";
-        throw OpenMMException(str.str());
-    }
-    if (gpu->tabulatedFunctions[index].coefficients != NULL)
-        delete gpu->tabulatedFunctions[index].coefficients;
-    CUDAStream<float4>* coeff = new CUDAStream<float4>((int) values.size()-1, 1, "TabulatedFunction");
-    gpu->tabulatedFunctions[index].coefficients = coeff;
-    gpu->sim.pTabulatedFunctionCoefficients[index] = coeff->_pDevData;
-    gpu->tabulatedFunctions[index].name = name;
-    gpu->tabulatedFunctions[index].min = min;
-    gpu->tabulatedFunctions[index].max = max;
-    gpu->tabulatedFunctionsChanged = true;
-
-    // Compute the spline coefficients.
-
-    int numValues = values.size();
-    vector<double> x(numValues), derivs;
-    for (int i = 0; i < numValues; i++)
-        x[i] = min+i*(max-min)/(numValues-1);
-    OpenMM::SplineFitter::createNaturalSpline(x, values, derivs);
-    for (int i = 0; i < (int) values.size()-1; i++)
-        (*coeff)[i] = make_float4((float) values[i], (float) values[i+1], (float) (derivs[i]/6.0), (float) (derivs[i+1]/6.0));
-    coeff->Upload();
-}
-
-extern "C"
-void gpuSetCustomBondParameters(gpuContext gpu, const vector<int>& bondAtom1, const vector<int>& bondAtom2, const vector<vector<double> >& bondParams,
-            const string& energyExp, const vector<string>& paramNames, const vector<string>& globalParamNames)
-{
-    if (paramNames.size() > 4)
-        throw OpenMMException("CudaPlatform only supports four per-bond parameters for custom bond forces");
-    if (globalParamNames.size() > 8)
-        throw OpenMMException("CudaPlatform only supports eight global parameters for custom bond forces");
-    if (gpu->psCustomBondID != NULL)
-        throw OpenMMException("CudaPlatform only supports a single CustomBondForce per System");
-    gpu->sim.customBonds = bondAtom1.size();
-    gpu->sim.customBondParameters = paramNames.size();
-    gpu->psCustomBondID = new CUDAStream<int4>(gpu->sim.customBonds, 1, "CustomBondId");
-    gpu->sim.pCustomBondID = gpu->psCustomBondID->_pDevData;
-    gpu->psCustomBondParams = new CUDAStream<float4>(gpu->sim.customBonds, 1, "CustomBondParams");
-    gpu->sim.pCustomBondParams = gpu->psCustomBondParams->_pDevData;
-    vector<int> forceBufferCounter(gpu->natoms, 0);
-    for (int i = 0; i < (int) bondAtom1.size(); i++) {
-        (*gpu->psCustomBondID)[i].x = bondAtom1[i];
-        (*gpu->psCustomBondID)[i].y = bondAtom2[i];
-        (*gpu->psCustomBondID)[i].z = forceBufferCounter[bondAtom1[i]]++;
-        (*gpu->psCustomBondID)[i].w = forceBufferCounter[bondAtom2[i]]++;
-        if (bondParams[i].size() > 0)
-            (*gpu->psCustomBondParams)[i].x = (float) bondParams[i][0];
-        if (bondParams[i].size() > 1)
-            (*gpu->psCustomBondParams)[i].y = (float) bondParams[i][1];
-        if (bondParams[i].size() > 2)
-            (*gpu->psCustomBondParams)[i].z = (float) bondParams[i][2];
-        if (bondParams[i].size() > 3)
-            (*gpu->psCustomBondParams)[i].w = (float) bondParams[i][3];
-    }
-    gpu->psCustomBondID->Upload();
-    gpu->psCustomBondParams->Upload();
-    for (int i = 0; i < (int) forceBufferCounter.size(); i++)
-        if (forceBufferCounter[i] > (int) gpu->pOutputBufferCounter[i])
-            gpu->pOutputBufferCounter[i] = forceBufferCounter[i];
-
-    // Create the Expressions.
-
-    vector<string> variables;
-    variables.push_back("r");
-    for (int i = 0; i < (int) paramNames.size(); i++)
-        variables.push_back(paramNames[i]);
-    SetCustomBondEnergyExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
-    SetCustomBondForceExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("r").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
-}
-
-extern "C"
-void gpuSetCustomAngleParameters(gpuContext gpu, const vector<int>& angleAtom1, const vector<int>& angleAtom2, const vector<int>& angleAtom3, const vector<vector<double> >& angleParams,
-            const string& energyExp, const vector<string>& paramNames, const vector<string>& globalParamNames)
-{
-    if (paramNames.size() > 4)
-        throw OpenMMException("CudaPlatform only supports four per-angle parameters for custom angle forces");
-    if (globalParamNames.size() > 8)
-        throw OpenMMException("CudaPlatform only supports eight global parameters for custom angle forces");
-    if (gpu->psCustomAngleID1 != NULL)
-        throw OpenMMException("CudaPlatform only supports a single CustomAngleForce per System");
-    gpu->sim.customAngles = angleAtom1.size();
-    gpu->sim.customAngleParameters = paramNames.size();
-    gpu->psCustomAngleID1 = new CUDAStream<int4>(gpu->sim.customAngles, 1, "CustomAngleId1");
-    gpu->sim.pCustomAngleID1 = gpu->psCustomAngleID1->_pDevData;
-    gpu->psCustomAngleID2 = new CUDAStream<int2>(gpu->sim.customAngles, 1, "CustomAngleId2");
-    gpu->sim.pCustomAngleID2 = gpu->psCustomAngleID2->_pDevData;
-    gpu->psCustomAngleParams = new CUDAStream<float4>(gpu->sim.customAngles, 1, "CustomAngleParams");
-    gpu->sim.pCustomAngleParams = gpu->psCustomAngleParams->_pDevData;
-    vector<int> forceBufferCounter(gpu->natoms, 0);
-    for (int i = 0; i < (int) angleAtom1.size(); i++) {
-        (*gpu->psCustomAngleID1)[i].x = angleAtom1[i];
-        (*gpu->psCustomAngleID1)[i].y = angleAtom2[i];
-        (*gpu->psCustomAngleID1)[i].z = angleAtom3[i];
-        (*gpu->psCustomAngleID1)[i].w = forceBufferCounter[angleAtom1[i]]++;
-        (*gpu->psCustomAngleID2)[i].x = forceBufferCounter[angleAtom2[i]]++;
-        (*gpu->psCustomAngleID2)[i].y = forceBufferCounter[angleAtom3[i]]++;
-        if (angleParams[i].size() > 0)
-            (*gpu->psCustomAngleParams)[i].x = (float) angleParams[i][0];
-        if (angleParams[i].size() > 1)
-            (*gpu->psCustomAngleParams)[i].y = (float) angleParams[i][1];
-        if (angleParams[i].size() > 2)
-            (*gpu->psCustomAngleParams)[i].z = (float) angleParams[i][2];
-        if (angleParams[i].size() > 3)
-            (*gpu->psCustomAngleParams)[i].w = (float) angleParams[i][3];
-    }
-    gpu->psCustomAngleID1->Upload();
-    gpu->psCustomAngleID2->Upload();
-    gpu->psCustomAngleParams->Upload();
-    for (int i = 0; i < (int) forceBufferCounter.size(); i++)
-        if (forceBufferCounter[i] > (int) gpu->pOutputBufferCounter[i])
-            gpu->pOutputBufferCounter[i] = forceBufferCounter[i];
-
-    // Create the Expressions.
-
-    vector<string> variables;
-    variables.push_back("theta");
-    for (int i = 0; i < (int) paramNames.size(); i++)
-        variables.push_back(paramNames[i]);
-    SetCustomAngleEnergyExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
-    SetCustomAngleForceExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("theta").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
-}
-
-extern "C"
-void gpuSetCustomTorsionParameters(gpuContext gpu, const vector<int>& torsionAtom1, const vector<int>& torsionAtom2, const vector<int>& torsionAtom3, const vector<int>& torsionAtom4, const vector<vector<double> >& torsionParams,
-            const string& energyExp, const vector<string>& paramNames, const vector<string>& globalParamNames)
-{
-    if (paramNames.size() > 4)
-        throw OpenMMException("CudaPlatform only supports four per-torsion parameters for custom torsion forces");
-    if (globalParamNames.size() > 8)
-        throw OpenMMException("CudaPlatform only supports eight global parameters for custom torsion forces");
-    if (gpu->psCustomTorsionID1 != NULL)
-        throw OpenMMException("CudaPlatform only supports a single CustomTorsionForce per System");
-    gpu->sim.customTorsions = torsionAtom1.size();
-    gpu->sim.customTorsionParameters = paramNames.size();
-    gpu->psCustomTorsionID1 = new CUDAStream<int4>(gpu->sim.customTorsions, 1, "CustomTorsionId1");
-    gpu->sim.pCustomTorsionID1 = gpu->psCustomTorsionID1->_pDevData;
-    gpu->psCustomTorsionID2 = new CUDAStream<int4>(gpu->sim.customTorsions, 1, "CustomTorsionId2");
-    gpu->sim.pCustomTorsionID2 = gpu->psCustomTorsionID2->_pDevData;
-    gpu->psCustomTorsionParams = new CUDAStream<float4>(gpu->sim.customTorsions, 1, "CustomTorsionParams");
-    gpu->sim.pCustomTorsionParams = gpu->psCustomTorsionParams->_pDevData;
-    vector<int> forceBufferCounter(gpu->natoms, 0);
-    for (int i = 0; i < (int) torsionAtom1.size(); i++) {
-        (*gpu->psCustomTorsionID1)[i].x = torsionAtom1[i];
-        (*gpu->psCustomTorsionID1)[i].y = torsionAtom2[i];
-        (*gpu->psCustomTorsionID1)[i].z = torsionAtom3[i];
-        (*gpu->psCustomTorsionID1)[i].w = torsionAtom4[i];
-        (*gpu->psCustomTorsionID2)[i].x = forceBufferCounter[torsionAtom1[i]]++;
-        (*gpu->psCustomTorsionID2)[i].y = forceBufferCounter[torsionAtom2[i]]++;
-        (*gpu->psCustomTorsionID2)[i].z = forceBufferCounter[torsionAtom3[i]]++;
-        (*gpu->psCustomTorsionID2)[i].w = forceBufferCounter[torsionAtom4[i]]++;
-        if (torsionParams[i].size() > 0)
-            (*gpu->psCustomTorsionParams)[i].x = (float) torsionParams[i][0];
-        if (torsionParams[i].size() > 1)
-            (*gpu->psCustomTorsionParams)[i].y = (float) torsionParams[i][1];
-        if (torsionParams[i].size() > 2)
-            (*gpu->psCustomTorsionParams)[i].z = (float) torsionParams[i][2];
-        if (torsionParams[i].size() > 3)
-            (*gpu->psCustomTorsionParams)[i].w = (float) torsionParams[i][3];
-    }
-    gpu->psCustomTorsionID1->Upload();
-    gpu->psCustomTorsionID2->Upload();
-    gpu->psCustomTorsionParams->Upload();
-    for (int i = 0; i < (int) forceBufferCounter.size(); i++)
-        if (forceBufferCounter[i] > (int) gpu->pOutputBufferCounter[i])
-            gpu->pOutputBufferCounter[i] = forceBufferCounter[i];
-
-    // Create the Expressions.
-
-    vector<string> variables;
-    variables.push_back("theta");
-    for (int i = 0; i < (int) paramNames.size(); i++)
-        variables.push_back(paramNames[i]);
-    SetCustomTorsionEnergyExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
-    SetCustomTorsionForceExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("theta").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
-}
-
-extern "C"
-void gpuSetCustomExternalParameters(gpuContext gpu, const vector<int>& atomIndex, const vector<vector<double> >& atomParams,
-            const string& energyExp, const vector<string>& paramNames, const vector<string>& globalParamNames)
-{
-    if (paramNames.size() > 4)
-        throw OpenMMException("CudaPlatform only supports four per-particle parameters for custom external forces");
-    if (globalParamNames.size() > 8)
-        throw OpenMMException("CudaPlatform only supports eight global parameters for custom external forces");
-    if (gpu->psCustomExternalID != NULL)
-        throw OpenMMException("CudaPlatform only supports a single CustomExternalForce per System");
-    gpu->sim.customExternals = atomIndex.size();
-    gpu->sim.customExternalParameters = paramNames.size();
-    gpu->psCustomExternalID = new CUDAStream<int>(gpu->sim.customExternals, 1, "CustomExternalId");
-    gpu->sim.pCustomExternalID = gpu->psCustomExternalID->_pDevData;
-    gpu->psCustomExternalParams = new CUDAStream<float4>(gpu->sim.customExternals, 1, "CustomExternalParams");
-    gpu->sim.pCustomExternalParams = gpu->psCustomExternalParams->_pDevData;
-    for (int i = 0; i < (int) atomIndex.size(); i++) {
-        (*gpu->psCustomExternalID)[i] = atomIndex[i];
-        if (atomParams[i].size() > 0)
-            (*gpu->psCustomExternalParams)[i].x = (float) atomParams[i][0];
-        if (atomParams[i].size() > 1)
-            (*gpu->psCustomExternalParams)[i].y = (float) atomParams[i][1];
-        if (atomParams[i].size() > 2)
-            (*gpu->psCustomExternalParams)[i].z = (float) atomParams[i][2];
-        if (atomParams[i].size() > 3)
-            (*gpu->psCustomExternalParams)[i].w = (float) atomParams[i][3];
-    }
-    gpu->psCustomExternalID->Upload();
-    gpu->psCustomExternalParams->Upload();
-
-    // Create the Expressions.
-
-    vector<string> variables;
-    variables.push_back("x");
-    variables.push_back("y");
-    variables.push_back("z");
-    for (int i = 0; i < (int) paramNames.size(); i++)
-        variables.push_back(paramNames[i]);
-    SetCustomExternalEnergyExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
-    SetCustomExternalForceExpressions(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("x").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize),
-                                  createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("y").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize),
-                                  createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("z").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
-}
-
-extern "C"
-void gpuSetCustomNonbondedParameters(gpuContext gpu, const vector<vector<double> >& parameters, const vector<vector<int> >& exclusions,
-            CudaNonbondedMethod method, float cutoffDistance, const string& energyExp,
-            const vector<string>& paramNames, const vector<string>& globalParamNames)
-{
-    if (gpu->sim.nonbondedCutoff != 0.0f && gpu->sim.nonbondedCutoff != cutoffDistance)
-        throw OpenMMException("All nonbonded forces must use the same cutoff");
-    if (paramNames.size() > 4)
-        throw OpenMMException("CudaPlatform only supports four per-atom parameters for custom nonbonded forces");
-    if (globalParamNames.size() > 8)
-        throw OpenMMException("CudaPlatform only supports eight global parameters for custom nonbonded forces");
-    gpu->sim.nonbondedCutoff = cutoffDistance;
-    gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance;
-    gpu->sim.customNonbondedMethod = method;
-    gpu->sim.customParameters = paramNames.size();
-    setExclusions(gpu, exclusions);
-    gpu->psCustomParams = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "CustomParams");
-    gpu->sim.pCustomParams = gpu->psCustomParams->_pDevData;
-    for (int i = 0; i < (int) parameters.size(); i++) {
-        if (parameters[i].size() > 0)
-            (*gpu->psCustomParams)[i].x = (float) parameters[i][0];
-        if (parameters[i].size() > 1)
-            (*gpu->psCustomParams)[i].y = (float) parameters[i][1];
-        if (parameters[i].size() > 2)
-            (*gpu->psCustomParams)[i].z = (float) parameters[i][2];
-        if (parameters[i].size() > 3)
-            (*gpu->psCustomParams)[i].w = (float) parameters[i][3];
-    }
-    gpu->psCustomParams->Upload();
-
-    // This class serves as a placeholder for custom functions in expressions.
-
-    class FunctionPlaceholder : public Lepton::CustomFunction {
-    public:
-        int getNumArguments() const {
-            return 1;
-        }
-        double evaluate(const double* arguments) const {
-            return 0.0;
-        }
-        double evaluateDerivative(const double* arguments, const int* derivOrder) const {
-            return 0.0;
-        }
-        CustomFunction* clone() const {
-            return new FunctionPlaceholder();
-        }
-    };
-
-    // Record the tabulated functions, which were previously set with calls to gpuSetTabulatedFunction().
-
-    FunctionPlaceholder* fp = new FunctionPlaceholder();
-    map<string, Lepton::CustomFunction*> functions;
-    gpu->psTabulatedFunctionParams = new CUDAStream<float4>(MAX_TABULATED_FUNCTIONS, 1, "TabulatedFunctionRange");
-    gpu->sim.pTabulatedFunctionParams = gpu->psTabulatedFunctionParams->_pDevData;
-    for (int i = 0; i < MAX_TABULATED_FUNCTIONS; i++) {
-        gpuTabulatedFunction& func = gpu->tabulatedFunctions[i];
-        if (func.coefficients != NULL) {
-            (*gpu->psTabulatedFunctionParams)[i] = make_float4((float) func.min, (float) func.max, (float) (func.coefficients->_length/(func.max-func.min)), (float) (func.coefficients->_length-1));
-            functions[func.name] = fp;
-        }
-    }
-    gpu->psTabulatedFunctionParams->Upload();
-
-    // Create the Expressions.
-
-    vector<string> variables;
-    for (int j = 1; j < 3; j++) {
-        for (int i = 0; i < (int) paramNames.size(); i++) {
-            stringstream name;
-            name << paramNames[i] << j;
-            variables.push_back(name.str());
-        }
-        for (int i = paramNames.size(); i < 4; i++)
-            variables.push_back("");
-    }
-    variables.push_back("r");
-    SetCustomNonbondedEnergyExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp, functions).optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
-    SetCustomNonbondedForceExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp, functions).differentiate("r").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
-    delete fp;
-}
-
-static void tabulateErfc(gpuContext gpu)
-{
-    int tableSize = 2048;
-    gpu->sim.tabulatedErfcSize = tableSize;
-    gpu->sim.tabulatedErfcScale = tableSize/(gpu->sim.alphaEwald*gpu->sim.nonbondedCutoff);
-    gpu->psTabulatedErfc = new CUDAStream<float>(tableSize, 1, "TabulatedErfc");
-    gpu->sim.pTabulatedErfc = gpu->psTabulatedErfc->_pDevData;
-    for (int i = 0; i < tableSize; ++i)
-        (*gpu->psTabulatedErfc)[i] = (float) erfc(i*(gpu->sim.alphaEwald*gpu->sim.nonbondedCutoff)/tableSize);
-    gpu->psTabulatedErfc->Upload();
-}
-
-extern "C"
-void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz)
-{
-    gpu->sim.alphaEwald         = alpha;
-    gpu->sim.factorEwald        = -1 / (4*alpha*alpha);
-    gpu->sim.kmaxX              = kmaxx;
-    gpu->sim.kmaxY              = kmaxy;
-    gpu->sim.kmaxZ              = kmaxz;
-    gpu->psEwaldCosSinSum       = new CUDAStream<float2>((gpu->sim.kmaxX*2-1) * (gpu->sim.kmaxY*2-1) * (gpu->sim.kmaxZ*2-1), 1, "EwaldCosSinSum");
-    gpu->sim.pEwaldCosSinSum    = gpu->psEwaldCosSinSum->_pDevStream[0];
-    tabulateErfc(gpu);
-}
-
-extern "C"
-void gpuSetPMEParameters(gpuContext gpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ)
-{
-    gpu->sim.alphaEwald         = alpha;
-    int3 gridSize = make_int3(gridSizeX, gridSizeY, gridSizeZ);
-    gpu->sim.pmeGridSize = gridSize;
-    int3 groupSize = make_int3(2, 4, 4);
-    gpu->sim.pmeGroupSize = groupSize;
-    const int3 numGroups = make_int3((gridSize.x+groupSize.x-1)/groupSize.x, (gridSize.y+groupSize.y-1)/groupSize.y, (gridSize.z+groupSize.z-1)/groupSize.z);
-    const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
-    cufftPlan3d(&gpu->fftplan, gridSize.x, gridSize.y, gridSize.z, CUFFT_C2C);
-    gpu->psPmeGrid = new CUDAStream<cufftComplex>(gridSize.x*gridSize.y*gridSize.z, 1, "PmeGrid");
-    gpu->sim.pPmeGrid = gpu->psPmeGrid->_pDevData;
-    gpu->psPmeBsplineModuli[0] = new CUDAStream<float>(gridSize.x, 1, "PmeBsplineModuli0");
-    gpu->sim.pPmeBsplineModuli[0] = gpu->psPmeBsplineModuli[0]->_pDevData;
-    gpu->psPmeBsplineModuli[1] = new CUDAStream<float>(gridSize.y, 1, "PmeBsplineModuli1");
-    gpu->sim.pPmeBsplineModuli[1] = gpu->psPmeBsplineModuli[1]->_pDevData;
-    gpu->psPmeBsplineModuli[2] = new CUDAStream<float>(gridSize.z, 1, "PmeBsplineModuli2");
-    gpu->sim.pPmeBsplineModuli[2] = gpu->psPmeBsplineModuli[2]->_pDevData;
-    gpu->psPmeBsplineTheta = new CUDAStream<float4>(PME_ORDER*gpu->natoms, 1, "PmeBsplineTheta");
-    gpu->sim.pPmeBsplineTheta = gpu->psPmeBsplineTheta->_pDevData;
-    gpu->psPmeBsplineDtheta = new CUDAStream<float4>(PME_ORDER*gpu->natoms, 1, "PmeBsplineDtheta");
-    gpu->sim.pPmeBsplineDtheta = gpu->psPmeBsplineDtheta->_pDevData;
-    gpu->psPmeAtomRange = new CUDAStream<int>(gridSize.x*gridSize.y*gridSize.z+1, 1, "PmeAtomRange");
-    gpu->sim.pPmeAtomRange = gpu->psPmeAtomRange->_pDevData;
-    gpu->psPmeAtomGridIndex = new CUDAStream<int2>(gpu->natoms, 1, "PmeAtomGridIndex");
-    gpu->sim.pPmeAtomGridIndex = gpu->psPmeAtomGridIndex->_pDevData;
-    tabulateErfc(gpu);
-
-    // Initialize the b-spline moduli.
-
-    int maxSize = max(max(gridSize.x, gridSize.y), gridSize.z);
-    vector<double> data(PME_ORDER);
-    vector<double> ddata(PME_ORDER);
-    vector<double> bsplines_data(maxSize);
-    data[PME_ORDER-1] = 0.0;
-    data[1] = 0.0;
-    data[0] = 1.0;
-    for (int i = 3; i < PME_ORDER; i++)
-    {
-        double div = 1.0/(i-1.0);
-        data[i-1] = 0.0;
-        for (int j = 1; j < (i-1); j++)
-            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
-        data[0] = div*data[0];
-    }
-
-    // Differentiate.
-
-    ddata[0] = -data[0];
-    for (int i = 1; i < PME_ORDER; i++)
-        ddata[i] = data[i-1]-data[i];
-    double div = 1.0/(PME_ORDER-1);
-    data[PME_ORDER-1] = 0.0;
-    for (int i = 1; i < (PME_ORDER-1); i++)
-        data[PME_ORDER-i-1] = div*(i*data[PME_ORDER-i-2]+(PME_ORDER-i)*data[PME_ORDER-i-1]);
-    data[0] = div*data[0];
-    for (int i = 0; i < maxSize; i++)
-        bsplines_data[i] = 0.0;
-    for (int i = 1; i <= PME_ORDER; i++)
-        bsplines_data[i] = data[i-1];
-
-    // Evaluate the actual bspline moduli for X/Y/Z.
-
-    for(int dim = 0; dim < 3; dim++)
-    {
-        int ndata = (dim == 0 ? gridSize.x : dim == 1 ? gridSize.y : gridSize.z);
-        for (int i = 0; i < ndata; i++)
-        {
-            double sc = 0.0;
-            double ss = 0.0;
-            for (int j = 0; j < ndata; j++)
-            {
-                double arg = (2.0*M_PI*i*j)/ndata;
-                sc += bsplines_data[j]*cos(arg);
-                ss += bsplines_data[j]*sin(arg);
-            }
-            (*gpu->psPmeBsplineModuli[dim])[i] = (float) (sc*sc+ss*ss);
-        }
-        for (int i = 0; i < ndata; i++)
-        {
-            if ((*gpu->psPmeBsplineModuli[dim])[i] < 1.0e-7)
-                (*gpu->psPmeBsplineModuli[dim])[i] = ((*gpu->psPmeBsplineModuli[dim])[i-1]+(*gpu->psPmeBsplineModuli[dim])[i+1])*0.5f;
-        }
-        gpu->psPmeBsplineModuli[dim]->Upload();
-    }
-}
-
-extern "C"
-void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize)
-{
-    gpu->sim.periodicBoxSizeX = xsize;
-    gpu->sim.periodicBoxSizeY = ysize;
-    gpu->sim.periodicBoxSizeZ = zsize;
-    gpu->sim.invPeriodicBoxSizeX = 1.0f/xsize;
-    gpu->sim.invPeriodicBoxSizeY = 1.0f/ysize;
-    gpu->sim.invPeriodicBoxSizeZ = 1.0f/zsize;
-    gpu->sim.recipBoxSizeX = 2.0f*PI/gpu->sim.periodicBoxSizeX;
-    gpu->sim.recipBoxSizeY = 2.0f*PI/gpu->sim.periodicBoxSizeY;
-    gpu->sim.recipBoxSizeZ = 2.0f*PI/gpu->sim.periodicBoxSizeZ;
-    gpu->sim.cellVolume = gpu->sim.periodicBoxSizeX*gpu->sim.periodicBoxSizeY*gpu->sim.periodicBoxSizeZ;
-}
-
-extern "C"
-void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const vector<float>& radius, const vector<float>& scale, const vector<float>& charge)
-{
-    unsigned int atoms = radius.size();
-
-    gpu->bIncludeGBSA = true;
-    for (unsigned int i = 0; i < atoms; i++)
-    {
-            (*gpu->psObcData)[i].x = radius[i] - dielectricOffset;
-            (*gpu->psObcData)[i].y = scale[i] * (*gpu->psObcData)[i].x;
-            (*gpu->psPosq4)[i].w = charge[i];
-
-#if (DUMP_PARAMETERS == 1)
-        cout << 
-            i << " " << 
-            (*gpu->psObcData)[i].x << " " <<
-            (*gpu->psObcData)[i].y;
-#endif
-    }
-
-    // Dummy out extra atom data
-    for (unsigned int i = atoms; i < gpu->sim.paddedNumberOfAtoms; i++)
-    {
-        (*gpu->psBornRadii)[i]     = 0.2f;
-        (*gpu->psObcData)[i].x     = 0.01f;
-        (*gpu->psObcData)[i].y     = 0.01f;
-    }
-
-    gpu->psBornRadii->Upload();
-    gpu->psObcData->Upload();
-    gpu->psPosq4->Upload();
-    gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
-}
-
-extern "C"
-void gpuSetGBVIParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const vector<int>& atom, const vector<float>& radius, 
-                          const vector<float>& gamma, const vector<float>& scaledRadii, int bornRadiusScalingMethod, float quinticLowerLimitFactor,
-                          float quinticUpperBornRadiusLimit )
-{
-    unsigned int atoms = atom.size();
-    gpu->bIncludeGBVI  = true;
-    double tau         = ((1.0f/innerDielectric)-(1.0f/solventDielectric)); 
-
-    gpu->sim.gbviQuinticLowerLimitFactor     = quinticLowerLimitFactor;
-    gpu->sim.gbviQuinticUpperBornRadiusLimit = quinticUpperBornRadiusLimit;
-    gpu->sim.gbviBornRadiusScalingMethod     = bornRadiusScalingMethod; 
-
-    for (unsigned int i = 0; i < atoms; i++)
-    {
-            (*gpu->psGBVIData)[i].x            = radius[i];
-            (*gpu->psGBVIData)[i].y            = scaledRadii[i];
-            (*gpu->psGBVIData)[i].z            = (float) (tau*gamma[i]);
-            (*gpu->psGBVIData)[i].w            = 1.0f;
-            (*gpu->psGBVISwitchDerivative)[i]  = 1.0f;
-
-#define DUMP_PARAMETERS 0
-#if (DUMP_PARAMETERS == 1)
-        (void) fprintf( stderr,"GBVI param: %5u R=%15.7e scaledR=%15.7e R-S=%15.7e gamma*tau=%15.7e bornRadiusScaleFactor=%15.7e\n",
-                        i, (*gpu->psGBVIData)[i].x, (*gpu->psGBVIData)[i].y,  (*gpu->psGBVIData)[i].x - (*gpu->psGBVIData)[i].y,
-                        (*gpu->psGBVIData)[i].z, (*gpu->psGBVIData)[i].w ); 
-#endif
-#undef DUMP_PARAMETERS
-    }
-    // Dummy out extra atom data
-    for (unsigned int i = atoms; i < gpu->sim.paddedNumberOfAtoms; i++)
-    {
-        (*gpu->psBornRadii)[i]              = 0.2f;
-        (*gpu->psGBVIData)[i].x             = 0.01f;
-        (*gpu->psGBVIData)[i].y             = 0.01f;
-        (*gpu->psGBVIData)[i].z             = 0.01f;
-        (*gpu->psGBVIData)[i].w             = 1.00f;
-        (*gpu->psGBVISwitchDerivative)[i]   = 1.0f;
-    }
-
-    gpu->psBornRadii->Upload();
-    gpu->psGBVIData->Upload();
-    gpu->psGBVISwitchDerivative->Upload();
-gpu->psObcData->Upload();
-    gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
-
-#if (DUMP_PARAMETERS == 1)
-(void) fprintf( stderr, "gpuSetGBVIParameters: preFactor=%14.6e elecCnstnt=%.4f frcCnvrsnFctr=%.4f tau=%.4f.\n",
-                gpu->sim.preFactor, 2.0f*electricConstant, gpu->sim.forceConversionFactor, ((1.0f/innerDielectric)-(1.0f/solventDielectric)) );
-#endif
-}
-
-static void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster>& allClusters, vector<bool>& invalidForShake)
-{
-    cluster.valid = false;
-    invalidForShake[cluster.centralID] = true;
-    for (int i = 0; i < cluster.size; i++) {
-        invalidForShake[cluster.peripheralID[i]] = true;
-        map<int, ShakeCluster>::iterator otherCluster = allClusters.find(cluster.peripheralID[i]);
-        if (otherCluster != allClusters.end() && otherCluster->second.valid)
-            markShakeClusterInvalid(otherCluster->second, allClusters, invalidForShake);
-    }
-}
-
-extern "C"
-void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& distance,
-        const vector<float>& invMass1, const vector<float>& invMass2, float constraintTolerance)
-{
-    // Create a vector for recording which atoms are handled by SHAKE (or SETTLE).
-
-    vector<bool> isShakeAtom(gpu->natoms, false);
-
-    // Find how many constraints each atom is involved in.
-    
-    vector<int> constraintCount(gpu->natoms, 0);
-    for (int i = 0; i < (int)atom1.size(); i++) {
-        constraintCount[atom1[i]]++;
-        constraintCount[atom2[i]]++;
-    }
-
-    // Identify clusters of three atoms that can be treated with SETTLE.  First, for every
-    // atom that might be part of such a cluster, make a list of the two other atoms it is
-    // connected to.
-
-    vector<map<int, float> > settleConstraints(gpu->natoms);
-    for (int i = 0; i < (int)atom1.size(); i++) {
-        if (constraintCount[atom1[i]] == 2 && constraintCount[atom2[i]] == 2) {
-            settleConstraints[atom1[i]][atom2[i]] = distance[i];
-            settleConstraints[atom2[i]][atom1[i]] = distance[i];
-        }
-    }
-
-    // Now remove the ones that don't actually form closed loops of three atoms.
-
-    vector<int> settleClusters;
-    for (int i = 0; i < (int)settleConstraints.size(); i++) {
-        if (settleConstraints[i].size() == 2) {
-            int partner1 = settleConstraints[i].begin()->first;
-            int partner2 = (++settleConstraints[i].begin())->first;
-            if (settleConstraints[partner1].size() != 2 || settleConstraints[partner2].size() != 2 ||
-                    settleConstraints[partner1].find(partner2) == settleConstraints[partner1].end())
-                settleConstraints[i].clear();
-            else if (i < partner1 && i < partner2)
-                settleClusters.push_back(i);
-        }
-        else
-            settleConstraints[i].clear();
-    }
-
-    // Record the actual SETTLE clusters.
-
-    CUDAStream<int4>* psSettleID          = new CUDAStream<int4>((int) settleClusters.size(), 1, "SettleID");
-    gpu->psSettleID                       = psSettleID;
-    gpu->sim.pSettleID                    = psSettleID->_pDevStream[0];
-    CUDAStream<float2>* psSettleParameter = new CUDAStream<float2>((int) settleClusters.size(), 1, "SettleParameter");
-    gpu->psSettleParameter                = psSettleParameter;
-    gpu->sim.pSettleParameter             = psSettleParameter->_pDevStream[0];
-    gpu->sim.settleConstraints            = settleClusters.size();
-      for (int i = 0; i < (int)settleClusters.size(); i++) {
-        int atom1 = settleClusters[i];
-        int atom2 = settleConstraints[atom1].begin()->first;
-        int atom3 = (++settleConstraints[atom1].begin())->first;
-        float dist12 = settleConstraints[atom1].find(atom2)->second;
-        float dist13 = settleConstraints[atom1].find(atom3)->second;
-        float dist23 = settleConstraints[atom2].find(atom3)->second;
-        if (dist12 == dist13) { // atom1 is the central atom
-            (*psSettleID)[i].x = atom1;
-            (*psSettleID)[i].y = atom2;
-            (*psSettleID)[i].z = atom3;
-            (*psSettleParameter)[i].x = dist12;
-            (*psSettleParameter)[i].y = dist23;
-        }
-        else if (dist12 == dist23) { // atom2 is the central atom
-            (*psSettleID)[i].x = atom2;
-            (*psSettleID)[i].y = atom1;
-            (*psSettleID)[i].z = atom3;
-            (*psSettleParameter)[i].x = dist12;
-            (*psSettleParameter)[i].y = dist13;
-        }
-        else if (dist13 == dist23) { // atom3 is the central atom
-            (*psSettleID)[i].x = atom3;
-            (*psSettleID)[i].y = atom1;
-            (*psSettleID)[i].z = atom2;
-            (*psSettleParameter)[i].x = dist13;
-            (*psSettleParameter)[i].y = dist12;
-        }
-        else
-            throw OpenMMException("Two of the three distances constrained with SETTLE must be the same.");
-        isShakeAtom[atom1] = true;
-        isShakeAtom[atom2] = true;
-        isShakeAtom[atom3] = true;
-    }
-    psSettleID->Upload();
-    psSettleParameter->Upload();
-    gpu->sim.settle_threads_per_block     = (gpu->sim.settleConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
-    if (gpu->sim.settle_threads_per_block > gpu->sim.max_shake_threads_per_block)
-        gpu->sim.settle_threads_per_block = gpu->sim.max_shake_threads_per_block;
-    if (gpu->sim.settle_threads_per_block < 1)
-        gpu->sim.settle_threads_per_block = 1;
-
-    // Find clusters consisting of a central atom with up to three peripheral atoms.
-
-    map<int, ShakeCluster> clusters;
-    vector<bool> invalidForShake(gpu->natoms, false);
-    for (int i = 0; i < (int)atom1.size(); i++) {
-        if (isShakeAtom[atom1[i]])
-            continue; // This is being taken care of with SETTLE.
-
-        // Determine which is the central atom.
-
-        bool firstIsCentral;
-        if (constraintCount[atom1[i]] > 1)
-            firstIsCentral = true;
-        else if (constraintCount[atom2[i]] > 1)
-            firstIsCentral = false;
-        else if (atom1[i] < atom2[i])
-            firstIsCentral = true;
-        else
-            firstIsCentral = false;
-        int centralID, peripheralID;
-        float centralInvMass, peripheralInvMass;
-        if (firstIsCentral) {
-            centralID = atom1[i];
-            peripheralID = atom2[i];
-            centralInvMass = invMass1[i];
-            peripheralInvMass = invMass2[i];
-        }
-        else {
-            centralID = atom2[i];
-            peripheralID = atom1[i];
-            centralInvMass = invMass2[i];
-            peripheralInvMass = invMass1[i];
-        }
-
-        // Add it to the cluster.
-
-        if (clusters.find(centralID) == clusters.end()) {
-            clusters[centralID] = ShakeCluster(centralID, centralInvMass);
-        }
-        ShakeCluster& cluster = clusters[centralID];
-        cluster.addAtom(peripheralID, distance[i], peripheralInvMass);
-        if (constraintCount[peripheralID] != 1 || invalidForShake[atom1[i]] || invalidForShake[atom2[i]]) {
-            markShakeClusterInvalid(cluster, clusters, invalidForShake);
-            map<int, ShakeCluster>::iterator otherCluster = clusters.find(peripheralID);
-            if (otherCluster != clusters.end() && otherCluster->second.valid)
-                markShakeClusterInvalid(otherCluster->second, clusters, invalidForShake);
-        }
-    }
-    int validShakeClusters = 0;
-    for (map<int, ShakeCluster>::iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
-        ShakeCluster& cluster = iter->second;
-        if (cluster.valid) {
-            cluster.valid = !invalidForShake[cluster.centralID] && cluster.size == constraintCount[cluster.centralID];
-            for (int i = 0; i < cluster.size; i++)
-                if (invalidForShake[cluster.peripheralID[i]])
-                    cluster.valid = false;
-            if (cluster.valid)
-                ++validShakeClusters;
-        }
-    }
-
-    // Fill in the Cuda streams.
-
-    CUDAStream<int4>* psShakeID             = new CUDAStream<int4>(validShakeClusters, 1, "ShakeID");
-    gpu->psShakeID                          = psShakeID;
-    gpu->sim.pShakeID                       = psShakeID->_pDevStream[0];
-    CUDAStream<float4>* psShakeParameter    = new CUDAStream<float4>(validShakeClusters, 1, "ShakeParameter");
-    gpu->psShakeParameter                   = psShakeParameter;
-    gpu->sim.pShakeParameter                = psShakeParameter->_pDevStream[0];
-    gpu->sim.ShakeConstraints               = validShakeClusters;
-    int index = 0;
-    for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
-        const ShakeCluster& cluster = iter->second;
-        if (!cluster.valid)
-            continue;
-        (*psShakeID)[index].x = cluster.centralID;
-        (*psShakeID)[index].y = cluster.peripheralID[0];
-        (*psShakeID)[index].z = cluster.size > 1 ? cluster.peripheralID[1] : -1;
-        (*psShakeID)[index].w = cluster.size > 2 ? cluster.peripheralID[2] : -1;
-        (*psShakeParameter)[index].x = cluster.centralInvMass;
-        (*psShakeParameter)[index].y = 0.5f/(cluster.centralInvMass+cluster.peripheralInvMass);
-        (*psShakeParameter)[index].z = cluster.distance*cluster.distance;
-        (*psShakeParameter)[index].w = cluster.peripheralInvMass;
-        isShakeAtom[cluster.centralID] = true;
-        isShakeAtom[cluster.peripheralID[0]] = true;
-        if (cluster.size > 1)
-            isShakeAtom[cluster.peripheralID[1]] = true;
-        if (cluster.size > 2)
-            isShakeAtom[cluster.peripheralID[2]] = true;
-        ++index;
-    }
-    psShakeID->Upload();
-    psShakeParameter->Upload();
-    gpu->sim.shakeTolerance = constraintTolerance;
-    gpu->sim.shake_threads_per_block     = (gpu->sim.ShakeConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
-    if (gpu->sim.shake_threads_per_block > gpu->sim.max_shake_threads_per_block)
-        gpu->sim.shake_threads_per_block = gpu->sim.max_shake_threads_per_block;
-    if (gpu->sim.shake_threads_per_block < 1)
-        gpu->sim.shake_threads_per_block = 1;
-
-    // Find connected constraints for CCMA.
-
-    vector<int> ccmaConstraints;
-    for (unsigned i = 0; i < atom1.size(); i++)
-        if (!isShakeAtom[atom1[i]])
-            ccmaConstraints.push_back(i);
-
-    // Record the connections between constraints.
-
-    int numCCMA = (int) ccmaConstraints.size();
-    vector<vector<int> > atomConstraints(gpu->natoms);
-    for (int i = 0; i < numCCMA; i++) {
-        atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
-        atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
-    }
-    vector<vector<int> > linkedConstraints(numCCMA);
-    for (unsigned atom = 0; atom < atomConstraints.size(); atom++) {
-        for (unsigned i = 0; i < atomConstraints[atom].size(); i++)
-            for (unsigned j = 0; j < i; j++) {
-                int c1 = atomConstraints[atom][i];
-                int c2 = atomConstraints[atom][j];
-                linkedConstraints[c1].push_back(c2);
-                linkedConstraints[c2].push_back(c1);
-            }
-    }
-    int maxLinks = 0;
-    for (unsigned i = 0; i < linkedConstraints.size(); i++)
-        maxLinks = max(maxLinks, (int) linkedConstraints[i].size());
-    int maxAtomConstraints = 0;
-    for (unsigned i = 0; i < atomConstraints.size(); i++)
-        maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
-
-    // Compute the constraint coupling matrix
-
-    vector<vector<int> > atomAngles(gpu->natoms);
-    for (int i = 0; i < (int) gpu->sim.bond_angles; i++)
-        atomAngles[(*gpu->psBondAngleID1)[i].y].push_back(i);
-    vector<vector<pair<int, double> > > matrix(numCCMA);
-    if (numCCMA > 0) {
-        for (int j = 0; j < numCCMA; j++) {
-            for (int k = 0; k < numCCMA; k++) {
-                if (j == k) {
-                    matrix[j].push_back(pair<int, double>(j, 1.0));
-                    continue;
-                }
-                double scale;
-                int cj = ccmaConstraints[j];
-                int ck = ccmaConstraints[k];
-                int atomj0 = atom1[cj];
-                int atomj1 = atom2[cj];
-                int atomk0 = atom1[ck];
-                int atomk1 = atom2[ck];
-                int atoma, atomb, atomc;
-                if (atomj0 == atomk0) {
-                    atoma = atomj1;
-                    atomb = atomj0;
-                    atomc = atomk1;
-                    scale = invMass1[cj]/(invMass1[cj]+invMass2[cj]);
-                }
-                else if (atomj1 == atomk1) {
-                    atoma = atomj0;
-                    atomb = atomj1;
-                    atomc = atomk0;
-                    scale = invMass2[cj]/(invMass1[cj]+invMass2[cj]);
-                }
-                else if (atomj0 == atomk1) {
-                    atoma = atomj1;
-                    atomb = atomj0;
-                    atomc = atomk0;
-                    scale = invMass1[cj]/(invMass1[cj]+invMass2[cj]);
-                }
-                else if (atomj1 == atomk0) {
-                    atoma = atomj0;
-                    atomb = atomj1;
-                    atomc = atomk1;
-                    scale = invMass2[cj]/(invMass1[cj]+invMass2[cj]);
-                }
-                else
-                    continue; // These constraints are not connected.
-
-                // Look for a third constraint forming a triangle with these two.
-
-                bool foundConstraint = false;
-                for (int m = 0; m < numCCMA; m++) {
-                    int other = ccmaConstraints[m];
-                    if ((atom1[other] == atoma && atom2[other] == atomc) || (atom1[other] == atomc && atom2[other] == atoma)) {
-                        double d1 = distance[cj];
-                        double d2 = distance[ck];
-                        double d3 = distance[other];
-                        matrix[j].push_back(pair<int, double>(k, scale*(d1*d1+d2*d2-d3*d3)/(2.0*d1*d2)));
-                        foundConstraint = true;
-                        break;
-                    }
-                }
-                if (!foundConstraint) {
-                    // We didn't find one, so look for an angle force field term.
-
-                    const vector<int>& angleCandidates = atomAngles[atomb];
-                    for (vector<int>::const_iterator iter = angleCandidates.begin(); iter != angleCandidates.end(); iter++) {
-                        int4 atoms = (*gpu->psBondAngleID1)[*iter];
-                        if ((atoms.x == atoma && atoms.z == atomc) || (atoms.z == atoma && atoms.x == atomc)) {
-                            double angle = (*gpu->psBondAngleParameter)[*iter].x;
-                            matrix[j].push_back(pair<int, double>(k, scale*cos(angle*PI/180.0)));
-                            break;
-                        }
-                    }
-                }
-            }
-        }
-
-        // Invert it using QR.
-
-        vector<int> matrixRowStart;
-        vector<int> matrixColIndex;
-        vector<double> matrixValue;
-        for (int i = 0; i < numCCMA; i++) {
-            matrixRowStart.push_back(matrixValue.size());
-            for (int j = 0; j < (int) matrix[i].size(); j++) {
-                pair<int, double> element = matrix[i][j];
-                matrixColIndex.push_back(element.first);
-                matrixValue.push_back(element.second);
-            }
-        }
-        matrixRowStart.push_back(matrixValue.size());
-        int *qRowStart, *qColIndex, *rRowStart, *rColIndex;
-        double *qValue, *rValue;
-        int result = QUERN_compute_qr(numCCMA, numCCMA, &matrixRowStart[0], &matrixColIndex[0], &matrixValue[0], NULL,
-                &qRowStart, &qColIndex, &qValue, &rRowStart, &rColIndex, &rValue);
-        vector<double> rhs(numCCMA);
-        matrix.clear();
-        matrix.resize(numCCMA);
-        for (int i = 0; i < numCCMA; i++) {
-            // Extract column i of the inverse matrix.
-
-            for (int j = 0; j < numCCMA; j++)
-                rhs[j] = (i == j ? 1.0 : 0.0);
-            result = QUERN_multiply_with_q_transpose(numCCMA, qRowStart, qColIndex, qValue, &rhs[0]);
-            result = QUERN_solve_with_r(numCCMA, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
-            for (int j = 0; j < numCCMA; j++) {
-                double value = rhs[j]*distance[ccmaConstraints[i]]/distance[ccmaConstraints[j]];
-                if (abs(value) > 0.05)
-                    matrix[j].push_back(pair<int, double>(i, value));
-            }
-        }
-        QUERN_free_result(qRowStart, qColIndex, qValue);
-        QUERN_free_result(rRowStart, rColIndex, rValue);
-    }
-    int maxRowElements = 0;
-    for (unsigned i = 0; i < matrix.size(); i++)
-        maxRowElements = max(maxRowElements, (int) matrix[i].size());
-    maxRowElements++;
-
-    // Sort the constraints.
-
-    vector<int> constraintOrder(numCCMA);
-    for (int i = 0; i < numCCMA; ++i)
-        constraintOrder[i] = i;
-    sort(constraintOrder.begin(), constraintOrder.end(), ConstraintOrderer(atom1, atom2, ccmaConstraints));
-    vector<int> inverseOrder(numCCMA);
-    for (int i = 0; i < numCCMA; ++i)
-        inverseOrder[constraintOrder[i]] = i;
-    for (int i = 0; i < (int)matrix.size(); ++i)
-        for (int j = 0; j < (int)matrix[i].size(); ++j)
-            matrix[i][j].first = inverseOrder[matrix[i][j].first];
-
-    // Fill in the CUDA streams.
-
-    CUDAStream<int2>* psCcmaAtoms = new CUDAStream<int2>(numCCMA, 1, "CcmaAtoms");
-    gpu->psCcmaAtoms              = psCcmaAtoms;
-    gpu->sim.pCcmaAtoms           = psCcmaAtoms->_pDevData;
-    CUDAStream<float4>* psCcmaDistance = new CUDAStream<float4>(numCCMA, 1, "CcmaDistance");
-    gpu->psCcmaDistance                = psCcmaDistance;
-    gpu->sim.pCcmaDistance             = psCcmaDistance->_pDevData;
-    CUDAStream<int>* psCcmaAtomConstraints = new CUDAStream<int>(gpu->natoms*maxAtomConstraints, 1, "CcmaAtomConstraints");
-    gpu->psCcmaAtomConstraints             = psCcmaAtomConstraints;
-    gpu->sim.pCcmaAtomConstraints          = psCcmaAtomConstraints->_pDevData;
-    CUDAStream<int>* psCcmaNumAtomConstraints = new CUDAStream<int>(gpu->natoms, 1, "CcmaAtomConstraintsIndex");
-    gpu->psCcmaNumAtomConstraints             = psCcmaNumAtomConstraints;
-    gpu->sim.pCcmaNumAtomConstraints          = psCcmaNumAtomConstraints->_pDevData;
-    CUDAStream<float>* psCcmaDelta1 = new CUDAStream<float>(numCCMA, 1, "CcmaDelta1");
-    gpu->psCcmaDelta1             = psCcmaDelta1;
-    gpu->sim.pCcmaDelta1          = psCcmaDelta1->_pDevData;
-    CUDAStream<float>* psCcmaDelta2 = new CUDAStream<float>(numCCMA, 1, "CcmaDelta2");
-    gpu->psCcmaDelta2             = psCcmaDelta2;
-    gpu->sim.pCcmaDelta2          = psCcmaDelta2->_pDevData;
-    CUDAStream<float>* psCcmaReducedMass = new CUDAStream<float>(numCCMA, 1, "CcmaReducedMass");
-    gpu->psCcmaReducedMass             = psCcmaReducedMass;
-    gpu->sim.pCcmaReducedMass          = psCcmaReducedMass->_pDevData;
-    CUDAStream<unsigned int>* psConstraintMatrixColumn = new CUDAStream<unsigned int>(numCCMA*maxRowElements, 1, "ConstraintMatrixColumn");
-    gpu->psConstraintMatrixColumn               = psConstraintMatrixColumn;
-    gpu->sim.pConstraintMatrixColumn            = psConstraintMatrixColumn->_pDevData;
-    CUDAStream<float>* psConstraintMatrixValue = new CUDAStream<float>(numCCMA*maxRowElements, 1, "ConstraintMatrixValue");
-    gpu->psConstraintMatrixValue             = psConstraintMatrixValue;
-    gpu->sim.pConstraintMatrixValue          = psConstraintMatrixValue->_pDevData;
-    cudaHostAlloc((void**) &gpu->ccmaConvergedHostMarker, sizeof(int), cudaHostAllocMapped);
-    cudaHostGetDevicePointer((void**) &gpu->sim.ccmaConvergedDeviceMarker, (void*) gpu->ccmaConvergedHostMarker, 0);
-    cudaEventCreate(&gpu->ccmaEvent);
-    gpu->sim.ccmaConstraints = numCCMA;
-    for (int i = 0; i < numCCMA; i++) {
-        int index = constraintOrder[i];
-        int c = ccmaConstraints[index];
-        (*psCcmaAtoms)[i].x = atom1[c];
-        (*psCcmaAtoms)[i].y = atom2[c];
-        (*psCcmaDistance)[i].w = distance[c];
-        (*psCcmaReducedMass)[i] = 0.5f/(invMass1[c]+invMass2[c]);
-        for (unsigned int j = 0; j < matrix[index].size(); j++) {
-            (*psConstraintMatrixColumn)[i+j*numCCMA] = matrix[index][j].first;
-            (*psConstraintMatrixValue)[i+j*numCCMA] = (float) matrix[index][j].second;
-        }
-        (*psConstraintMatrixColumn)[i+matrix[index].size()*numCCMA] = numCCMA;
-    }
-    for (unsigned int i = 0; i < atomConstraints.size(); i++) {
-        (*psCcmaNumAtomConstraints)[i] = atomConstraints[i].size();
-        for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
-            bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
-            (*psCcmaAtomConstraints)[i+j*gpu->natoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
-        }
-    }
-    psCcmaAtoms->Upload();
-    psCcmaDistance->Upload();
-    psCcmaReducedMass->Upload();
-    psCcmaAtomConstraints->Upload();
-    psCcmaNumAtomConstraints->Upload();
-    psConstraintMatrixColumn->Upload();
-    psConstraintMatrixValue->Upload();
-    gpu->sim.ccma_threads_per_block = (gpu->sim.ccmaConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
-    if (gpu->sim.ccma_threads_per_block > gpu->sim.threads_per_block)
-        gpu->sim.ccma_threads_per_block = gpu->sim.threads_per_block;
-    if (gpu->sim.ccma_threads_per_block < gpu->sim.blocks)
-        gpu->sim.ccma_threads_per_block = gpu->sim.blocks;
-}
-
-extern "C"
-int gpuAllocateInitialBuffers(gpuContext gpu)
-{
-    gpu->sim.atoms                      = gpu->natoms;
-    gpu->sim.paddedNumberOfAtoms        = ((gpu->sim.atoms + GRID - 1) >> GRIDBITS) << GRIDBITS;
-    gpu->sim.degreesOfFreedom           = 3 * gpu->sim.atoms - 6;
-    gpu->gpAtomTable                    = NULL;
-    gpu->gAtomTypes                     = 0;
-    gpu->psPosq4                        = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "Posq");
-    gpu->sim.stride                     = gpu->psPosq4->_stride;
-    gpu->sim.stride2                    = gpu->sim.stride * 2;
-    gpu->sim.stride3                    = gpu->sim.stride * 3;
-    gpu->sim.stride4                    = gpu->sim.stride * 4;
-    gpu->sim.pPosq                      = gpu->psPosq4->_pDevStream[0];
-    gpu->sim.stride                     = gpu->psPosq4->_stride;
-    gpu->sim.stride2                    = 2 * gpu->sim.stride;
-    gpu->sim.stride3                    = 3 * gpu->sim.stride;
-    gpu->sim.stride4                    = 4 * gpu->sim.stride;
-    gpu->psPosqP4                       = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "PosqP");
-    gpu->sim.pPosqP                     = gpu->psPosqP4->_pDevStream[0];
-    gpu->psOldPosq4                     = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "OldPosq");
-    gpu->sim.pOldPosq                   = gpu->psOldPosq4->_pDevStream[0];
-    gpu->psVelm4                        = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "Velm");
-    gpu->sim.pVelm4                     = gpu->psVelm4->_pDevStream[0];
-    gpu->psBornRadii                    = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1, "BornRadii");
-    gpu->sim.pBornRadii                 = gpu->psBornRadii->_pDevStream[0];
-    gpu->psObcChain                     = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1, "ObcChain");
-    gpu->sim.pObcChain                  = gpu->psObcChain->_pDevStream[0];
-    gpu->psSigEps2                      = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "SigEps2");
-    gpu->sim.pAttr                      = gpu->psSigEps2->_pDevStream[0];
-    gpu->psObcData                      = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "ObcData");
-    gpu->sim.pObcData                   = gpu->psObcData->_pDevStream[0];
-    gpu->psGBVIData                     = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "GBVIData");
-    gpu->sim.pGBVIData                  = gpu->psGBVIData->_pDevStream[0];
-    gpu->psGBVISwitchDerivative        = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1, "psGBVISwitchDerivative");
-    gpu->sim.pGBVISwitchDerivative      = gpu->psGBVISwitchDerivative->_pDevStream[0];
-    gpu->psStepSize                     = new CUDAStream<float2>(1, 1, "StepSize");
-    gpu->sim.pStepSize                  = gpu->psStepSize->_pDevStream[0];
-    (*gpu->psStepSize)[0] = make_float2(0.0f, 0.0f);
-    gpu->psStepSize->Upload();
-    gpu->psLangevinParameters           = new CUDAStream<float>(3, 1, "LangevinParameters");
-    gpu->sim.pLangevinParameters        = gpu->psLangevinParameters->_pDevStream[0];
-    gpu->pAtomSymbol                    = new unsigned char[gpu->natoms];
-    gpu->psAtomIndex                    = new CUDAStream<int>(gpu->sim.paddedNumberOfAtoms, 1, "AtomIndex");
-    gpu->sim.pAtomIndex                 = gpu->psAtomIndex->_pDevStream[0];
-    for (int i = 0; i < (int) gpu->sim.paddedNumberOfAtoms; i++)
-        (*gpu->psAtomIndex)[i] = i;
-    gpu->psAtomIndex->Upload();
-    gpu->posCellOffsets.resize(gpu->natoms, make_int3(0, 0, 0));
-    gpu->sim.outputBuffers = 0;
-    // Determine randoms
-    gpu->seed                           = 1;
-    gpu->sim.randomFrames               = 20;
-    gpu->sim.randomIterations           = gpu->sim.randomFrames;
-    gpu->sim.randoms                    = gpu->sim.randomFrames * gpu->sim.paddedNumberOfAtoms;
-    gpu->sim.totalRandoms               = gpu->sim.randoms + gpu->sim.paddedNumberOfAtoms;
-    gpu->psRandom4                      = new CUDAStream<float4>(gpu->sim.totalRandoms, 1, "Random4");
-    gpu->psRandom2                      = new CUDAStream<float2>(gpu->sim.totalRandoms, 1, "Random2");
-    gpu->psRandomPosition               = new CUDAStream<int>(gpu->sim.blocks, 1, "RandomPosition");
-    gpu->psRandomSeed                   = new CUDAStream<uint4>(gpu->sim.blocks * gpu->sim.random_threads_per_block, 1, "RandomSeed");
-    gpu->sim.pRandom4                   = gpu->psRandom4->_pDevStream[0];
-    gpu->sim.pRandom2                   = gpu->psRandom2->_pDevStream[0];
-    gpu->sim.pRandomPosition            = gpu->psRandomPosition->_pDevStream[0];
-    gpu->sim.pRandomSeed                = gpu->psRandomSeed->_pDevStream[0];
-
-    // Allocate and clear linear momentum buffer
-    gpu->psLinearMomentum = new CUDAStream<float4>(gpu->sim.blocks, 1, "LinearMomentum");
-    gpu->sim.pLinearMomentum = gpu->psLinearMomentum->_pDevStream[0];
-    for (int i = 0; i < (int) gpu->sim.blocks; i++)
-    {
-        (*gpu->psLinearMomentum)[i].x = 0.0f;
-        (*gpu->psLinearMomentum)[i].y = 0.0f;
-        (*gpu->psLinearMomentum)[i].z = 0.0f;
-        (*gpu->psLinearMomentum)[i].w = 0.0f;
-    }
-    gpu->psLinearMomentum->Upload();
-
-    return 1;
-}
-
-extern "C"
-void gpuSetPositions(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
-{
-    for (int i = 0; i < gpu->natoms; i++)
-    {
-        (*gpu->psPosq4)[i].x = x[i];
-        (*gpu->psPosq4)[i].y = y[i];
-        (*gpu->psPosq4)[i].z = z[i];
-    }
-    gpu->psPosq4->Upload();
-
-	 // set flag to recalculate Born radii
-
-	 gpu->bRecalculateBornRadii = true;
-} 
-
-extern "C"
-void gpuSetVelocities(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
-{
-    for (int i = 0; i < gpu->natoms; i++)
-    {
-        (*gpu->psVelm4)[i].x = x[i];
-        (*gpu->psVelm4)[i].y = y[i];
-        (*gpu->psVelm4)[i].z = z[i];
-    }
-    gpu->psVelm4->Upload();
-} 
-
-extern "C"
-void gpuSetMass(gpuContext gpu, const vector<float>& mass)
-{
-    float totalMass = 0.0f;
-    for (int i = 0; i < gpu->natoms; i++)
-    {
-        (*gpu->psVelm4)[i].w = 1.0f/mass[i];
-        totalMass += mass[i];
-    }
-    gpu->sim.inverseTotalMass = 1.0f / totalMass;
-    gpu->psVelm4->Upload();
-} 
-
-extern "C"
-void gpuInitializeRandoms(gpuContext gpu)
-{
-    for (int i = 0; i < (int) gpu->sim.blocks; i++)
-    {
-        (*gpu->psRandomPosition)[i] = 0;
-    }
-    int seed = gpu->seed | ((gpu->seed ^ 0xffffffff) << 16);
-#if 0
-    srand(seed);
-    for (int i = 0; i < (int) (gpu->sim.blocks * gpu->sim.random_threads_per_block); i++)
-    {
-        (*gpu->psRandomSeed)[i].x = rand();
-        (*gpu->psRandomSeed)[i].y = rand();
-        (*gpu->psRandomSeed)[i].z = rand();
-        (*gpu->psRandomSeed)[i].w = rand();
-    }
-#else
-    RNG rng(seed);
-    for (int i = 0; i < (int) (gpu->sim.blocks * gpu->sim.random_threads_per_block); i++)
-    {
-        (*gpu->psRandomSeed)[i].x = rng.rand_int();
-        (*gpu->psRandomSeed)[i].y = rng.rand_int();
-        (*gpu->psRandomSeed)[i].z = rng.rand_int();
-        (*gpu->psRandomSeed)[i].w = rng.rand_int();
-    }
-#endif
-    gpu->psRandomPosition->Upload();
-    gpu->psRandomSeed->Upload();
-    gpuSetConstants(gpu);
-    kGenerateRandoms(gpu);
-    return;
-}
-
-extern "C"
-bool OPENMMCUDA_EXPORT gpuIsAvailable()
-{
-    int deviceCount;
-    cudaGetDeviceCount(&deviceCount);
-    return (deviceCount > 0);
-}
-
-extern "C"
-void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
-{
-    gpuContext gpu = new _gpuContext;
-    int LRFSize = 0;
-    int SMCount = 0;
-    int SMMajor = 0;
-    int SMMinor = 0;
-
-    // Select which device to use
-    int currentDevice;
-    cudaError_t status = cudaGetDevice(&currentDevice);
-    RTERROR(status, "Error getting CUDA device")
-    if (device != currentDevice)
-        cudaSetDevice(device); // Ignore errors
-    status = cudaGetDevice(&gpu->device);
-    RTERROR(status, "Error getting CUDA device")
-    status = cudaSetDeviceFlags(cudaDeviceMapHost+(useBlockingSync ? cudaDeviceBlockingSync : cudaDeviceScheduleAuto));
-    RTERROR(status, "Error setting device flags")
-    gpu->useBlockingSync = useBlockingSync;
-
-    // Determine kernel call configuration
-    cudaDeviceProp deviceProp;
-    cudaGetDeviceProperties(&deviceProp, currentDevice);
-
-    // Determine SM version
-    if (deviceProp.major == 1)
-    {
-        switch (deviceProp.minor)
-        {
-        case 0:
-        case 1:
-            gpu->sm_version = SM_10;
-            gpu->sim.workUnitsPerSM = G8X_NONBOND_WORKUNITS_PER_SM;
-            break;
-
-        default:
-            gpu->sm_version = SM_12;
-            gpu->sim.workUnitsPerSM = GT2XX_NONBOND_WORKUNITS_PER_SM;
-            break;
-        }
-    } 
-    else
-    {    
-        gpu->sm_version = SM_20;
-        gpu->sim.workUnitsPerSM = GF1XX_NONBOND_WORKUNITS_PER_SM;
-    }
-
-    if (deviceProp.regsPerBlock == 8192)
-    {
-        gpu->sim.nonbond_threads_per_block          = G8X_NONBOND_THREADS_PER_BLOCK;
-        gpu->sim.bornForce2_threads_per_block       = G8X_BORNFORCE2_THREADS_PER_BLOCK;
-        gpu->sim.max_shake_threads_per_block        = G8X_SHAKE_THREADS_PER_BLOCK;
-        gpu->sim.max_update_threads_per_block       = G8X_UPDATE_THREADS_PER_BLOCK;
-        gpu->sim.max_localForces_threads_per_block  = G8X_LOCALFORCES_THREADS_PER_BLOCK;
-        gpu->sim.threads_per_block                  = G8X_THREADS_PER_BLOCK;
-        gpu->sim.random_threads_per_block           = G8X_RANDOM_THREADS_PER_BLOCK;
-        gpu->blocksPerSM                            = G8X_BLOCKS_PER_SM;
-    }
-    else if (deviceProp.regsPerBlock <= 16384)
-    {
-        gpu->sim.nonbond_threads_per_block          = GT2XX_NONBOND_THREADS_PER_BLOCK;
-        gpu->sim.bornForce2_threads_per_block       = GT2XX_BORNFORCE2_THREADS_PER_BLOCK;
-        gpu->sim.max_shake_threads_per_block        = GT2XX_SHAKE_THREADS_PER_BLOCK;
-        gpu->sim.max_update_threads_per_block       = GT2XX_UPDATE_THREADS_PER_BLOCK;
-        gpu->sim.max_localForces_threads_per_block  = GT2XX_LOCALFORCES_THREADS_PER_BLOCK;
-        gpu->sim.threads_per_block                  = GT2XX_THREADS_PER_BLOCK;
-        gpu->sim.random_threads_per_block           = GT2XX_RANDOM_THREADS_PER_BLOCK;
-        gpu->blocksPerSM                            = GT2XX_BLOCKS_PER_SM;
-    }
-    else
-    {
-        gpu->sim.nonbond_threads_per_block          = GF1XX_NONBOND_THREADS_PER_BLOCK;
-        gpu->sim.bornForce2_threads_per_block       = GF1XX_BORNFORCE2_THREADS_PER_BLOCK;
-        gpu->sim.max_shake_threads_per_block        = GF1XX_SHAKE_THREADS_PER_BLOCK;
-        gpu->sim.max_update_threads_per_block       = GF1XX_UPDATE_THREADS_PER_BLOCK;
-        gpu->sim.max_localForces_threads_per_block  = GF1XX_LOCALFORCES_THREADS_PER_BLOCK;
-        gpu->sim.threads_per_block                  = GF1XX_THREADS_PER_BLOCK;
-        gpu->sim.random_threads_per_block           = GF1XX_RANDOM_THREADS_PER_BLOCK;
-        gpu->blocksPerSM                            = GF1XX_BLOCKS_PER_SM;
-    }
-    gpu->sim.nonbond_blocks = deviceProp.multiProcessorCount*gpu->blocksPerSM;
-    gpu->sim.bornForce2_blocks = deviceProp.multiProcessorCount*gpu->blocksPerSM;
-    gpu->sim.blocks = deviceProp.multiProcessorCount;
-    gpu->sharedMemoryPerBlock = deviceProp.sharedMemPerBlock;
-
-    gpu->sim.shake_threads_per_block                = gpu->sim.max_shake_threads_per_block;
-    gpu->sim.localForces_threads_per_block          = gpu->sim.max_localForces_threads_per_block;
-
-    gpu->natoms = numAtoms;
-    gpuAllocateInitialBuffers(gpu);
-
-    gpu->iterations = 0;
-    gpu->sim.update_threads_per_block               = (gpu->natoms + gpu->sim.blocks - 1) / gpu->sim.blocks;
-    if (gpu->sim.update_threads_per_block > gpu->sim.max_update_threads_per_block)
-        gpu->sim.update_threads_per_block = gpu->sim.max_update_threads_per_block;
-    if (gpu->sim.update_threads_per_block < gpu->psLangevinParameters->_length)
-            gpu->sim.update_threads_per_block = gpu->psLangevinParameters->_length;
-    gpu->sim.bf_reduce_threads_per_block = gpu->sim.update_threads_per_block;
-    gpu->sim.bsf_reduce_threads_per_block = (gpu->sim.stride4 + gpu->natoms + gpu->sim.blocks - 1) / gpu->sim.blocks;
-    gpu->sim.bsf_reduce_threads_per_block = ((gpu->sim.bsf_reduce_threads_per_block + (GRID - 1)) / GRID) * GRID;
-    if (gpu->sim.bsf_reduce_threads_per_block > gpu->sim.threads_per_block)
-        gpu->sim.bsf_reduce_threads_per_block = gpu->sim.threads_per_block;
-    if (gpu->sim.bsf_reduce_threads_per_block < 1)
-        gpu->sim.bsf_reduce_threads_per_block = 1;
-
-    // Initialize constants to reasonable values
-    gpu->sim.probeRadius            = probeRadius;
-    gpu->sim.surfaceAreaFactor      = surfaceAreaFactor;
-    gpu->sim.electricConstant       = electricConstant;
-    gpu->sim.nonbondedMethod        = NO_CUTOFF;
-    gpu->sim.nonbondedCutoff        = 0.0f;
-    gpu->sim.nonbondedCutoffSqr     = 0.0f;
-
-    gpu->sim.bigFloat               = 99999999.0f;
-    gpu->sim.forceConversionFactor  = forceConversionFactor;
-    gpu->sim.preFactor              = 2.0f*electricConstant*((1.0f/defaultInnerDielectric)-(1.0f/defaultSolventDielectric))*gpu->sim.forceConversionFactor;
-    gpu->sim.dielectricOffset       = dielectricOffset;
-    gpu->sim.alphaOBC               = alphaOBC;
-    gpu->sim.betaOBC                = betaOBC;
-    gpu->sim.gammaOBC               = gammaOBC;
-    gpu->sim.maxShakeIterations     = 15;
-    gpu->sim.shakeTolerance         = 1.0e-04f * 2.0f;
-    gpu->sim.InvMassJ               = 9.920635e-001f;
-    gpu->grid                       = GRID;
-    gpu->bCalculateCM               = false;
-    gpu->bRemoveCM                  = false;
-    gpu->bRecalculateBornRadii      = true;
-    gpu->bIncludeGBSA               = false;
-    gpu->bIncludeGBVI               = false;
-    gpuInitializeRandoms(gpu);
-
-    // To be determined later
-    gpu->psLJ14ID                   = NULL;
-    gpu->psForce4                   = NULL;
-    gpu->psEnergy                   = NULL;
-    gpu->sim.pForce4                = NULL;
-    gpu->psBornForce                = NULL;
-    gpu->sim.pBornForce             = NULL;
-    gpu->psBornSum                  = NULL;
-    gpu->sim.pBornSum               = NULL;
-    gpu->psBondID                   = NULL;
-    gpu->psBondParameter            = NULL;
-    gpu->psBondAngleID1             = NULL;
-    gpu->psBondAngleID2             = NULL;
-    gpu->psBondAngleParameter       = NULL;
-    gpu->psDihedralID1              = NULL;
-    gpu->psDihedralID2              = NULL;
-    gpu->psDihedralParameter        = NULL;
-    gpu->psRbDihedralID1            = NULL;
-    gpu->psRbDihedralID2            = NULL;
-    gpu->psRbDihedralParameter1     = NULL;
-    gpu->psRbDihedralParameter2     = NULL;
-    gpu->psLJ14ID                   = NULL;
-    gpu->psLJ14Parameter            = NULL;
-    gpu->psCustomParams             = NULL;
-    gpu->psCustomBondID             = NULL;
-    gpu->psCustomBondParams         = NULL;
-    gpu->psCustomAngleID1           = NULL;
-    gpu->psCustomAngleID2           = NULL;
-    gpu->psCustomAngleParams        = NULL;
-    gpu->psCustomTorsionID1         = NULL;
-    gpu->psCustomTorsionID2         = NULL;
-    gpu->psCustomTorsionParams      = NULL;
-    gpu->psCustomExternalID         = NULL;
-    gpu->psCustomExternalParams     = NULL;
-    gpu->psEwaldCosSinSum           = NULL;
-    gpu->psTabulatedErfc            = NULL;
-    gpu->psPmeGrid                  = NULL;
-    gpu->psPmeBsplineModuli[0]      = NULL;
-    gpu->psPmeBsplineModuli[1]      = NULL;
-    gpu->psPmeBsplineModuli[2]      = NULL;
-    gpu->psPmeBsplineTheta          = NULL;
-    gpu->psPmeBsplineDtheta         = NULL;
-    gpu->psPmeAtomRange             = NULL;
-    gpu->psPmeAtomGridIndex         = NULL;
-    gpu->psShakeID                  = NULL;
-    gpu->psShakeParameter           = NULL;
-    gpu->psSettleID                 = NULL;
-    gpu->psSettleParameter          = NULL;
-    gpu->psExclusion                = NULL;
-    gpu->psExclusionIndex           = NULL;
-    gpu->psWorkUnit                 = NULL;
-    gpu->psInteractingWorkUnit      = NULL;
-    gpu->psInteractionFlag          = NULL;
-    gpu->psInteractionCount         = NULL;
-    gpu->psGridBoundingBox          = NULL;
-    gpu->psGridCenter               = NULL;
-    gpu->psCcmaAtoms                = NULL;
-    gpu->psCcmaDistance             = NULL;
-    gpu->psCcmaAtomConstraints      = NULL;
-    gpu->psCcmaNumAtomConstraints   = NULL;
-    gpu->psCcmaDelta1               = NULL;
-    gpu->psCcmaDelta2               = NULL;
-    gpu->psCcmaReducedMass          = NULL;
-    gpu->psConstraintMatrixColumn   = NULL;
-    gpu->psConstraintMatrixValue    = NULL;
-    gpu->psTabulatedFunctionParams  = NULL;
-    for (int i = 0; i < MAX_TABULATED_FUNCTIONS; i++)
-        gpu->tabulatedFunctions[i].coefficients = NULL;
-    gpu->sim.customExpressionStackSize = 0;
-    gpu->sim.customBonds = 0;
-    gpu->sim.customAngles = 0;
-    gpu->sim.customTorsions = 0;
-    
-    // Initialize output buffer before reading parameters
-    gpu->pOutputBufferCounter       = new unsigned int[gpu->sim.paddedNumberOfAtoms];
-    memset(gpu->pOutputBufferCounter, 0, gpu->sim.paddedNumberOfAtoms * sizeof(unsigned int));
-
-    return (void*)gpu;
-}
-
-extern "C"
-void gpuSetLangevinIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature, float errorTol) {
-    gpu->sim.deltaT                 = deltaT;
-    gpu->sim.oneOverDeltaT          = 1.0f/deltaT;
-    gpu->sim.errorTol               = errorTol;
-    gpu->sim.tau                    = tau;
-    gpu->sim.T                      = temperature;
-    gpu->sim.kT                     = BOLTZ * gpu->sim.T;
-    double vscale = exp(-deltaT/tau);
-    double fscale = (1-vscale)*tau;
-    double noisescale = sqrt(2*gpu->sim.kT/tau)*sqrt(0.5*(1-vscale*vscale)*tau);
-    (*gpu->psLangevinParameters)[0] = (float) vscale;
-    (*gpu->psLangevinParameters)[1] = (float) fscale;
-    (*gpu->psLangevinParameters)[2] = (float) noisescale;
-    gpu->psLangevinParameters->Upload();
-    gpu->psStepSize->Download();
-    if ((*gpu->psStepSize)[0].x == 0)
-        (*gpu->psStepSize)[0].x = deltaT;
-    (*gpu->psStepSize)[0].y = deltaT;
-    gpu->psStepSize->Upload();
-}
-
-extern "C"
-void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT, float errorTol) {
-    gpu->sim.deltaT                 = deltaT;
-    gpu->sim.oneOverDeltaT          = 1.0f/deltaT;
-    gpu->sim.errorTol               = errorTol;
-    gpu->psStepSize->Download();
-    if ((*gpu->psStepSize)[0].x == 0)
-        (*gpu->psStepSize)[0].x = deltaT;
-    (*gpu->psStepSize)[0].y = deltaT;
-    gpu->psStepSize->Upload();
-}
-
-extern "C"
-void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature) {
-    gpu->sim.deltaT                 = deltaT;
-    gpu->sim.oneOverDeltaT          = 1.0f/deltaT;
-    gpu->sim.tau                    = tau;
-    gpu->sim.tauDeltaT              = gpu->sim.deltaT * gpu->sim.tau;
-    gpu->sim.T                      = temperature;
-    gpu->sim.kT                     = BOLTZ * gpu->sim.T;
-    gpu->sim.noiseAmplitude         = sqrt(2.0f*gpu->sim.kT*deltaT*tau);
-    gpu->psStepSize->Download();
-    if ((*gpu->psStepSize)[0].x == 0)
-        (*gpu->psStepSize)[0].x = deltaT;
-    (*gpu->psStepSize)[0].y = deltaT;
-    gpu->psStepSize->Upload();
-}
-
-extern "C"
-void gpuSetAndersenThermostatParameters(gpuContext gpu, float temperature, float collisionFrequency) {
-    gpu->sim.T                      = temperature;
-    gpu->sim.kT                     = BOLTZ * gpu->sim.T;
-    gpu->sim.collisionFrequency     = collisionFrequency;
-}
-
-extern "C"
-void gpuShutDown(gpuContext gpu)
-{
-    // Delete sysmem pointers
-    delete[] gpu->pOutputBufferCounter;
-    delete[] gpu->gpAtomTable;
-    delete[] gpu->pAtomSymbol;
-
-    // Delete device pointers
-    delete gpu->psPosq4;
-    delete gpu->psPosqP4;
-    delete gpu->psOldPosq4;
-    delete gpu->psVelm4;
-    delete gpu->psForce4;
-    delete gpu->psEnergy;
-    delete gpu->psSigEps2;
-    if (gpu->psCustomParams != NULL)
-        delete gpu->psCustomParams;
-    if (gpu->psCustomBondParams != NULL) {
-        delete gpu->psCustomBondID;
-        delete gpu->psCustomBondParams;
-    }
-    if (gpu->psCustomAngleParams != NULL) {
-        delete gpu->psCustomAngleID1;
-        delete gpu->psCustomAngleID2;
-        delete gpu->psCustomAngleParams;
-    }
-    if (gpu->psCustomTorsionParams != NULL) {
-        delete gpu->psCustomTorsionID1;
-        delete gpu->psCustomTorsionID2;
-        delete gpu->psCustomTorsionParams;
-    }
-    if (gpu->psCustomExternalParams != NULL) {
-        delete gpu->psCustomExternalID;
-        delete gpu->psCustomExternalParams;
-    }
-    if (gpu->psEwaldCosSinSum != NULL)
-        delete gpu->psEwaldCosSinSum;
-    if (gpu->psPmeGrid != NULL) {
-        delete gpu->psPmeGrid;
-        delete gpu->psPmeBsplineModuli[0];
-        delete gpu->psPmeBsplineModuli[1];
-        delete gpu->psPmeBsplineModuli[2];
-        delete gpu->psPmeBsplineTheta;
-        delete gpu->psPmeBsplineDtheta;
-        delete gpu->psPmeAtomRange;
-        delete gpu->psPmeAtomGridIndex;
-        cufftDestroy(gpu->fftplan);
-    }
-    if (gpu->psTabulatedErfc != NULL)
-        delete gpu->psTabulatedErfc;
-    delete gpu->psObcData;
-    delete gpu->psGBVIData;
-    delete gpu->psGBVISwitchDerivative;
-    delete gpu->psObcChain;
-    delete gpu->psBornForce;
-    delete gpu->psBornRadii;
-    delete gpu->psBornSum;
-    delete gpu->psBondID;
-    delete gpu->psBondParameter;
-    delete gpu->psBondAngleID1;
-    delete gpu->psBondAngleID2;
-    delete gpu->psBondAngleParameter;
-    delete gpu->psDihedralID1;
-    delete gpu->psDihedralID2;
-    delete gpu->psDihedralParameter;
-    delete gpu->psRbDihedralID1;
-    delete gpu->psRbDihedralID2;
-    delete gpu->psRbDihedralParameter1;
-    delete gpu->psRbDihedralParameter2;
-    delete gpu->psLJ14ID;
-    delete gpu->psLJ14Parameter;
-    delete gpu->psShakeID;
-    delete gpu->psShakeParameter;
-    delete gpu->psSettleID;
-    delete gpu->psSettleParameter;
-    delete gpu->psExclusion;
-    delete gpu->psExclusionIndex;
-    delete gpu->psWorkUnit;
-    delete gpu->psInteractingWorkUnit;
-    delete gpu->psInteractionFlag;
-    delete gpu->psInteractionCount;
-    delete gpu->psStepSize;
-    delete gpu->psLangevinParameters;
-    delete gpu->psRandom4;
-    delete gpu->psRandom2;
-    delete gpu->psRandomPosition;    
-    delete gpu->psRandomSeed;
-    delete gpu->psLinearMomentum;
-    delete gpu->psAtomIndex;
-    delete gpu->psGridBoundingBox;
-    delete gpu->psGridCenter;
-    delete gpu->psCcmaAtoms;
-    delete gpu->psCcmaDistance;
-    delete gpu->psCcmaAtomConstraints;
-    delete gpu->psCcmaNumAtomConstraints;
-    delete gpu->psCcmaDelta1;
-    delete gpu->psCcmaDelta2;
-    delete gpu->psCcmaReducedMass;
-    cudaEventDestroy(gpu->ccmaEvent);
-    delete gpu->psConstraintMatrixColumn;
-    delete gpu->psConstraintMatrixValue;
-    delete gpu->psTabulatedFunctionParams;
-    for (int i = 0; i < MAX_TABULATED_FUNCTIONS; i++)
-        if (gpu->tabulatedFunctions[i].coefficients != NULL)
-            delete gpu->tabulatedFunctions[i].coefficients;
-    if (gpu->compactPlan.valid)
-        destroyCompactionPlan(gpu->compactPlan);
-
-    // Wrap up
-    delete gpu;
-    cudaThreadExit();
-    return;
-}
-
-extern "C"
-int gpuBuildOutputBuffers(gpuContext gpu)
-{
-    // Select the number of output buffer to use.
-    gpu->bOutputBufferPerWarp           = true;
-    gpu->sim.nonbondOutputBuffers       = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block / GRID;
-    if (gpu->sim.nonbondOutputBuffers >= gpu->sim.paddedNumberOfAtoms/GRID)
-    {
-        // For small systems, it is more efficient to have one output buffer per block of 32 atoms instead of one per warp.
-        gpu->bOutputBufferPerWarp           = false;
-        gpu->sim.nonbondOutputBuffers       = gpu->sim.paddedNumberOfAtoms / GRID;
-    }
-    if (gpu->sim.nonbondOutputBuffers > gpu->sim.outputBuffers)
-        gpu->sim.outputBuffers = gpu->sim.nonbondOutputBuffers;
-
-    unsigned int outputBuffers = gpu->sim.outputBuffers;
-    for (unsigned int i = 0; i < gpu->sim.paddedNumberOfAtoms; i++)
-    {
-        if (outputBuffers < gpu->pOutputBufferCounter[i])
-        {
-            outputBuffers = gpu->pOutputBufferCounter[i];
-        }
-    }    
-    gpu->sim.outputBuffers      = outputBuffers;
-    gpu->sim.energyOutputBuffers = max(gpu->sim.nonbond_threads_per_block, gpu->sim.localForces_threads_per_block)*gpu->sim.blocks;
-    gpu->psForce4               = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, outputBuffers, "Force");
-    gpu->psEnergy               = new CUDAStream<float>(gpu->sim.energyOutputBuffers, 1, "Energy");
-    gpu->psBornForce            = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornForce");
-    gpu->psBornSum              = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornSum");
-    gpu->sim.pForce4            = gpu->psForce4->_pDevStream[0];
-    gpu->sim.pEnergy            = gpu->psEnergy->_pDevStream[0];
-    gpu->sim.pBornForce         = gpu->psBornForce->_pDevStream[0];
-    gpu->sim.pBornSum           = gpu->psBornSum->_pDevStream[0];
-
-    // Determine local energy paramter offsets for bonded interactions
-    gpu->sim.bond_offset        =                                  gpu->psBondParameter->_stride;
-    gpu->sim.bond_angle_offset  = gpu->sim.bond_offset           + gpu->psBondAngleParameter->_stride;
-    gpu->sim.dihedral_offset    = gpu->sim.bond_angle_offset     + gpu->psDihedralParameter->_stride;
-    gpu->sim.rb_dihedral_offset = gpu->sim.dihedral_offset       + gpu->psRbDihedralParameter1->_stride;
-    gpu->sim.LJ14_offset        = gpu->sim.rb_dihedral_offset    + gpu->psLJ14Parameter->_stride;
-    gpu->sim.localForces_threads_per_block  = (max(gpu->sim.LJ14_offset, gpu->sim.customBonds) / gpu->sim.blocks + 15) & 0xfffffff0;
-    if (gpu->sim.localForces_threads_per_block > gpu->sim.max_localForces_threads_per_block)
-        gpu->sim.localForces_threads_per_block = gpu->sim.max_localForces_threads_per_block;
-    if (gpu->sim.localForces_threads_per_block < 1)
-        gpu->sim.localForces_threads_per_block = 1;
-
-    // Flip local force output buffers
-    int flip = outputBuffers - 1;
-    for (int i = 0; i < (int) gpu->sim.bonds; i++)
-    {
-        (*gpu->psBondID)[i].z = flip - (*gpu->psBondID)[i].z;
-        (*gpu->psBondID)[i].w = flip - (*gpu->psBondID)[i].w;
-    }
-    for (int i = 0; i < (int) gpu->sim.bond_angles; i++)
-    {
-        (*gpu->psBondAngleID1)[i].w = flip - (*gpu->psBondAngleID1)[i].w;
-        (*gpu->psBondAngleID2)[i].x = flip - (*gpu->psBondAngleID2)[i].x;
-        (*gpu->psBondAngleID2)[i].y = flip - (*gpu->psBondAngleID2)[i].y;
-    }
-    for (int i = 0; i < (int) gpu->sim.dihedrals; i++)
-    {
-        (*gpu->psDihedralID2)[i].x = flip - (*gpu->psDihedralID2)[i].x;
-        (*gpu->psDihedralID2)[i].y = flip - (*gpu->psDihedralID2)[i].y;
-        (*gpu->psDihedralID2)[i].z = flip - (*gpu->psDihedralID2)[i].z;
-        (*gpu->psDihedralID2)[i].w = flip - (*gpu->psDihedralID2)[i].w;
-    }
-    for (int i = 0; i < (int) gpu->sim.rb_dihedrals; i++)
-    {
-        (*gpu->psRbDihedralID2)[i].x = flip - (*gpu->psRbDihedralID2)[i].x;
-        (*gpu->psRbDihedralID2)[i].y = flip - (*gpu->psRbDihedralID2)[i].y;
-        (*gpu->psRbDihedralID2)[i].z = flip - (*gpu->psRbDihedralID2)[i].z;
-        (*gpu->psRbDihedralID2)[i].w = flip - (*gpu->psRbDihedralID2)[i].w;
-    }
-    for (int i = 0; i < (int) gpu->sim.LJ14s; i++)
-    {
-        (*gpu->psLJ14ID)[i].z = flip - (*gpu->psLJ14ID)[i].z;
-        (*gpu->psLJ14ID)[i].w = flip - (*gpu->psLJ14ID)[i].w;
-    }
-    gpu->psBondID->Upload();
-    gpu->psBondAngleID1->Upload();
-    gpu->psBondAngleID2->Upload();
-    gpu->psDihedralID2->Upload();
-    gpu->psRbDihedralID2->Upload();
-    gpu->psLJ14ID->Upload();
-
-    return 1;
-}
-
-extern "C"
-int gpuBuildThreadBlockWorkList(gpuContext gpu)
-{
-    const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
-    const unsigned int grid = gpu->grid;
-    const unsigned int dim = (atoms + (grid - 1)) / grid;
-    const unsigned int cells = dim * (dim + 1) / 2;
-    CUDAStream<unsigned int>* psWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "WorkUnit");
-    unsigned int* pWorkList = psWorkUnit->_pSysData;
-    gpu->psWorkUnit = psWorkUnit;
-    gpu->sim.pWorkUnit = psWorkUnit->_pDevStream[0];
-    CUDAStream<unsigned int>* psInteractingWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "InteractingWorkUnit");
-    gpu->psInteractingWorkUnit = psInteractingWorkUnit;
-    gpu->sim.pInteractingWorkUnit = psInteractingWorkUnit->_pDevStream[0];
-    CUDAStream<unsigned int>* psInteractionFlag = new CUDAStream<unsigned int>(cells, 1u, "InteractionFlag");
-    gpu->psInteractionFlag = psInteractionFlag;
-    gpu->sim.pInteractionFlag = psInteractionFlag->_pDevStream[0];
-    CUDAStream<size_t>* psInteractionCount = new CUDAStream<size_t>(1, 1u, "InteractionCount");
-    gpu->psInteractionCount = psInteractionCount;
-    gpu->sim.pInteractionCount = psInteractionCount->_pDevStream[0];
-    CUDAStream<float4>* psGridBoundingBox = new CUDAStream<float4>(dim, 1u, "GridBoundingBox");
-    gpu->psGridBoundingBox = psGridBoundingBox;
-    gpu->sim.pGridBoundingBox = psGridBoundingBox->_pDevStream[0];
-    CUDAStream<float4>* psGridCenter = new CUDAStream<float4>(dim, 1u, "GridCenter");
-    gpu->psGridCenter = psGridCenter;
-    gpu->sim.pGridCenter = psGridCenter->_pDevStream[0];
-    gpu->sim.nonbond_workBlock      = gpu->sim.nonbond_threads_per_block / GRID;
-    gpu->sim.bornForce2_workBlock   = gpu->sim.bornForce2_threads_per_block / GRID;
-    gpu->sim.workUnits = cells;
-
-    // Initialize the plan for doing stream compaction.
-    planCompaction(gpu->compactPlan);
-
-    // Increase block count if necessary for extra large molecules that would
-    // otherwise overflow the SM workunit buffers
-//    int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM;
-//    if ((int) gpu->sim.nonbond_blocks < minimumBlocks)
-//    {
-//        gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks);
-//    }
-//    if ((int) gpu->sim.bornForce2_blocks < minimumBlocks)
-//    {
-//        gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
-//    }
-    gpu->sim.nbWorkUnitsPerBlock            = cells / gpu->sim.nonbond_blocks;
-    gpu->sim.nbWorkUnitsPerBlockRemainder   = cells - gpu->sim.nonbond_blocks * gpu->sim.nbWorkUnitsPerBlock;
-    gpu->sim.bf2WorkUnitsPerBlock           = cells / gpu->sim.bornForce2_blocks;
-    gpu->sim.bf2WorkUnitsPerBlockRemainder  = cells - gpu->sim.bornForce2_blocks * gpu->sim.bf2WorkUnitsPerBlock;
-    gpu->sim.interaction_threads_per_block = 64;
-    gpu->sim.interaction_blocks = (gpu->sim.workUnits + gpu->sim.interaction_threads_per_block - 1) / gpu->sim.interaction_threads_per_block;
-    if (gpu->sim.interaction_blocks > 8*gpu->sim.blocks)
-        gpu->sim.interaction_blocks = 8*gpu->sim.blocks;
-
-    // Decrease thread count for extra small molecules to spread computation
-    // across entire chip
-    int activeWorkUnits = gpu->sim.nonbond_blocks * gpu->sim.nonbond_workBlock;
-    if (activeWorkUnits > (int) cells)
-    {
-        int balancedWorkBlock                   = (cells + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks;
-        gpu->sim.nonbond_threads_per_block      = balancedWorkBlock * GRID;
-        gpu->sim.nonbond_workBlock              = balancedWorkBlock;
-    }
-    activeWorkUnits = gpu->sim.bornForce2_blocks * gpu->sim.bornForce2_workBlock;
-    if (activeWorkUnits > (int) cells)
-    {
-        int balancedWorkBlock                   = (cells + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks;
-        gpu->sim.bornForce2_threads_per_block   = balancedWorkBlock * GRID;
-        gpu->sim.bornForce2_workBlock           = balancedWorkBlock;
-    }
-
-    unsigned int count = 0;
-    for (unsigned int y = 0; y < dim; y++)
-    {
-        for (unsigned int x = y; x < dim; x++)
-        {
-            pWorkList[count] = (x << 17) | (y << 2);
-            count++;
-        }
-    }
-    (*gpu->psInteractionCount)[0] = gpu->sim.workUnits;
-
-    gpu->psInteractionCount->Upload();
-    psWorkUnit->Upload();
-    gpuSetConstants(gpu);
-    return cells;
-}
-
-extern "C"
-void OPENMMCUDA_EXPORT gpuBuildExclusionList(gpuContext gpu)
-{
-    const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
-    const unsigned int grid = gpu->grid;
-    const unsigned int dim = atoms/grid;
-    unsigned int* pWorkList = gpu->psWorkUnit->_pSysData;
-
-    // Mark which work units have exclusions.
-
-    for (int atom1 = 0; atom1 < (int)gpu->exclusions.size(); ++atom1)
-    {
-        int x = atom1/grid;
-        for (int j = 0; j < (int)gpu->exclusions[atom1].size(); ++j)
-        {
-            int atom2 = gpu->exclusions[atom1][j];
-            int y = atom2/grid;
-            int cell = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
-            pWorkList[cell] |= 1;
-        }
-    }
-    if ((int)gpu->sim.paddedNumberOfAtoms > gpu->natoms)
-    {
-        int lastBlock = gpu->natoms/grid;
-        for (int i = 0; i < (int)gpu->sim.workUnits; ++i)
-        {
-            int x = pWorkList[i]>>17;
-            int y = (pWorkList[i]>>2)&0x7FFF;
-            if (x == lastBlock || y == lastBlock)
-                pWorkList[i] |= 1;
-        }
-    }
-
-    // Build a list of indexes for the work units with exclusions.
-
-    CUDAStream<unsigned int>* psExclusionIndex = new CUDAStream<unsigned int>(gpu->sim.workUnits, 1u, "ExclusionIndex");
-    gpu->psExclusionIndex = psExclusionIndex;
-    unsigned int* pExclusionIndex = psExclusionIndex->_pSysData;
-    gpu->sim.pExclusionIndex = psExclusionIndex->_pDevData;
-    int numWithExclusions = 0;
-    for (int i = 0; i < (int)psExclusionIndex->_length; ++i)
-        if ((pWorkList[i]&1) == 1)
-            pExclusionIndex[i] = (numWithExclusions++)*grid;
-
-    // Record the exclusion data.
-
-    CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(numWithExclusions*grid, 1u, "Exclusion");
-    gpu->psExclusion = psExclusion;
-    unsigned int* pExclusion = psExclusion->_pSysData;
-    gpu->sim.pExclusion = psExclusion->_pDevData;
-    for (int i = 0; i < (int)psExclusion->_length; ++i)
-        pExclusion[i] = 0xFFFFFFFF;
-    for (int atom1 = 0; atom1 < (int)gpu->exclusions.size(); ++atom1)
-    {
-        int x = atom1/grid;
-        int offset1 = atom1-x*grid;
-        for (int j = 0; j < (int)gpu->exclusions[atom1].size(); ++j)
-        {
-            int atom2 = gpu->exclusions[atom1][j];
-            int y = atom2/grid;
-            int offset2 = atom2-y*grid;
-            if (x > y)
-            {
-                int cell = x+y*dim-y*(y+1)/2;
-                pExclusion[pExclusionIndex[cell]+offset1] &= 0xFFFFFFFF-(1<<offset2);
-            }
-            else
-            {
-                int cell = y+x*dim-x*(x+1)/2;
-                pExclusion[pExclusionIndex[cell]+offset2] &= 0xFFFFFFFF-(1<<offset1);
-            }
-        }
-    }
-
-    // Mark all interactions that involve a padding atom as being excluded.
-
-    for (int atom1 = gpu->natoms; atom1 < (int)atoms; ++atom1)
-    {
-        int x = atom1/grid;
-        int offset1 = atom1-x*grid;
-        for (int atom2 = 0; atom2 < (int)atoms; ++atom2)
-        {
-            int y = atom2/grid;
-            int offset2 = atom2-y*grid;
-            if (x >= y)
-            {
-                int cell = x+y*dim-y*(y+1)/2;
-                pExclusion[pExclusionIndex[cell]+offset1] &= 0xFFFFFFFF-(1<<offset2);
-            }
-            if (y >= x)
-            {
-                int cell = y+x*dim-x*(x+1)/2;
-                pExclusion[pExclusionIndex[cell]+offset2] &= 0xFFFFFFFF-(1<<offset1);
-            }
-        }
-    }
-
-    psExclusion->Upload();
-    psExclusionIndex->Upload();
-    gpu->psWorkUnit->Upload();
-    gpuSetConstants(gpu);
-}
-
-extern "C"
-int gpuSetConstants(gpuContext gpu)
-{
-    SetCalculateCDLJForcesSim(gpu);
-    SetCalculateCDLJObcGbsaForces1Sim(gpu);
-    SetCalculateCustomNonbondedForcesSim(gpu);
-    SetCalculateCustomBondForcesSim(gpu);
-    SetCalculateCustomAngleForcesSim(gpu);
-    SetCalculateCustomTorsionForcesSim(gpu);
-    SetCalculateCustomExternalForcesSim(gpu);
-    SetCalculateLocalForcesSim(gpu);
-    SetCalculateObcGbsaBornSumSim(gpu);
-    SetCalculateGBVIBornSumSim(gpu);
-    SetCalculateObcGbsaForces2Sim(gpu);
-    SetCalculateGBVIForces2Sim(gpu);
-    SetCalculateAndersenThermostatSim(gpu);
-    SetCalculatePMESim(gpu);
-    SetForcesSim(gpu);
-    SetShakeHSim(gpu);
-    SetLangevinUpdateSim(gpu);
-    SetVerletUpdateSim(gpu);
-    SetBrownianUpdateSim(gpu);
-    SetSettleSim(gpu);
-    SetCCMASim(gpu);
-    SetRandomSim(gpu);
-    return 1;
-}
-
-static void tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds)
-{
-    // Recursively tag atoms as belonging to a particular molecule.
-
-    atomMolecule[atom] = molecule;
-    for (int i = 0; i < (int)atomBonds[atom].size(); i++)
-        if (atomMolecule[atomBonds[atom][i]] == -1)
-            tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
-}
-
-static void findMoleculeGroups(gpuContext gpu)
-{
-    // First make a list of constraints for future use.
-
-    vector<Constraint> constraints;
-    for (int i = 0; i < (int)gpu->sim.ShakeConstraints; i++)
-    {
-        int atom1 = (*gpu->psShakeID)[i].x;
-        int atom2 = (*gpu->psShakeID)[i].y;
-        int atom3 = (*gpu->psShakeID)[i].z;
-        int atom4 = (*gpu->psShakeID)[i].w;
-        float distance2 = (*gpu->psShakeParameter)[i].z;
-        constraints.push_back(Constraint(atom1, atom2, distance2));
-        if (atom3 != -1)
-            constraints.push_back(Constraint(atom1, atom3, distance2));
-        if (atom4 != -1)
-            constraints.push_back(Constraint(atom1, atom4, distance2));
-    }
-    for (int i = 0; i < (int)gpu->sim.settleConstraints; i++)
-    {
-        int atom1 = (*gpu->psSettleID)[i].x;
-        int atom2 = (*gpu->psSettleID)[i].y;
-        int atom3 = (*gpu->psSettleID)[i].z;
-        float distance12 = (*gpu->psSettleParameter)[i].x;
-        float distance23 = (*gpu->psSettleParameter)[i].y;
-        constraints.push_back(Constraint(atom1, atom2, distance12*distance12));
-        constraints.push_back(Constraint(atom1, atom3, distance12*distance12));
-        constraints.push_back(Constraint(atom2, atom3, distance23*distance23));
-    }
-    for (int i = 0; i < (int)gpu->sim.ccmaConstraints; i++)
-    {
-        int atom1 = (*gpu->psCcmaAtoms)[i].x;
-        int atom2 = (*gpu->psCcmaAtoms)[i].y;
-        float distance2 = (*gpu->psCcmaDistance)[i].w;
-        constraints.push_back(Constraint(atom1, atom2, distance2));
-    }
-
-    // First make a list of every other atom to which each atom is connect by a bond, constraint, or exclusion.
-
-    int numAtoms = gpu->natoms;
-    vector<vector<int> > atomBonds(numAtoms);
-    for (int i = 0; i < (int) gpu->forces.size(); i++) {
-        for (int j = 0; j < gpu->forces[i]->getNumParticleGroups(); j++) {
-            vector<int> particles;
-            gpu->forces[i]->getParticlesInGroup(j, particles);
-            for (int k = 0; k < (int) particles.size(); k++)
-                for (int m = 0; m < (int) particles.size(); m++)
-                    if (k != m)
-                        atomBonds[particles[k]].push_back(particles[m]);
-        }
-    }
-    for (int i = 0; i < (int)constraints.size(); i++)
-    {
-        int atom1 = constraints[i].atom1;
-        int atom2 = constraints[i].atom2;
-        atomBonds[atom1].push_back(atom2);
-        atomBonds[atom2].push_back(atom1);
-    }
-
-    // Now tag atoms by which molecule they belong to.
-
-    vector<int> atomMolecule(numAtoms, -1);
-    int numMolecules = 0;
-    for (int i = 0; i < numAtoms; i++)
-        if (atomMolecule[i] == -1)
-            tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
-    vector<vector<int> > atomIndices(numMolecules);
-    for (int i = 0; i < numAtoms; i++)
-        atomIndices[atomMolecule[i]].push_back(i);
-
-    // Construct a description of each molecule.
-
-    vector<Molecule> molecules(numMolecules);
-    for (int i = 0; i < numMolecules; i++)
-    {
-        molecules[i].atoms = atomIndices[i];
-        molecules[i].groups.resize(gpu->forces.size());
-    }
-    for (int i = 0; i < (int) gpu->forces.size(); i++)
-        for (int j = 0; j < gpu->forces[i]->getNumParticleGroups(); j++)
-        {
-            vector<int> particles;
-            gpu->forces[i]->getParticlesInGroup(j, particles);
-            molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
-        }
-    for (int i = 0; i < (int)constraints.size(); i++)
-    {
-        molecules[atomMolecule[constraints[i].atom1]].constraints.push_back(i);
-    }
-
-    // Sort them into groups of identical molecules.
-
-    vector<Molecule> uniqueMolecules;
-    vector<vector<int> > moleculeInstances;
-    for (int molIndex = 0; molIndex < (int)molecules.size(); molIndex++)
-    {
-        Molecule& mol = molecules[molIndex];
-
-        // See if it is identical to another molecule.
-
-        bool isNew = true;
-        for (int j = 0; j < (int)uniqueMolecules.size() && isNew; j++)
-        {
-            Molecule& mol2 = uniqueMolecules[j];
-            bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
-
-            // See if the atoms are identical.
-
-            int atomOffset = mol2.atoms[0]-mol.atoms[0];
-            float4* velm = gpu->psVelm4->_pSysData;
-            for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
-                if (mol.atoms[i] != mol2.atoms[i]-atomOffset || velm[mol.atoms[i]].w != velm[mol2.atoms[i]].w)
-                    identical = false;
-                for (int k = 0; k < (int) gpu->forces.size(); k++)
-                    if (!gpu->forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
-                        identical = false;
-            }
-
-            // See if the constraints are identical.
-
-            for (int i = 0; i < (int) mol.constraints.size() && identical; i++)
-                if (constraints[mol.constraints[i]].atom1 != constraints[mol2.constraints[i]].atom1-atomOffset ||
-                        constraints[mol.constraints[i]].atom2 != constraints[mol2.constraints[i]].atom2-atomOffset ||
-                        constraints[mol.constraints[i]].distance2 != constraints[mol2.constraints[i]].distance2)
-                    identical = false;
-
-            // See if the force groups are identical.
-
-            for (int i = 0; i < (int) gpu->forces.size() && identical; i++)
-            {
-                if (mol.groups[i].size() != mol2.groups[i].size())
-                    identical = false;
-                for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
-                    if (!gpu->forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
-                        identical = false;
-            }
-            if (identical)
-            {
-                moleculeInstances[j].push_back(mol.atoms[0]);
-                isNew = false;
-            }
-        }
-        if (isNew)
-        {
-            uniqueMolecules.push_back(mol);
-            moleculeInstances.push_back(vector<int>());
-            moleculeInstances[moleculeInstances.size()-1].push_back(mol.atoms[0]);
-        }
-    }
-    gpu->moleculeGroups.resize(moleculeInstances.size());
-    for (int i = 0; i < (int)moleculeInstances.size(); i++)
-    {
-        gpu->moleculeGroups[i].instances = moleculeInstances[i];
-        vector<int>& atoms = uniqueMolecules[i].atoms;
-        gpu->moleculeGroups[i].atoms.resize(atoms.size());
-        for (int j = 0; j < (int)atoms.size(); j++)
-            gpu->moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
-    }
-}
-
-extern "C"
-void gpuReorderAtoms(gpuContext gpu)
-{
-    if (gpu->natoms == 0 || gpu->sim.nonbondedCutoffSqr == 0.0)
-        return;
-    if (gpu->moleculeGroups.size() == 0)
-        findMoleculeGroups(gpu);
-
-    // Find the range of positions and the number of bins along each axis.
-
-    int numAtoms = gpu->natoms;
-    gpu->psPosq4->Download();
-    gpu->psVelm4->Download();
-    float4* posq = gpu->psPosq4->_pSysData;
-    float4* velm = gpu->psVelm4->_pSysData;
-    float minx = posq[0].x, maxx = posq[0].x;
-    float miny = posq[0].y, maxy = posq[0].y;
-    float minz = posq[0].z, maxz = posq[0].z;
-    if (gpu->sim.nonbondedMethod == PERIODIC || gpu->sim.nonbondedMethod == EWALD || gpu->sim.nonbondedMethod == PARTICLE_MESH_EWALD)
-    {
-        minx = miny = minz = 0.0;
-        maxx = gpu->sim.periodicBoxSizeX;
-        maxy = gpu->sim.periodicBoxSizeY;
-        maxz = gpu->sim.periodicBoxSizeZ;
-    }
-    else
-    {
-        for (int i = 1; i < numAtoms; i++)
-        {
-            minx = min(minx, posq[i].x);
-            maxx = max(maxx, posq[i].x);
-            miny = min(miny, posq[i].y);
-            maxy = max(maxy, posq[i].y);
-            minz = min(minz, posq[i].z);
-            maxz = max(maxz, posq[i].z);
-        }
-    }
-
-    // Loop over each group of identical molecules and reorder them.
-
-    vector<int> originalIndex(numAtoms);
-    vector<float4> newPosq(numAtoms);
-    vector<float4> newVelm(numAtoms);
-    vector<int3> newCellOffsets(numAtoms);
-    for (int group = 0; group < (int)gpu->moleculeGroups.size(); group++)
-    {
-        // Find the center of each molecule.
-
-        gpuMoleculeGroup& mol = gpu->moleculeGroups[group];
-        int numMolecules = mol.instances.size();
-        vector<int>& atoms = mol.atoms;
-        vector<float3> molPos(numMolecules);
-        for (int i = 0; i < numMolecules; i++)
-        {
-            molPos[i].x = 0.0f;
-            molPos[i].y = 0.0f;
-            molPos[i].z = 0.0f;
-            for (int j = 0; j < (int)atoms.size(); j++)
-            {
-                int atom = atoms[j]+mol.instances[i];
-                molPos[i].x += posq[atom].x;
-                molPos[i].y += posq[atom].y;
-                molPos[i].z += posq[atom].z;
-            }
-            molPos[i].x /= atoms.size();
-            molPos[i].y /= atoms.size();
-            molPos[i].z /= atoms.size();
-        }
-        if (gpu->sim.nonbondedMethod == PERIODIC || gpu->sim.nonbondedMethod == EWALD || gpu->sim.nonbondedMethod == PARTICLE_MESH_EWALD)
-        {
-            // Move each molecule position into the same box.
-
-            for (int i = 0; i < numMolecules; i++)
-            {
-                int xcell = (int) floor(molPos[i].x/gpu->sim.periodicBoxSizeX);
-                int ycell = (int) floor(molPos[i].y/gpu->sim.periodicBoxSizeY);
-                int zcell = (int) floor(molPos[i].z/gpu->sim.periodicBoxSizeZ);
-                float dx = xcell*gpu->sim.periodicBoxSizeX;
-                float dy = ycell*gpu->sim.periodicBoxSizeY;
-                float dz = zcell*gpu->sim.periodicBoxSizeZ;
-                if (dx != 0.0f || dy != 0.0f || dz != 0.0f)
-                {
-                    molPos[i].x -= dx;
-                    molPos[i].y -= dy;
-                    molPos[i].z -= dz;
-                    for (int j = 0; j < (int)atoms.size(); j++)
-                    {
-                        int atom = atoms[j]+mol.instances[i];
-                        posq[atom].x -= dx;
-                        posq[atom].y -= dy;
-                        posq[atom].z -= dz;
-                        gpu->posCellOffsets[atom].x -= xcell;
-                        gpu->posCellOffsets[atom].y -= ycell;
-                        gpu->posCellOffsets[atom].z -= zcell;
-                    }
-                }
-            }
-        }
-
-        // Select a bin for each molecule, then sort them by bin.
-
-        bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
-        float binWidth;
-        if (useHilbert)
-            binWidth = (float)(max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
-        else
-            binWidth = (float)(0.2*sqrt(gpu->sim.nonbondedCutoffSqr));
-        int xbins = 1 + (int) ((maxx-minx)/binWidth);
-        int ybins = 1 + (int) ((maxy-miny)/binWidth);
-        vector<pair<int, int> > molBins(numMolecules);
-        bitmask_t coords[3];
-        for (int i = 0; i < numMolecules; i++)
-        {
-            int x = (int) ((molPos[i].x-minx)/binWidth);
-            int y = (int) ((molPos[i].y-miny)/binWidth);
-            int z = (int) ((molPos[i].z-minz)/binWidth);
-            int bin;
-            if (useHilbert)
-            {
-                coords[0] = x;
-                coords[1] = y;
-                coords[2] = z;
-                bin = (int) hilbert_c2i(3, 8, coords);
-            }
-            else
-            {
-                int yodd = y&1;
-                int zodd = z&1;
-                bin = z*xbins*ybins;
-                bin += (zodd ? ybins-y : y)*xbins;
-                bin += (yodd ? xbins-x : x);
-            }
-            molBins[i] = pair<int, int>(bin, i);
-        }
-        sort(molBins.begin(), molBins.end());
-
-        // Reorder the atoms.
-
-        for (int i = 0; i < numMolecules; i++)
-        {
-            for (int j = 0; j < (int)atoms.size(); j++)
-            {
-                int oldIndex = mol.instances[molBins[i].second]+atoms[j];
-                int newIndex = mol.instances[i]+atoms[j];
-                originalIndex[newIndex] = (*gpu->psAtomIndex)[oldIndex];
-                newPosq[newIndex] = posq[oldIndex];
-                newVelm[newIndex] = velm[oldIndex];
-                newCellOffsets[newIndex] = gpu->posCellOffsets[oldIndex];
-            }
-        }
-    }
-
-    // Update the streams.
-
-    for (int i = 0; i < numAtoms; i++) {
-        posq[i] = newPosq[i];
-        velm[i] = newVelm[i];
-        (*gpu->psAtomIndex)[i] = originalIndex[i];
-        gpu->posCellOffsets[i] = newCellOffsets[i];
-    }
-    gpu->psPosq4->Upload();
-    gpu->psVelm4->Upload();
-    gpu->psAtomIndex->Upload();
-}
--- a/platforms/cuda-old/src/kernels/gputypes.h
+++ b/platforms/cuda-old/src/kernels/gputypes.h
-#ifndef __GPUTYPES_H__
-#define __GPUTYPES_H__
-
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "cudatypes.h"
-#include "cudaCompact.h"
-#include <vector>
-#include "windowsExportCuda.h"
-
-namespace OpenMM {
-    class CudaForceInfo;
-}
-
-struct gpuAtomType {
-    std::string name;
-    char symbol;
-    float r;
-};
-
-struct gpuMoleculeGroup {
-    std::vector<int> atoms;
-    std::vector<int> instances;
-};
-
-struct gpuTabulatedFunction {
-    gpuTabulatedFunction() : coefficients(NULL) {
-    }
-    std::string name;
-    double min, max;
-    CUDAStream<float4>* coefficients;
-};
-
-enum SM_VERSION
-{
-    SM_10,
-    SM_11,
-    SM_12,
-    SM_20
-};
-
-
-/* Pointer to this structure will be given 
- * to gromacs functions*/
-struct _gpuContext {
-    
-    //Cache this here so that it doesn't
-    //have to be repeatedly passed around
-    int natoms;
-    int device;
-    bool useBlockingSync;
-    gpuAtomType* gpAtomTable;
-    int gAtomTypes;
-    unsigned int blocksPerSM;
-    unsigned int sharedMemoryPerBlock;
-    cudaGmxSimulation sim;
-    unsigned int* pOutputBufferCounter;
-    std::vector<OpenMM::CudaForceInfo*> forces;
-    std::vector<std::vector<int> > exclusions;
-    unsigned char* pAtomSymbol;
-    std::vector<gpuMoleculeGroup> moleculeGroups;
-    gpuTabulatedFunction tabulatedFunctions[MAX_TABULATED_FUNCTIONS];
-    std::vector<int3> posCellOffsets;
-    int iterations;
-    float epsfac;
-    float solventDielectric;
-    float soluteDielectric;
-    int grid;
-    bool bCalculateCM;
-    bool bRemoveCM;
-    bool bRecalculateBornRadii;
-    bool bOutputBufferPerWarp;
-    bool bIncludeGBSA;
-    bool bIncludeGBVI;
-    bool tabulatedFunctionsChanged;
-    unsigned long seed;
-    SM_VERSION sm_version;
-    compactionPlan compactPlan;
-    cufftHandle fftplan;
-    CUDAStream<float4>* psPosq4;
-    CUDAStream<float4>* psPosqP4;
-    CUDAStream<float4>* psOldPosq4;
-    CUDAStream<float4>* psVelm4;
-    CUDAStream<float4>* psForce4;
-    CUDAStream<float>*  psEnergy;           // Energy output buffer
-    CUDAStream<float2>* psSigEps2; 
-    CUDAStream<float4>* psCustomParams;     // Atom parameters for custom nonbonded force
-    CUDAStream<int4>* psCustomBondID;             // Atom indices for custom bonds
-    CUDAStream<float4>* psCustomBondParams;       // Parameters for custom bonds
-    CUDAStream<int4>* psCustomAngleID1;           // Atom indices for custom angles
-    CUDAStream<int2>* psCustomAngleID2;           // Atom indices for custom angles
-    CUDAStream<float4>* psCustomAngleParams;      // Parameters for custom angles
-    CUDAStream<int4>* psCustomTorsionID1;           // Atom indices for custom torsions
-    CUDAStream<int4>* psCustomTorsionID2;           // Atom indices for custom torsions
-    CUDAStream<float4>* psCustomTorsionParams;      // Parameters for custom torsions
-    CUDAStream<int>* psCustomExternalID;          // Atom indices for custom external force
-    CUDAStream<float4>* psCustomExternalParams;   // Parameters for custom external force
-    CUDAStream<float4>* psTabulatedFunctionParams; // The min, max, and spacing for each tabulated function
-    CUDAStream<float2>* psEwaldCosSinSum;
-    CUDAStream<float>* psTabulatedErfc;     // Tabulated values for erfc()
-    CUDAStream<cufftComplex>* psPmeGrid;    // Grid points for particle mesh Ewald
-    CUDAStream<float>* psPmeBsplineModuli[3];
-    CUDAStream<float4>* psPmeBsplineTheta;
-    CUDAStream<float4>* psPmeBsplineDtheta;
-    CUDAStream<int>* psPmeAtomRange;           // The range of sorted atoms at each grid point
-    CUDAStream<int2>* psPmeAtomGridIndex;      // The grid point each atom is at
-    CUDAStream<float2>* psObcData;
-    CUDAStream<float4>* psGBVIData;
-    CUDAStream<float>* psGBVISwitchDerivative;
-    CUDAStream<float>* psObcChain;
-    CUDAStream<float>* psBornForce;
-    CUDAStream<float>* psBornRadii;
-    CUDAStream<float>* psBornSum;
-    CUDAStream<int4>* psBondID;
-    CUDAStream<float2>* psBondParameter;
-    CUDAStream<int4>* psBondAngleID1;
-    CUDAStream<int2>* psBondAngleID2;
-    CUDAStream<float2>* psBondAngleParameter;
-    CUDAStream<int4>* psDihedralID1;
-    CUDAStream<int4>* psDihedralID2;
-    CUDAStream<float4>* psDihedralParameter;
-    CUDAStream<int4>* psRbDihedralID1;
-    CUDAStream<int4>* psRbDihedralID2;
-    CUDAStream<float4>* psRbDihedralParameter1;
-    CUDAStream<float2>* psRbDihedralParameter2;
-    CUDAStream<int4>* psLJ14ID;
-    CUDAStream<float4>* psLJ14Parameter;
-    CUDAStream<int4>* psShakeID;
-    CUDAStream<float4>* psShakeParameter;
-    CUDAStream<int4>* psSettleID;
-    CUDAStream<float2>* psSettleParameter;
-    CUDAStream<unsigned int>* psExclusion;
-    CUDAStream<unsigned int>* psExclusionIndex;
-    CUDAStream<unsigned int>* psWorkUnit;
-    CUDAStream<unsigned int>* psInteractingWorkUnit;
-    CUDAStream<unsigned int>* psInteractionFlag;
-    CUDAStream<size_t>* psInteractionCount;
-    CUDAStream<float2>* psStepSize;         // The size of the previous and current time steps
-    CUDAStream<float>* psLangevinParameters;// Parameters used for Langevin integration
-    CUDAStream<float4>* psRandom4;          // Pointer to sets of 4 random numbers for MD integration
-    CUDAStream<float2>* psRandom2;          // Pointer to sets of 2 random numbers for MD integration
-    CUDAStream<uint4>* psRandomSeed;        // Pointer to each random seed
-    CUDAStream<int>* psRandomPosition;      // Pointer to random number positions
-    CUDAStream<float4>* psLinearMomentum;   // Pointer to total linear momentum per CTA
-    CUDAStream<int>* psAtomIndex;           // The original index of each atom
-    CUDAStream<float4>* psGridBoundingBox;  // The size of each grid cell
-    CUDAStream<float4>* psGridCenter;       // The center and radius for each grid cell
-    CUDAStream<int2>* psCcmaAtoms;          // The atoms connected by each CCMA constraint
-    CUDAStream<float4>* psCcmaDistance;     // The displacement vector (x, y, z) and constraint distance (w) for each CCMA constraint
-    CUDAStream<int>* psCcmaAtomConstraints; // The indices of constraints involving each atom
-    CUDAStream<int>* psCcmaNumAtomConstraints; // The number of constraints involving each atom
-    CUDAStream<float>* psCcmaDelta1;        // Workspace for CCMA
-    CUDAStream<float>* psCcmaDelta2;        // Workspace for CCMA
-    int* ccmaConvergedHostMarker;           // Host memory used to communicate that CCMA has converged
-    cudaEvent_t ccmaEvent;                  // Used to optimize communication during CCMA
-    CUDAStream<float>* psCcmaReducedMass;   // The reduced mass for each CCMA constraint
-    CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster
-    CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
-    CUDAStream<unsigned int>* psRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.
-    CUDAStream<unsigned int>* psConstraintMatrixColumn; // The column of each element in the constraint matrix.
-    CUDAStream<float>* psConstraintMatrixValue; // The value of each element in the constraint matrix.
-};
-
-typedef struct _gpuContext *gpuContext;
-
-
-// Function prototypes
-extern "C"
-bool OPENMMCUDA_EXPORT gpuIsAvailable();
-
-extern "C"
-void gpuSetBondParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& length, const std::vector<float>& k);
-
-extern "C"
-void gpuSetBondAngleParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3,
-        const std::vector<float>& angle, const std::vector<float>& k);
-
-extern "C"
-void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
-        const std::vector<float>& k, const std::vector<float>& phase, const std::vector<int>& periodicity);
-
-extern "C"
-void gpuSetRbDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
-        const std::vector<float>& c0, const std::vector<float>& c1, const std::vector<float>& c2, const std::vector<float>& c3, const std::vector<float>& c4, const std::vector<float>& c5);
-
-extern "C"
-void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::vector<int>& atom1, const std::vector<int>& atom2,
-        const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);
-
-extern "C"
-void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q,
-        const std::vector<char>& symbol, const std::vector<std::vector<int> >& exclusions, CudaNonbondedMethod method);
-
-extern "C"
-void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric);
-
-extern "C"
-void gpuSetTabulatedFunction(gpuContext gpu, int index, const std::string& name, const std::vector<double>& values, double min, double max);
-
-extern "C"
-void gpuSetCustomBondParameters(gpuContext gpu, const std::vector<int>& bondAtom1, const std::vector<int>& bondAtom2, const std::vector<std::vector<double> >& bondParams,
-            const std::string& energyExp, const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
-
-extern "C"
-void gpuSetCustomAngleParameters(gpuContext gpu, const std::vector<int>& angleAtom1, const std::vector<int>& angleAtom2, const std::vector<int>& angleAtom3, const std::vector<std::vector<double> >& angleParams,
-            const std::string& energyExp, const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
-
-extern "C"
-void gpuSetCustomTorsionParameters(gpuContext gpu, const std::vector<int>& torsionAtom1, const std::vector<int>& torsionAtom2, const std::vector<int>& torsionAtom3, const std::vector<int>& torsionAtom4, const std::vector<std::vector<double> >& torsionParams,
-            const std::string& energyExp, const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
-
-extern "C"
-void gpuSetCustomExternalParameters(gpuContext gpu, const std::vector<int>& atomIndex, const std::vector<std::vector<double> >& atomParams,
-            const std::string& energyExp, const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
-
-extern "C"
-void gpuSetCustomNonbondedParameters(gpuContext gpu, const std::vector<std::vector<double> >& parameters, const std::vector<std::vector<int> >& exclusions,
-            CudaNonbondedMethod method, float cutoffDistance, const std::string& energyExp,
-            const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
-
-extern "C"
-void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz);
-
-extern "C"
-void gpuSetPMEParameters(gpuContext gpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ);
-
-extern "C"
-void OPENMMCUDA_EXPORT gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize);
-
-extern "C"
-void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<float>& radius, const std::vector<float>& scale, const std::vector<float>& charge);
-
-extern "C" 
-void gpuSetGBVIParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius,
-                          const std::vector<float>& gammas, const std::vector<float>& scaledRadii,
-                          int bornRadiusScalingMethod, float quinticLowerLimitFactor, float quinticUpperBornRadiusLimit);
-
-extern "C"
-void gpuSetConstraintParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& distance,
-        const std::vector<float>& invMass1, const std::vector<float>& invMass2, float constraintTolerance);
-
-extern "C"
-int gpuAllocateInitialBuffers(gpuContext gpu);
-
-extern "C"
-void gpuSetPositions(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);
-
-extern "C"
-void gpuSetVelocities(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);
-
-extern "C"
-void gpuSetMass(gpuContext gpu, const std::vector<float>& mass);
-
-extern "C"
-void OPENMMCUDA_EXPORT gpuInitializeRandoms(gpuContext gpu);
-
-extern "C"
-OPENMMCUDA_EXPORT void* gpuInit(int numAtoms, unsigned int device = 0, bool useBlockingSync = false);
-
-extern "C"
-void gpuSetLangevinIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature, float errorTol);
-
-extern "C"
-void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT, float errorTol);
-
-extern "C"
-void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature);
-
-extern "C"
-void gpuSetAndersenThermostatParameters(gpuContext gpu, float temperature, float collisionFrequency);
-
-extern "C"
-void gpuShutDown(gpuContext gpu);
-
-extern "C"
-int gpuBuildOutputBuffers(gpuContext gpu);
-
-extern "C"
-int gpuBuildThreadBlockWorkList(gpuContext gpu);
-
-extern "C"
-void OPENMMCUDA_EXPORT gpuBuildExclusionList(gpuContext gpu);
-
-extern "C"
-int OPENMMCUDA_EXPORT gpuSetConstants(gpuContext gpu);
-
-extern "C"
-void gpuReorderAtoms(gpuContext gpu);
-
-extern "C"
-void OPENMMCUDA_EXPORT setExclusions(gpuContext gpu, const std::vector<std::vector<int> >& exclusions);
-
-#endif //__GPUTYPES_H__