Deleted the old CUDA platform

2e451b9d · Peter Eastman · 352e2fc7 · 352e2fc7 · 352e2fc7 · 352e2fc7
Commit 2e451b9d authored Dec 13, 2012 by Peter Eastman
20 changed files
--- a/platforms/cuda-old/CMakeLists.txt
+++ b/platforms/cuda-old/CMakeLists.txt
-#---------------------------------------------------
-# OpenMM CUDA Platform
-#
-# Creates OpenMM library, base name=OpenMMCuda.
-# Default libraries are shared & optimized. Variants
-# are created for static (_static) and debug (_d).
-#
-# Windows:
-#   OpenMMCuda[_d].dll
-#   OpenMMCuda[_d].lib
-#   OpenMMCuda_static[_d].lib
-# Unix:
-#   libOpenMMCuda[_d].so
-#   libOpenMMCuda_static[_d].a
-#----------------------------------------------------
-
-# Cached switch (TRUE unless the user overrides it) controlling whether the
-# CUDA test cases in the tests directory are added to the build.
-SET(OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL "Whether to build CUDA test cases")
-IF (OPENMM_BUILD_CUDA_TESTS)
-    SUBDIRS(tests)
-ENDIF ()
-
-# The source is organized into subdirectories, but we handle them all from
-# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
-SET(OPENMM_SOURCE_SUBDIRS .)
-
-
-# Base name of the CUDA platform library, and the names of the shared and
-# static targets derived from it.
-
-SET(OPENMMCUDA_LIBRARY_NAME OpenMMCuda)
-
-SET(SHARED_TARGET ${OPENMMCUDA_LIBRARY_NAME})
-SET(STATIC_TARGET ${OPENMMCUDA_LIBRARY_NAME}_static)
-
-
-# Ensure that debug libraries have "_d" appended to their names.
-# CMake gets this right on Windows automatically with this definition.
-# The variable is named rather than expanded: IF() dereferences a bare name
-# itself, whereas an expanded ${CMAKE_GENERATOR} would be handed over as an
-# unquoted word and could be dereferenced a second time.
-IF (CMAKE_GENERATOR MATCHES "Visual Studio")
-    SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
-ENDIF ()
-
-# But on Unix or Cygwin we have to add the suffix manually
-IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
-    SET(SHARED_TARGET ${SHARED_TARGET}_d)
-    SET(STATIC_TARGET ${STATIC_TARGET}_d)
-ENDIF ()
-
-
-# Directories searched for the header files that make up the API: every
-# source subdirectory contributes its include and include/internal folders.
-SET(API_INCLUDE_DIRS) # start empty
-FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
-    LIST(APPEND API_INCLUDE_DIRS
-         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include
-         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
-ENDFOREACH(subdir)
-
-# Build two parallel lists of API headers from the API include directories:
-#   API_ABS_INCLUDE_FILES - the paths exactly as FILE(GLOB) reports them
-#   API_REL_INCLUDE_FILES - each header's file name joined back onto the
-#                           API include directory it was found in
-SET(API_REL_INCLUDE_FILES)   # both lists start out empty
-SET(API_ABS_INCLUDE_FILES)
-
-FOREACH(dir ${API_INCLUDE_DIRS})
-    # Every header directly inside this directory, as a full pathname.
-    FILE(GLOB fullpaths ${dir}/*.h)
-    LIST(APPEND API_ABS_INCLUDE_FILES ${fullpaths})
-
-    FOREACH(pathname ${fullpaths})
-        GET_FILENAME_COMPONENT(filename ${pathname} NAME)
-        LIST(APPEND API_REL_INCLUDE_FILES ${dir}/${filename})
-    ENDFOREACH(pathname)
-ENDFOREACH(dir)
-
-# Gather the implementation files of every source subdirectory: .cpp and .c
-# files anywhere below src (recursive), plus the headers directly inside src.
-# Each subdirectory's include folder is also put at the front of the
-# include path.
-SET(SOURCE_FILES) # empty
-SET(SOURCE_INCLUDE_FILES)
-
-FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
-    FILE(GLOB_RECURSE src_files
-         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp
-         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.c)
-    FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
-    LIST(APPEND SOURCE_FILES ${src_files})
-    LIST(APPEND SOURCE_INCLUDE_FILES ${incl_files})
-    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
-ENDFOREACH(subdir)
-
-INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
-
-# The shared library target is set up by the sharedTarget subdirectory.
-SUBDIRS(sharedTarget)
--- a/platforms/cuda-old/include/CudaKernelFactory.h
+++ b/platforms/cuda-old/include/CudaKernelFactory.h
-#ifndef OPENMM_CUDAKERNELFACTORY_H_
-#define OPENMM_CUDAKERNELFACTORY_H_
-
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "openmm/KernelFactory.h"
-#include "windowsExportCuda.h"
-
-namespace OpenMM {
-
-/**
- * This KernelFactory creates all kernels for CudaPlatform.
- */
-
-class CudaKernelFactory : public KernelFactory {
-public:
-    // Create the kernel implementation registered under the given kernel name.
-    //
-    // name:     the name of the kernel to create
-    // platform: the Platform the kernel is created for
-    // context:  the context the kernel belongs to; the implementation reads the
-    //           CudaPlatform::PlatformData stored as its platform data
-    //
-    // Returns a newly allocated KernelImpl. The implementation throws an
-    // OpenMMException when the name does not match any supported kernel.
-    OPENMMCUDA_EXPORT KernelImpl* createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const;
-};
-
-} // namespace OpenMM
-
-#endif /*OPENMM_CUDAKERNELFACTORY_H_*/
--- a/platforms/cuda-old/include/CudaPlatform.h
+++ b/platforms/cuda-old/include/CudaPlatform.h
-#ifndef OPENMM_CUDAPLATFORM_H_
-#define OPENMM_CUDAPLATFORM_H_
-
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "openmm/Platform.h"
-#include "windowsExportCuda.h"
-
-struct _gpuContext;
-
-namespace OpenMM {
-
-/**
- * This Platform subclass uses CUDA implementations of the OpenMM kernels to run on NVidia GPUs.
- */
-
-class OPENMMCUDA_EXPORT CudaPlatform : public Platform {
-public:
-    // Per-context data for this platform; defined after this class.
-    class PlatformData;
-    // Registers a CudaKernelFactory for every supported kernel and declares the
-    // CudaDevice and CudaUseBlockingSync properties with their default values.
-    CudaPlatform();
-    // The name of this platform: "Cuda".
-    const std::string& getName() const {
-        static const std::string name = "Cuda";
-        return name;
-    }
-    // The speed value reported for this platform (a constant 50).
-    double getSpeed() const {
-        return 50;
-    }
-    // The implementation in CudaPlatform.cpp returns false.
-    bool supportsDoublePrecision() const;
-    // Returns the value stored for the property in the context's PlatformData,
-    // or falls back to Platform::getPropertyValue() when none is stored.
-    const std::string& getPropertyValue(const Context& context, const std::string& property) const;
-    // The implementation in CudaPlatform.cpp has an empty body.
-    void setPropertyValue(Context& context, const std::string& property, const std::string& value) const;
-    // Throws an OpenMMException if the context's System contains a virtual site
-    // or a Force whose force group is not 0.
-    void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
-    // NOTE(review): presumably releases what contextCreated() set up; the
-    // implementation is not shown here - confirm.
-    void contextDestroyed(ContextImpl& context) const;
-    /**
-     * This is the name of the parameter for selecting which CUDA device to use.
-     */
-    static const std::string& CudaDevice() {
-        static const std::string key = "CudaDevice";
-        return key;
-    }
-    /**
-     * This is the name of the parameter for selecting whether CUDA should sync or spin loop while waiting for results.
-     */
-    static const std::string& CudaUseBlockingSync() {
-        static const std::string key = "CudaUseBlockingSync";
-        return key;
-    }
-};
-
-// Data that CudaPlatform associates with a context. CudaKernelFactory obtains
-// it from ContextImpl::getPlatformData() and hands it to the kernels it creates.
-class CudaPlatform::PlatformData {
-public:
-    OPENMMCUDA_EXPORT PlatformData(_gpuContext* gpu);
-    // The GPU context (opaque here; only forward-declared in this header).
-    _gpuContext* gpu;
-    // NOTE(review): the flags and counters below look like bookkeeping shared
-    // between the kernels (which force types are present, step counts, ...);
-    // their exact meaning is not visible in this header - confirm against
-    // CudaKernels.cpp.
-    bool removeCM;
-    bool hasBonds, hasAngles, hasPeriodicTorsions, hasRB, hasNonbonded, hasCustomNonbonded;
-    int nonbondedMethod, customNonbondedMethod;
-    int cmMotionFrequency;
-    int stepCount, computeForceCount;
-    double time, ewaldSelfEnergy, dispersionCoefficient;
-    // Property values for this context; consulted by CudaPlatform::getPropertyValue().
-    std::map<std::string, std::string> propertyValues;
-};
-
-} // namespace OpenMM
-
-#endif /*OPENMM_CUDAPLATFORM_H_*/
--- a/platforms/cuda-old/include/windowsExportCuda.h
+++ b/platforms/cuda-old/include/windowsExportCuda.h
-#ifndef OPENMM_WINDOWSEXPORTCUDA_H_
-#define OPENMM_WINDOWSEXPORTCUDA_H_
-
-/*
- * Shared libraries are messy in Visual Studio. We have to distinguish three
- * cases:
- *   (1) this header is being used to build the OpenMM shared library
- *       (dllexport)
- *   (2) this header is being used by a *client* of the OpenMM shared
- *       library (dllimport)
- *   (3) we are building the OpenMM static library, or the client is
- *       being compiled with the expectation of linking with the
- *       OpenMM static library (nothing special needed)
- * In the CMake script for building this library, we define one of the symbols
- *     OPENMMCUDA_BUILDING_{SHARED|STATIC}_LIBRARY
- * Client code normally has no special symbol defined, in which case we'll
- * assume it wants to use the shared library. However, if the client defines
- * the symbol OPENMMCUDA_USE_STATIC_LIBRARIES we'll suppress the dllimport so
- * that the client code can be linked with static libraries. Note that
- * the library symbols affect only this library, meaning that other
- * libraries can be clients of this one. However, we are assuming
- * all-static or all-shared.
- */
-
-#if defined(_MSC_VER)
-    // Silence warning C4996 (e.g. sprintf being reported as "unsafe").
-    #pragma warning(disable:4996)
-    // Silence warning C4251 (private members lacking a dll export).
-    #pragma warning(disable:4251)
-    #if defined(OPENMMCUDA_BUILDING_SHARED_LIBRARY)
-        // Building the shared library itself.
-        #define OPENMMCUDA_EXPORT __declspec(dllexport)
-    #elif defined(OPENMMCUDA_BUILDING_STATIC_LIBRARY) || defined(OPENMMCUDA_USE_STATIC_LIBRARIES)
-        // Building the static library, or a client linking against it.
-        #define OPENMMCUDA_EXPORT
-    #else
-        // A client of the shared library.
-        #define OPENMMCUDA_EXPORT __declspec(dllimport)
-    #endif
-#else
-    // Linux, Mac: no decoration needed.
-    #define OPENMMCUDA_EXPORT
-#endif
-
-#endif // OPENMM_WINDOWSEXPORTCUDA_H_
--- a/platforms/cuda-old/sharedTarget/CMakeLists.txt
+++ b/platforms/cuda-old/sharedTarget/CMakeLists.txt
-#
-# Include CUDA related files.
-#
-INCLUDE_DIRECTORIES(${CUDA_INCLUDE})
-LINK_DIRECTORIES(${CUDA_TARGET_LINK})
-
-# This file lives in the sharedTarget subdirectory of the platform, so the
-# platform's own source tree is the parent directory. Deriving it from
-# CMAKE_CURRENT_SOURCE_DIR (instead of hard-coding platforms/cuda) makes the
-# .cu files and include paths come from the same platform directory as the
-# .cpp files collected by the parent CMakeLists.txt.
-GET_FILENAME_COMPONENT(OPENMMCUDA_PLATFORM_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)
-
-# Add the CUDA sources (directly in src and one directory level below it) to
-# the source list inherited from the parent directory.
-FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
-    FILE(GLOB src_files ${OPENMMCUDA_PLATFORM_DIR}/${subdir}/src/*.cu  ${OPENMMCUDA_PLATFORM_DIR}/${subdir}/src/*/*.cu)
-    SET(SOURCE_FILES ${SOURCE_FILES} ${src_files})
-    CUDA_INCLUDE_DIRECTORIES(BEFORE ${OPENMMCUDA_PLATFORM_DIR}/${subdir}/include)
-    CUDA_INCLUDE_DIRECTORIES(BEFORE ${OPENMMCUDA_PLATFORM_DIR}/${subdir}/src)
-ENDFOREACH(subdir)
-CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/jama/include)
-CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/openmmapi/include)
-
-# Name of the main OpenMM library to link against; Debug builds on Unix use
-# the "_d" variant.
-SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
-IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
-    SET(MAIN_OPENMM_LIB ${MAIN_OPENMM_LIB}_d)
-ENDIF ()
-
-# Create the shared library target. When a Mac build requests both i386 and
-# x86_64, two single-architecture libraries are built and then merged;
-# otherwise a single library is built.
-IF(APPLE AND CMAKE_OSX_ARCHITECTURES AND CMAKE_OSX_ARCHITECTURES MATCHES .*i386.* AND CMAKE_OSX_ARCHITECTURES MATCHES .*x86_64.*)
-    # NVCC doesn't know how to build universal binaries, so we need to build two separate versions.
-
-    # The statement order matters here: CMAKE_OSX_ARCHITECTURES and
-    # CUDA_NVCC_FLAGS are reassigned immediately before each CUDA_ADD_LIBRARY
-    # call, presumably so each target picks up the values in effect when it
-    # is created - TODO confirm.
-    SET(BASE_FLAGS ${CUDA_NVCC_FLAGS})
-    SET(CMAKE_OSX_ARCHITECTURES i386)
-    SET(CUDA_NVCC_FLAGS ${BASE_FLAGS} -m32)
-    CUDA_ADD_LIBRARY("${SHARED_TARGET}32" SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
-    TARGET_LINK_LIBRARIES(${SHARED_TARGET}32 ${MAIN_OPENMM_LIB} ${CUFFT_TARGET_LINK})
-    SET_TARGET_PROPERTIES(${SHARED_TARGET}32 PROPERTIES COMPILE_FLAGS "-DOPENMMCUDA_BUILDING_SHARED_LIBRARY")
-    SET(CMAKE_OSX_ARCHITECTURES x86_64)
-    SET(CUDA_NVCC_FLAGS ${BASE_FLAGS} -m64)
-    CUDA_ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
-    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUFFT_TARGET_LINK})
-    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMMCUDA_BUILDING_SHARED_LIBRARY")
-    # Build the 32-bit library first, since the post-build step below reads it.
-    ADD_DEPENDENCIES(${SHARED_TARGET} "${SHARED_TARGET}32")
-
-    # Join them into a single universal binary.
-
-    # lipo overwrites the 64-bit .dylib with the merged result. It runs in
-    # CMAKE_BINARY_DIR and names both libraries without a directory, so this
-    # presumably relies on both being written there - TODO confirm.
-    ADD_CUSTOM_COMMAND(
-        TARGET ${SHARED_TARGET}
-        POST_BUILD
-        COMMAND /usr/bin/lipo lib${SHARED_TARGET}.dylib lib${SHARED_TARGET}32.dylib -create -output lib${SHARED_TARGET}.dylib
-        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
-        COMMENT "Creating universal binary")
-ELSE(APPLE AND CMAKE_OSX_ARCHITECTURES AND CMAKE_OSX_ARCHITECTURES MATCHES .*i386.* AND CMAKE_OSX_ARCHITECTURES MATCHES .*x86_64.*)
-    # Single-architecture build: one shared library is enough.
-    CUDA_ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
-    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUFFT_TARGET_LINK})
-    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMMCUDA_BUILDING_SHARED_LIBRARY")
-ENDIF(APPLE AND CMAKE_OSX_ARCHITECTURES AND CMAKE_OSX_ARCHITECTURES MATCHES .*i386.* AND CMAKE_OSX_ARCHITECTURES MATCHES .*x86_64.*)
-
-# Install the library into lib/plugins (runtime files included).
-# NOTE(review): INSTALL_TARGETS is the legacy form of INSTALL(TARGETS ...).
-INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/platforms/cuda-old/src/CudaForceInfo.cpp
+++ b/platforms/cuda-old/src/CudaForceInfo.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "CudaForceInfo.h"
-
-using namespace OpenMM;
-using namespace std;
-
-// Default implementations: all particles and all groups are reported as
-// identical, and no particle groups are defined.
-
-bool CudaForceInfo::areParticlesIdentical(int /*particle1*/, int /*particle2*/) {
-    return true;
-}
-
-int CudaForceInfo::getNumParticleGroups() {
-    return 0;
-}
-
-void CudaForceInfo::getParticlesInGroup(int /*index*/, vector<int>& /*particles*/) {
-    // No groups are defined, so the output vector is left untouched.
-}
-
-bool CudaForceInfo::areGroupsIdentical(int /*group1*/, int /*group2*/) {
-    return true;
-}
--- a/platforms/cuda-old/src/CudaForceInfo.h
+++ b/platforms/cuda-old/src/CudaForceInfo.h
-#ifndef OPENMM_CUDAFORCEINFO_H_
-#define OPENMM_CUDAFORCEINFO_H_
-
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "openmm/internal/windowsExport.h"
-#include <vector>
-
-namespace OpenMM {
-
-/**
- * Describes, for the Cuda implementation of a Force class, how that force
- * behaves and what it requires. The implementations in CudaForceInfo.cpp
- * report every particle and group as identical and define no groups.
- */
-
-class CudaForceInfo {
-public:
-    CudaForceInfo() {}
-    virtual ~CudaForceInfo() {}
-
-    /** Whether the two particles have identical force field parameters. */
-    virtual OPENMM_EXPORT bool areParticlesIdentical(int particle1, int particle2);
-
-    /** The number of particle groups this force defines. */
-    virtual OPENMM_EXPORT int getNumParticleGroups();
-
-    /** Retrieve the particles belonging to the group with the given index. */
-    virtual OPENMM_EXPORT void getParticlesInGroup(int index, std::vector<int>& particles);
-
-    /** Whether the two particle groups are identical. */
-    virtual OPENMM_EXPORT bool areGroupsIdentical(int group1, int group2);
-};
-
-} // namespace OpenMM
-
-#endif /*OPENMM_CUDAFORCEINFO_H_*/
--- a/platforms/cuda-old/src/CudaKernelFactory.cpp
+++ b/platforms/cuda-old/src/CudaKernelFactory.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "CudaKernelFactory.h"
-#include "CudaKernels.h"
-#include "openmm/internal/ContextImpl.h"
-#include "openmm/OpenMMException.h"
-
-using namespace OpenMM;
-
-/**
- * Create the Cuda implementation of the kernel with the given name. The
- * context's platform data is handed to the kernels that take it, and the
- * context's System to those that take it as well. An unrecognized name
- * results in an OpenMMException.
- */
-OPENMMCUDA_EXPORT KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
-    CudaPlatform::PlatformData& gpuData = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
-
-    // General bookkeeping kernels.
-    if (name == CalcForcesAndEnergyKernel::Name())
-        return new CudaCalcForcesAndEnergyKernel(name, platform, gpuData);
-    if (name == UpdateStateDataKernel::Name())
-        return new CudaUpdateStateDataKernel(name, platform, gpuData);
-    if (name == ApplyConstraintsKernel::Name())
-        return new CudaApplyConstraintsKernel(name, platform, gpuData);
-    if (name == VirtualSitesKernel::Name())
-        return new CudaVirtualSitesKernel(name, platform);
-
-    // Force kernels.
-    if (name == CalcHarmonicBondForceKernel::Name())
-        return new CudaCalcHarmonicBondForceKernel(name, platform, gpuData, context.getSystem());
-    if (name == CalcCustomBondForceKernel::Name())
-        return new CudaCalcCustomBondForceKernel(name, platform, gpuData, context.getSystem());
-    if (name == CalcHarmonicAngleForceKernel::Name())
-        return new CudaCalcHarmonicAngleForceKernel(name, platform, gpuData, context.getSystem());
-    if (name == CalcCustomAngleForceKernel::Name())
-        return new CudaCalcCustomAngleForceKernel(name, platform, gpuData, context.getSystem());
-    if (name == CalcPeriodicTorsionForceKernel::Name())
-        return new CudaCalcPeriodicTorsionForceKernel(name, platform, gpuData, context.getSystem());
-    if (name == CalcRBTorsionForceKernel::Name())
-        return new CudaCalcRBTorsionForceKernel(name, platform, gpuData, context.getSystem());
-    if (name == CalcCMAPTorsionForceKernel::Name())
-        return new CudaCalcCMAPTorsionForceKernel(name, platform, gpuData, context.getSystem());
-    if (name == CalcCustomTorsionForceKernel::Name())
-        return new CudaCalcCustomTorsionForceKernel(name, platform, gpuData, context.getSystem());
-    if (name == CalcNonbondedForceKernel::Name())
-        return new CudaCalcNonbondedForceKernel(name, platform, gpuData, context.getSystem());
-    if (name == CalcCustomNonbondedForceKernel::Name())
-        return new CudaCalcCustomNonbondedForceKernel(name, platform, gpuData, context.getSystem());
-    if (name == CalcGBSAOBCForceKernel::Name())
-        return new CudaCalcGBSAOBCForceKernel(name, platform, gpuData);
-    if (name == CalcGBVIForceKernel::Name())
-        return new CudaCalcGBVIForceKernel(name, platform, gpuData);
-    if (name == CalcCustomExternalForceKernel::Name())
-        return new CudaCalcCustomExternalForceKernel(name, platform, gpuData, context.getSystem());
-
-    // Integrator kernels.
-    if (name == IntegrateVerletStepKernel::Name())
-        return new CudaIntegrateVerletStepKernel(name, platform, gpuData);
-    if (name == IntegrateLangevinStepKernel::Name())
-        return new CudaIntegrateLangevinStepKernel(name, platform, gpuData);
-    if (name == IntegrateBrownianStepKernel::Name())
-        return new CudaIntegrateBrownianStepKernel(name, platform, gpuData);
-    if (name == IntegrateVariableVerletStepKernel::Name())
-        return new CudaIntegrateVariableVerletStepKernel(name, platform, gpuData);
-    if (name == IntegrateVariableLangevinStepKernel::Name())
-        return new CudaIntegrateVariableLangevinStepKernel(name, platform, gpuData);
-
-    // Thermostat, barostat and remaining kernels.
-    if (name == ApplyAndersenThermostatKernel::Name())
-        return new CudaApplyAndersenThermostatKernel(name, platform, gpuData);
-    if (name == ApplyMonteCarloBarostatKernel::Name())
-        return new CudaApplyMonteCarloBarostatKernel(name, platform, gpuData);
-    if (name == CalcKineticEnergyKernel::Name())
-        return new CudaCalcKineticEnergyKernel(name, platform, gpuData);
-    if (name == RemoveCMMotionKernel::Name())
-        return new CudaRemoveCMMotionKernel(name, platform, gpuData);
-
-    // None of the supported kernel names matched.
-    const std::string message = std::string("Tried to create kernel with illegal kernel name '")+name+"'";
-    throw OpenMMException(message.c_str());
-}
--- a/platforms/cuda-old/src/CudaKernels.cpp
+++ b/platforms/cuda-old/src/CudaKernels.cpp
--- a/platforms/cuda-old/src/CudaKernels.h
+++ b/platforms/cuda-old/src/CudaKernels.h
--- a/platforms/cuda-old/src/CudaPlatform.cpp
+++ b/platforms/cuda-old/src/CudaPlatform.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "CudaPlatform.h"
-#include "CudaKernelFactory.h"
-#include "CudaKernels.h"
-#include "openmm/internal/ContextImpl.h"
-#include "kernels/gputypes.h"
-#include "openmm/Context.h"
-#include "openmm/OpenMMException.h"
-#include "openmm/System.h"
-#include <sstream>
-
-using namespace OpenMM;
-using std::map;
-using std::string;
-using std::stringstream;
-
-// Exported C entry point: registers a CudaPlatform, but only when
-// gpuIsAvailable() reports that a GPU can be used.
-extern "C" OPENMMCUDA_EXPORT void registerPlatforms() {
-    if (!gpuIsAvailable())
-        return;
-    Platform::registerPlatform(new CudaPlatform());
-}
-
-// Registers one shared CudaKernelFactory for every kernel this platform
-// provides, then declares the platform properties and their default values.
-CudaPlatform::CudaPlatform() {
-    // Kernel names, in registration order.
-    const string kernelNames[] = {
-        CalcForcesAndEnergyKernel::Name(),
-        UpdateStateDataKernel::Name(),
-        ApplyConstraintsKernel::Name(),
-        VirtualSitesKernel::Name(),
-        CalcHarmonicBondForceKernel::Name(),
-        CalcCustomBondForceKernel::Name(),
-        CalcHarmonicAngleForceKernel::Name(),
-        CalcCustomAngleForceKernel::Name(),
-        CalcPeriodicTorsionForceKernel::Name(),
-        CalcRBTorsionForceKernel::Name(),
-        CalcCMAPTorsionForceKernel::Name(),
-        CalcCustomTorsionForceKernel::Name(),
-        CalcNonbondedForceKernel::Name(),
-        CalcCustomNonbondedForceKernel::Name(),
-        CalcGBSAOBCForceKernel::Name(),
-        CalcGBVIForceKernel::Name(),
-        CalcCustomExternalForceKernel::Name(),
-        IntegrateVerletStepKernel::Name(),
-        IntegrateLangevinStepKernel::Name(),
-        IntegrateBrownianStepKernel::Name(),
-        IntegrateVariableVerletStepKernel::Name(),
-        IntegrateVariableLangevinStepKernel::Name(),
-        ApplyAndersenThermostatKernel::Name(),
-        ApplyMonteCarloBarostatKernel::Name(),
-        CalcKineticEnergyKernel::Name(),
-        RemoveCMMotionKernel::Name()
-    };
-    const int numKernels = sizeof(kernelNames)/sizeof(kernelNames[0]);
-
-    CudaKernelFactory* factory = new CudaKernelFactory();
-    for (int i = 0; i < numKernels; i++)
-        registerKernelFactory(kernelNames[i], factory);
-
-    // Platform-specific properties, followed by their defaults.
-    platformProperties.push_back(CudaDevice());
-    platformProperties.push_back(CudaUseBlockingSync());
-    setPropertyDefaultValue(CudaDevice(), "0");
-    setPropertyDefaultValue(CudaUseBlockingSync(), "true");
-}
-
-// Always returns false for this platform.
-bool CudaPlatform::supportsDoublePrecision() const {
-    return false;
-}
-
-// Returns the value of `property` for `context`. Values recorded in the
-// context's PlatformData take precedence; anything not stored there is
-// delegated to Platform::getPropertyValue().
-const string& CudaPlatform::getPropertyValue(const Context& context, const string& property) const {
-    const ContextImpl& contextImpl = getContextImpl(context);
-    const PlatformData* platformData = reinterpret_cast<const PlatformData*>(contextImpl.getPlatformData());
-    map<string, string>::const_iterator entry = platformData->propertyValues.find(property);
-    if (entry == platformData->propertyValues.end())
-        return Platform::getPropertyValue(context, property);
-    return entry->second;
-}
-
-// Empty implementation: this call does not modify any property value.
-void CudaPlatform::setPropertyValue(Context& context, const string& property, const string& value) const {
-}
-
-// Called when a Context is created on this platform.
-//
-// Rejects Systems that contain virtual sites or any Force outside force group
-// 0 (throws OpenMMException), then reads the device index and blocking-sync
-// setting from `properties` (falling back to the platform defaults),
-// initialises the GPU through gpuInit() and stores the result in a new
-// PlatformData attached to the context.
-void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
-    System& system = context.getSystem();
-    const int numParticles = system.getNumParticles();
-    for (int particle = 0; particle < numParticles; particle++) {
-        if (system.isVirtualSite(particle))
-            throw OpenMMException("CudaPlatform does not support virtual sites");
-    }
-    const int numForces = system.getNumForces();
-    for (int force = 0; force < numForces; force++) {
-        if (system.getForce(force).getForceGroup() != 0)
-            throw OpenMMException("CudaPlatform does not support force groups");
-    }
-
-    // Device index: an explicit property wins over the default; an empty
-    // string leaves the index at 0.
-    map<string, string>::const_iterator deviceEntry = properties.find(CudaDevice());
-    const string& deviceText = (deviceEntry == properties.end() ?
-            getPropertyDefaultValue(CudaDevice()) : deviceEntry->second);
-    unsigned int deviceIndex = 0;
-    if (!deviceText.empty())
-        stringstream(deviceText) >> deviceIndex;
-
-    // Blocking sync is enabled only when the value is exactly "true".
-    map<string, string>::const_iterator syncEntry = properties.find(CudaUseBlockingSync());
-    const string& syncText = (syncEntry == properties.end() ?
-            getPropertyDefaultValue(CudaUseBlockingSync()) : syncEntry->second);
-    const bool useBlockingSync = (syncText == "true");
-
-    _gpuContext* gpu = (_gpuContext*) gpuInit(numParticles, deviceIndex, useBlockingSync);
-    context.setPlatformData(new PlatformData(gpu));
-}
-
-// Called when a Context is destroyed: shuts down the GPU context held by the
-// context's PlatformData, then frees the PlatformData itself.
-void CudaPlatform::contextDestroyed(ContextImpl& context) const {
-    PlatformData* platformData = reinterpret_cast<PlatformData*>(context.getPlatformData());
-    gpuShutDown(platformData->gpu);
-    delete platformData;
-}
-
-// Wraps an initialised GPU context. All flags and counters start cleared, and
-// the device index and blocking-sync setting reported by `gpu` are recorded as
-// string property values.
-CudaPlatform::PlatformData::PlatformData(_gpuContext* gpu) :
-        gpu(gpu),
-        removeCM(false),
-        nonbondedMethod(0),
-        customNonbondedMethod(0),
-        hasBonds(false),
-        hasAngles(false),
-        hasPeriodicTorsions(false),
-        hasRB(false),
-        hasNonbonded(false),
-        hasCustomNonbonded(false),
-        stepCount(0),
-        computeForceCount(0),
-        time(0.0),
-        ewaldSelfEnergy(0.0),
-        dispersionCoefficient(0.0) {
-    stringstream deviceText;
-    deviceText << gpu->device;
-    propertyValues[CudaPlatform::CudaDevice()] = deviceText.str();
-    if (gpu->useBlockingSync)
-        propertyValues[CudaPlatform::CudaUseBlockingSync()] = "true";
-    else
-        propertyValues[CudaPlatform::CudaUseBlockingSync()] = "false";
-}
--- a/platforms/cuda-old/src/kernels/bbsort.cu
+++ b/platforms/cuda-old/src/kernels/bbsort.cu
-/*
- * Authored by: Chen, Shifu
- * 
- * Email: chen@gmtk.org
- *
- * Website: http://www.gmtk.org/gsort
- *
- * The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
- * 
- */
-
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "vector_types.h"
-#include "bbsort.h"
-#include "bbsort_kernel.cu"
-
-
-// Host-side sort key of an int2 element: its y component.
-int getValue(int2 v){
-	return v.y;
-}
-
-// Host-side sort key of any other element type: the value itself.
-template <typename T>
-T getValue(T v){
-	return v;
-}
-
-// Evaluates a CUDA runtime call. If the result is not cudaSuccess, prints the
-// file, line and cudaGetErrorString() text to stderr and exits with EXIT_FAILURE.
-#  define CUDA_SAFE_CALL_NO_SYNC( call) {                                    \
-    cudaError err = call;                                                    \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
-                __FILE__, __LINE__, cudaGetErrorString( err) );              \
-        exit(EXIT_FAILURE);                                                  \
-    } }
-
-// Alias for CUDA_SAFE_CALL_NO_SYNC. Note the expansion already ends in ';'.
-#  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call);
-
-// Groups consecutive slices into buckets on the host.
-//
-// sliceCount[i] is the element count of slice i (sliceSize entries). Slices are
-// accumulated into the current bucket until the running total exceeds
-// BLOCK_SIZE; the slice that caused the excess then opens the next bucket.
-// Outputs, all written in place:
-//   bucketOffset[b]        - sum of the sizes of the buckets before b
-//   bucketOfSlice[i]       - bucket index assigned to slice i
-//   bucketSizes[b]         - element total of bucket b
-//   sliceOffsetInBucket[i] - element offset of slice i inside its bucket
-//   bucketsCount           - read as the index of the first bucket to fill and
-//                            advanced; on return it is one past the last bucket
-// Returns true when any single slice holds more than BLOCK_SIZE elements.
-// The `step` parameter is not used in the body.
-// NOTE(review): no bounds checks are made on the output arrays; the caller is
-// presumably responsible for sizing them - confirm.
-bool assignSliceToBuckets(unsigned int* sliceCount,int sliceSize,unsigned int* bucketOffset,unsigned int* bucketOfSlice,unsigned int* bucketSizes,unsigned int* sliceOffsetInBucket,int& bucketsCount,float step)
-{
-	int i=0;
-
-	bool overflow=false;
-
-	// Running element total of the bucket currently being filled
-	int tmpSum=0;
-
-	bucketOffset[0]=0;
-
-	for(i=0;i<sliceSize; i++){
-		if(sliceCount[i] >BLOCK_SIZE)
-		{
-			overflow=true;
-		}
-
-		// Tentatively place slice i at the end of the current bucket
-		tmpSum += sliceCount[i];
-		bucketOfSlice[i]=bucketsCount;
-		bucketSizes[bucketsCount] = tmpSum;
-		sliceOffsetInBucket[i]=tmpSum -sliceCount[i];
-		if(tmpSum > BLOCK_SIZE )
-		{	
-			if(i != 0)
-			{
-				// Undo the tentative placement: slice i becomes the first
-				// slice of a new bucket that starts where the old one ends
-				bucketOfSlice[i]=bucketsCount+1;
-				bucketSizes[bucketsCount] -= sliceCount[i];
-				sliceOffsetInBucket[i]=0;
-				bucketOffset[bucketsCount+1]=bucketOffset[bucketsCount] + tmpSum -  sliceCount[i];
-
-				bucketsCount++;
-				tmpSum=sliceCount[i];
-				bucketSizes[bucketsCount] = tmpSum;
-			}
-			else 
-			{
-				// The very first slice is already too large: it keeps its
-				// bucket to itself and the next slice starts a new one
-				bucketOffset[bucketsCount+1]=bucketOffset[bucketsCount] + tmpSum ;
-				sliceOffsetInBucket[i]=0;
-				tmpSum=0;
-				bucketsCount++;
-			}
-		}
-
-	}
-	// Convert "index of last bucket" into "number of buckets"
-	bucketsCount++;
-
-	return overflow;
-
-}
-
-// Reduces dData (size elements, device memory) in place to its maximum
-// (isMax == true) or minimum (isMax == false) by sort key, and stores the key
-// of the winning element in `result`.
-//
-// The contents of dData are overwritten by the reduction. The key is cast to
-// int before being stored in the float `result`, so any fractional part of a
-// floating-point key is dropped.
-template <typename T>
-void reduceMinMax(T* dData,int size,float& result,bool isMax)
-{
-
-	// step = ceil(size / 2): each pass folds the upper half onto the lower half
-	int step;
-	step=(size%2==0)?
-		(size/2):(size/2 +1);
-	int blockSize=BLOCK_SIZE;
-	int blockCount;
-	int length=size;
-	T originalResult;
-	while(step > 0)
-	{
-		// Enough blocks of BLOCK_SIZE threads to cover `step` elements
-		if(step%BLOCK_SIZE==0)
-			blockCount=step/BLOCK_SIZE;
-		else 
-			blockCount=step/BLOCK_SIZE+1;
-
-		if(isMax)
-			reduceMaxD<<<blockCount,blockSize>>>(dData,step,length);
-		else 
-			reduceMinD<<<blockCount,blockSize>>>(dData,step,length);
-
-		length=step;
-
-		// Halve again, rounding up; a step of 1 becomes 0 and ends the loop
-		step=(step%2==0 || step==1)?(step/2):(step/2 +1);
-	}
-
-	// Element 0 now holds the winner
-	CUDA_SAFE_CALL(cudaMemcpy(&originalResult, dData, sizeof(T), cudaMemcpyDeviceToHost));
-
-	result=(int)getValue(originalResult);
-}
-
-// Classifies the input as NEARLY_SORTED or DISORDERLY.
-//
-// Computes the absolute key difference between neighbouring elements (per
-// thread block, via calDifferenceD), sums those differences on the device, and
-// sets listOrder to NEARLY_SORTED when the sum is below
-// (maxValue - minValue) * size / 10, otherwise to DISORDERLY.
-//
-// dData is only read. A temporary float buffer of `size` entries is allocated
-// on the device and released before returning.
-template <typename T>
-void evaluateDisorder(T* dData,int size,float maxValue, float minValue, int& listOrder)
-{
-	// ceil(size / BLOCK_SIZE), so that every element gets a thread. Testing
-	// (size-1) % BLOCK_SIZE instead drops the last element whenever size-1 is
-	// a multiple of BLOCK_SIZE, leaving its difference slot uninitialised.
-	int blockCount=(size+BLOCK_SIZE-1)/BLOCK_SIZE;
-
-	float* dDiffData;
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dDiffData, sizeof(float) * size));
-
-	calDifferenceD<<<blockCount,BLOCK_SIZE,(BLOCK_SIZE)*sizeof(T)>>>(dData,dDiffData,size);
-
-	float sum=0;
-
-	// Sum the differences by repeatedly folding the upper half onto the lower
-	// half; step = ceil(length / 2)
-	int step;
-	step=(size%2==0)?
-		(size/2):(size/2 +1);
-
-	int blockSize=BLOCK_SIZE;
-
-	int length=size;
-
-	while(step > 0)
-	{
-		blockCount=(step+BLOCK_SIZE-1)/BLOCK_SIZE;
-
-		reduceSumD<<<blockCount,blockSize>>>(dDiffData,step,length);
-
-		length=step;
-
-		// Halve again, rounding up; a step of 1 becomes 0 and ends the loop
-		step=(step%2==0 || step==1)?(step/2):(step/2 +1);
-	}
-
-	// Element 0 now holds the total
-	CUDA_SAFE_CALL(cudaMemcpy(&sum, dDiffData, sizeof(float), cudaMemcpyDeviceToHost));
-
-	if( sum < (maxValue - minValue) * size / 10)
-		listOrder=NEARLY_SORTED;
-	else
-		listOrder=DISORDERLY;
-
-	CUDA_SAFE_CALL(cudaFree(dDiffData));
-}
-
-// Sorts dData (size elements, device memory) in place by sort key.
-//
-// Steps:
-//   1. find the minimum and maximum key (on a scratch copy of the data);
-//   2. if listOrder is AUTO_EVALUATE, classify the input with evaluateDisorder;
-//   3. histogram the elements into fixed-width key slices on the device;
-//   4. pack consecutive slices into buckets on the host (assignSliceToBuckets);
-//   5. scatter the elements into bucket order and bitonic-sort each bucket;
-//   6. recursively sort any bucket that is larger than BLOCK_SIZE.
-//
-// Returns immediately (after freeing the scratch copy) when all keys are equal.
-template <typename T>
-void bbSortBody(T* dData,int size,int listOrder/*,float sliceStep,int sliceSize, T* dTmpData, float minValue,float maxValue*/)
-{
-	float minValue,maxValue;
-	T*  dTmpData;
-
-	// reduceMinMax destroys its input, so each reduction runs on a fresh copy
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dTmpData, sizeof(T) * size));
-	CUDA_SAFE_CALL(cudaMemcpy(dTmpData, dData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
-	reduceMinMax(dTmpData,size,maxValue,true);
-	CUDA_SAFE_CALL(cudaMemcpy(dTmpData, dData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
-	reduceMinMax(dTmpData,size,minValue,false);
-
-	if(minValue == maxValue)
-	{
-		CUDA_SAFE_CALL(cudaFree(dTmpData));
-		return ;
-	}
-
-	if(listOrder == AUTO_EVALUATE )
-	{
-		evaluateDisorder(dData,size,maxValue,minValue,listOrder);
-	}
-
-	// Slice width is 50 times the average key spacing; 10 spare slices are added
-	float sliceStep = (float) (50.0*((double)(maxValue-minValue)/(double)size));
-	int sliceSize = (int) ((maxValue-minValue)/sliceStep + 10);
-
-	int blockCount;
-
-	if(size%BLOCK_SIZE==0)blockCount=size/BLOCK_SIZE;
-	else blockCount=size/BLOCK_SIZE+1;
-
-	unsigned int* dSliceCounts;
-	unsigned int* dOffsetInSlice;
-
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dOffsetInSlice, sizeof(unsigned int) * size));
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dSliceCounts, sizeof(unsigned int) * sliceSize));
-	CUDA_SAFE_CALL(cudaMemset(dSliceCounts,0, sizeof(unsigned int) * sliceSize));
-
-	if(listOrder == NEARLY_SORTED)
-	{
-		assignElementToSlicesNearlySortedD<<<blockCount, BLOCK_SIZE>>>(dData,size,dSliceCounts,dOffsetInSlice,minValue,sliceStep,sliceSize,blockCount);
-	}
-	else
-		assignElementToSlicesD<<<blockCount, BLOCK_SIZE>>>(dData,size,dSliceCounts,dOffsetInSlice,minValue,sliceStep,sliceSize);
-	unsigned int* hSliceCounts=new unsigned int[sliceSize];
-	CUDA_SAFE_CALL(cudaMemcpy(hSliceCounts, dSliceCounts, sizeof(unsigned int) * sliceSize, cudaMemcpyDeviceToHost));
-
-	// size/100 alone is 0 for size < 100, yet assignSliceToBuckets always
-	// writes entry 0 of both bucket arrays, so keep at least one slot.
-	int looseBucketSize=size/100 + 1;
-
-	unsigned int* hBucketOffsets=new unsigned int[looseBucketSize];
-	unsigned int* hBucketSizes=new unsigned int[looseBucketSize];
-	unsigned int* hBucketOfSlices=new unsigned int[sliceSize];
-	unsigned int* hSliceOffsetInBucket=new unsigned int[sliceSize];
-	int bucketsCount=0;
-
-	memset(hBucketSizes,0,sizeof(unsigned int) * looseBucketSize);
-	memset(hSliceOffsetInBucket,0,sizeof(unsigned int) * sliceSize);
-
-	bool overflow;
-
-	overflow = assignSliceToBuckets(hSliceCounts,sliceSize,hBucketOffsets,hBucketOfSlices,hBucketSizes,hSliceOffsetInBucket,bucketsCount,sliceStep);
-
-	unsigned int* dBucketOffsets;
-	unsigned int* dBucketSizes;
-
-	unsigned int* dBucketOfSlices;
-	unsigned int* dSliceOffsetInBucket;
-
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketOfSlices, sizeof(unsigned int) * sliceSize));
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dSliceOffsetInBucket, sizeof(unsigned int) * sliceSize));
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketOffsets, sizeof(unsigned int) * bucketsCount));
-	CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketSizes, sizeof(unsigned int) * bucketsCount));
-
-
-	CUDA_SAFE_CALL(cudaMemcpy(dBucketOfSlices, hBucketOfSlices, sizeof(unsigned int) * sliceSize, cudaMemcpyHostToDevice));
-	CUDA_SAFE_CALL(cudaMemcpy(dSliceOffsetInBucket, hSliceOffsetInBucket, sizeof(unsigned int) * sliceSize, cudaMemcpyHostToDevice));
-	CUDA_SAFE_CALL(cudaMemcpy(dBucketOffsets, hBucketOffsets, sizeof(unsigned int) * bucketsCount, cudaMemcpyHostToDevice));
-	CUDA_SAFE_CALL(cudaMemcpy(dBucketSizes, hBucketSizes, sizeof(unsigned int) * bucketsCount, cudaMemcpyHostToDevice));
-
-	// The bucket tables are read by the kernels through texture references
-	cudaBindTexture(0,tBucketOffsets,dBucketOffsets);
-	cudaBindTexture(0,tBucketSizes,dBucketSizes);
-	cudaBindTexture(0,tBucketOfSlices,dBucketOfSlices);
-	cudaBindTexture(0,tSliceOffsetInBucket,dSliceOffsetInBucket);
-
-	// Scatter into bucket order (dTmpData), then sort one bucket per block
-	assignElementToBucketD<<<blockCount, BLOCK_SIZE>>>(dData,dTmpData,size,dOffsetInSlice,minValue,sliceStep);
-
-	CUDA_SAFE_CALL( cudaThreadSynchronize() );
-
-	bitonicSortD<<<bucketsCount, BLOCK_SIZE, sizeof(T) * BLOCK_SIZE>>>(dTmpData);
-
-	CUDA_SAFE_CALL(cudaMemcpy(dData, dTmpData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
-
-	// Buckets larger than BLOCK_SIZE could not be sorted by a single block;
-	// sort each of them recursively
-	if(overflow){
-		for(int i=0;i<bucketsCount;i++)
-		{
-			if(hBucketSizes[i] > BLOCK_SIZE)
-			{
-				bbSort(dData + hBucketOffsets[i],hBucketSizes[i],listOrder);
-			}
-		}
-	}
-
-	// These were allocated with new[], so they must be released with delete[]
-	delete[] hBucketOffsets;
-	delete[] hBucketOfSlices;
-	delete[] hSliceCounts;
-	delete[] hBucketSizes;
-	delete[] hSliceOffsetInBucket;
-
-	CUDA_SAFE_CALL(cudaFree(dOffsetInSlice));
-	CUDA_SAFE_CALL(cudaFree(dSliceCounts));
-	CUDA_SAFE_CALL(cudaFree(dTmpData));
-
-	cudaUnbindTexture( tBucketSizes );
-	CUDA_SAFE_CALL(cudaFree(dBucketSizes));
-
-	cudaUnbindTexture( tBucketOffsets );
-	CUDA_SAFE_CALL(cudaFree(dBucketOffsets));
-
-	cudaUnbindTexture( tBucketOfSlices );
-	CUDA_SAFE_CALL(cudaFree(dBucketOfSlices));
-
-	cudaUnbindTexture( tSliceOffsetInBucket );
-	CUDA_SAFE_CALL(cudaFree(dSliceOffsetInBucket));
-}
-
-/************************************************************************************
-
-Uncomment your desired function definition here
-
-Please note that only one type of bbsort() can be used in a program, because the NVCC compiler doesn't support overriding kernel functions.
-
-float, double, int, uint, short, and ushort are originally supported. If you want to use bbsort() with double,
-
-please follow the readme.txt.
-
-Also note that you need compute capability 1.3 (use arch=sm_13 in your compile command) to sort doubles.
-
-*************************************************************************************/
-
-// Exported specialisation of bbSort for int2 elements; forwards to bbSortBody.
-template<>
-void OPENMMCUDA_EXPORT bbSort(int2* dData,int size,int listOrder)
-{
-
-	bbSortBody(dData,size,listOrder);
-}
-
-//void bbSort(float* dData,int size,int listOrder)
-//{
-//
-//	bbSortBody(dData,size,listOrder);
-//}
-
-//void bbSort(int* dData,int size,int listOrder)
-//{
-//
-//	bbSortBody(dData,size,listOrder);
-//}
-//
-//void bbSort(unsigned int* dData,int size,int listOrder)
-//{
-//
-//	bbSortBody(dData,size,listOrder);
-//}
-//
-//void bbSort(double* dData,int size,int listOrder)
-//{
-//
-//	bbSortBody(dData,size,listOrder);
-//}
--- a/platforms/cuda-old/src/kernels/bbsort.h
+++ b/platforms/cuda-old/src/kernels/bbsort.h
-/*
- * Authored by: Chen, Shifu
- * 
- * Email: chen@gmtk.org
- *
- * Website: http://www.gmtk.org/gsort
- *
- * The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
- * 
- */
-#ifndef _BBSORT_H_
-#define _BBSORT_H_
-#include "windowsExportCuda.h"
-
-#define BLOCK_SIZE 512
-
-#define DISORDERLY 0
-#define NEARLY_SORTED 1
-#define AUTO_EVALUATE 2
-
-// Sorts `number` elements starting at dData. listOrder is one of DISORDERLY,
-// NEARLY_SORTED or AUTO_EVALUATE (the default).
-template <typename T>
-void OPENMMCUDA_EXPORT bbSort(T* dData,int number,int listOrder=AUTO_EVALUATE);
-
-#endif // _BBSORT_H_
--- a/platforms/cuda-old/src/kernels/bbsort_kernel.cu
+++ b/platforms/cuda-old/src/kernels/bbsort_kernel.cu
-/*
- * Authored by: Chen, Shifu
- * 
- * Email: chen@gmtk.org
- *
- * Website: http://www.gmtk.org/gsort
- *
- * The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
- * 
- */
-#ifndef _BBSORT_KERNEL_H_
-#define _BBSORT_KERNEL_H_
-
-#include "bbsort.h"
-#include "math_constants.h"
-
-texture<unsigned int, 1, cudaReadModeElementType> tBucketSizes;
-texture<unsigned int, 1, cudaReadModeElementType> tBucketOffsets;
-texture<unsigned int, 1, cudaReadModeElementType> tBucketOfSlices;
-texture<unsigned int, 1, cudaReadModeElementType> tSliceOffsetInBucket;
-
-// Device-side sort key of an int2 element: its y component.
-static __device__ int dGetValue(int2 v){
-	return v.y;
-}
-
-// Device-side sort key of any other element type: the value itself.
-template <typename T>
-static __device__ T dGetValue(T v){
-	return v;
-}
-
-
-// Overwrites v with the padding element used for unused sort slots
-// (x = 0x3fffffff, y = 0x4fffffff).
-// NOTE(review): presumably real keys are expected to be smaller than the
-// padding key so that padding ends up after them - confirm.
-static __device__ void dPad(int2& v){
-	v.x=0x3fffffff;
-	v.y=0x4fffffff;
-}
-
-// Overwrites v with the padding value 0x7fffffff used for unused sort slots.
-template <typename T>
-static __device__ void dPad(T & v){
-	v=0x7fffffff;
-}
-
-// One pass of an in-place max reduction: each thread whose partner element
-// (its own index + step) lies below `length` keeps whichever of the two
-// elements has the larger key.
-template <typename T>
-__global__ static void reduceMaxD(T * dData,int step,int length)
-{
-	const int self = blockIdx.x * blockDim.x + threadIdx.x;
-	const int partner = self + step;
-
-	if(partner < length)
-	{
-		// Keep our own element only when its key is strictly larger
-		if(!(dGetValue(dData[self]) > dGetValue(dData[partner])))
-			dData[self] = dData[partner];
-	}
-}
-
-// One pass of an in-place min reduction: each thread whose partner element
-// (its own index + step) lies below `length` keeps whichever of the two
-// elements has the smaller key.
-template <typename T>
-__global__ static void reduceMinD(T * dData,int step,int length)
-{
-	const int self = blockIdx.x * blockDim.x + threadIdx.x;
-	const int partner = self + step;
-
-	if(partner < length)
-	{
-		// Keep our own element only when its key is strictly smaller
-		if(!(dGetValue(dData[self]) < dGetValue(dData[partner])))
-			dData[self] = dData[partner];
-	}
-}
-
-// One pass of an in-place sum reduction: each thread whose partner element
-// (its own index + step) lies below `length` adds the partner into its own slot.
-__global__ static void reduceSumD(float * dDiffData,int step,int length)
-{
-	const int self = blockIdx.x * blockDim.x + threadIdx.x;
-	const int partner = self + step;
-
-	if(partner < length)
-		dDiffData[self] += dDiffData[partner];
-}
-
-// For every element, writes the absolute key difference to its right-hand
-// neighbour within the same thread block into dDiffData. The last thread of a
-// block and the last element of the array have no neighbour to compare with
-// and write 0.
-//
-// Expects blockDim.x * sizeof(T) bytes of dynamic shared memory.
-template <typename T>
-__global__ static void calDifferenceD(T * dData,float * dDiffData,int size)
-{
-	int index = blockIdx.x * blockDim.x + threadIdx.x;
-
-	const unsigned int tid = threadIdx.x;
-	const bool inRange = index < size;
-
-	extern __shared__ T sData[];
-
-	if(inRange)
-		sData[tid]=dData[index];
-
-	// Every thread of the block must reach the barrier, so out-of-range
-	// threads may only leave after it.
-	__syncthreads();
-
-	if(!inRange)
-		return ;
-
-	// index + 1 < size: in a partially filled last block the slot after the
-	// last valid element was never written and must not be read.
-	if(tid < blockDim.x -1 && index + 1 < size)
-		dDiffData[index] = abs(dGetValue(sData[tid+1]) - dGetValue(sData[tid]));
-	else
-		dDiffData[index] =0;
-
-}
-
-// Exchanges the values of a and b through a temporary copy.
-template <typename T>
-__device__ inline void dSwap(T & a, T & b)
-{
-    T tmp = a;
-    a = b;
-    b = tmp;
-}
-
-
-// Sorts one bucket per thread block with a bitonic network in shared memory.
-//
-// Block `bid` reads its bucket's size and start offset from the tBucketSizes
-// and tBucketOffsets textures, loads the bucket from `datas` into dynamic
-// shared memory, fills the remaining slots with padding (dPad), sorts, and
-// writes the first `count` sorted elements back in place.
-// NOTE(review): the network loops up to BLOCK_SIZE, so this presumably
-// requires blockDim.x == BLOCK_SIZE and BLOCK_SIZE elements of dynamic shared
-// memory - confirm against the launch site.
-template <typename T>
-__global__ static void bitonicSortD(T * datas)
-{
-    extern __shared__ T shared[];
-
-	const unsigned int bid=blockIdx.x;
-
-    const unsigned int tid = threadIdx.x;
-
-	__shared__ unsigned int count;
-	__shared__ unsigned int offset;
-
-	// Thread 0 fetches the bucket description for the whole block
-	if(tid == 0)
-	{
-		count=tex1Dfetch(tBucketSizes,bid);
-		offset=tex1Dfetch(tBucketOffsets,bid);
-	}
-
-	__syncthreads();
-
-    if(tid < count)
-		shared[tid] = datas[tid+offset];
-	else 
-	{
-		dPad(shared[tid]);
-	}
-
-    __syncthreads();
-
-    // k is the size of the bitonic sequences being merged, j the compare distance
-    for (unsigned int k = 2; k <= BLOCK_SIZE; k *= 2)
-    {
-        for (unsigned int j = k / 2; j>0; j /= 2)
-        {
-            unsigned int ixj = tid ^ j;
-            
-
-            // Only the lower-indexed thread of each pair does the compare/swap
-            if (ixj > tid)
-            {
-                if ((tid & k) == 0)
-                {
-                    // Ascending half
-                    if (dGetValue(shared[tid]) > dGetValue(shared[ixj]))
-                    {
-                        dSwap(shared[tid], shared[ixj]);
-                    }
-                }
-                else
-                {
-                    // Descending half
-                    if (dGetValue(shared[tid]) < dGetValue(shared[ixj]))
-                    {
-                        dSwap(shared[tid], shared[ixj]);
-                    }
-                }
-            }
-            
-            __syncthreads();
-        }
-    }
-    if(tid < count)
-		datas[tid+offset] = shared[tid];
-}
-
-// Histogram pass: each thread maps one element to the slice
-// (key - minValue) / step, atomically bumps that slice's counter, and records
-// the counter value it obtained as the element's offset within the slice.
-// The `sliceSize` parameter is not used in the body.
-template <typename T>
-
-__global__ void assignElementToSlicesD(T* dDatas,int number,unsigned int* dSliceCounts,unsigned int* dOffsetInSlice,float minValue,float step,int sliceSize)
-{
-	unsigned int index= __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-
-	if(index > number-1)
-		return ;
-
-	unsigned int s=((dGetValue(dDatas[index]) - minValue)/ step);
-
-	// NOTE(review): the wrap limit is 0xFFFFFFF (seven F digits), not
-	// 0xFFFFFFFF - confirm this is intended.
-	unsigned int offset=atomicInc(dSliceCounts + s,0xFFFFFFF);
-
-	dOffsetInSlice[index] = offset;
-
-}
-
-// Same histogram pass as assignElementToSlicesD, but with a different
-// thread-to-element mapping: thread t of block b handles element
-// b + blockCount * t, so the threads of one block touch elements that are
-// blockCount apart. The `sliceSize` parameter is not used in the body.
-template <typename T>
-__global__ void assignElementToSlicesNearlySortedD(T* dDatas,int number,unsigned int* dSliceCounts,unsigned int* dOffsetInSlice,float minValue,float step,int sliceSize,int blockCount)
-{
-	unsigned int index= blockIdx.x + blockCount * threadIdx.x;
-
-	if(index > number-1)
-		return ;
-
-	unsigned int s=((dGetValue(dDatas[index]) - minValue)/ step);
-
-	// NOTE(review): the wrap limit is 0xFFFFFFF (seven F digits), not
-	// 0xFFFFFFFF - confirm this is intended.
-	unsigned int offset=atomicInc(dSliceCounts + s,0xFFFFFFF);
-
-	dOffsetInSlice[index] = offset;
-
-}
-
-// Scatter pass: copies each element of dDatas to its bucket-ordered position
-// in dNewDatas. The destination is the start of the element's bucket, plus the
-// offset of its slice inside that bucket, plus the element's offset inside the
-// slice; the first two come from the tBucketOffsets, tBucketOfSlices and
-// tSliceOffsetInBucket textures.
-template <typename T>
-__global__ void assignElementToBucketD(T* dDatas,T*  dNewDatas,int number,unsigned int* dOffsetInSlice,float minValue,float step)
-{
-
-	unsigned int index= __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-
-	if(index > number-1)
-		return ;
-
-	// Slice index, computed the same way as in the histogram pass
-	unsigned int s=((dGetValue(dDatas[index]) - minValue)/ step);
-
-	unsigned int b=tex1Dfetch(tBucketOfSlices,s);
-
-	unsigned int offset =tex1Dfetch(tBucketOffsets,b) + tex1Dfetch(tSliceOffsetInBucket,s) + dOffsetInSlice[index];
-
-	dNewDatas[offset] =dDatas[index];
-
-}
-
-#endif // _BBSORT_KERNEL_H_
--- a/platforms/cuda-old/src/kernels/cudaCompact.cu
+++ b/platforms/cuda-old/src/kernels/cudaCompact.cu
-
-/* Code for CUDA stream compaction. Roughly based on:
-    Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
-        High Performance Graphics 2009.
-
-    Notes:
-        - paper recommends 128 threads/block, so this is hard coded.
-        - I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
-          complicated and performs poorly on current hardware
-        - I only implement the scattered- and staged-write variants of phase III as they have reasonable
-          performance across most of the tested workloads in the paper. The selective variant is not
-          implemented.
-        - The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
-          manner. It is, however, done in a very easy to program manner, and integrated into the top of
-          phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
-          it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
-          a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
-          dgBlockCounts in stages.
-
-  Date:         23 Aug 2009
-  Author:       Imran Haque (ihaque@cs.stanford.edu)
-  Affiliation:  Stanford University
-  License:      Public Domain
-*/
-
-#include "cudaCompact.h"
-
-typedef unsigned int T;
-
-// Phase 1: Count valid elements per thread block
-// Hard-code 128 thd/blk
-// Block-wide sum of the 128 values in `arr`; every thread receives the total.
-//
-// Must be called by all 128 threads of the block. `arr` is used as scratch
-// space and its contents are destroyed. On return the caller may overwrite
-// `arr` immediately.
-__device__ unsigned int sumReduce128(volatile unsigned int* arr) {
-    // Parallel reduce element counts
-    // Assumes 128 thd/block
-    if (threadIdx.x < 64) arr[threadIdx.x] += arr[threadIdx.x+64];
-    __syncthreads();
-    if (threadIdx.x < 32) {
-        arr[threadIdx.x] += arr[threadIdx.x+32];
-        if (threadIdx.x < 16) arr[threadIdx.x] += arr[threadIdx.x+16];
-        if (threadIdx.x < 8) arr[threadIdx.x] += arr[threadIdx.x+8];
-        if (threadIdx.x < 4) arr[threadIdx.x] += arr[threadIdx.x+4];
-        if (threadIdx.x < 2) arr[threadIdx.x] += arr[threadIdx.x+2];
-        if (threadIdx.x < 1) arr[threadIdx.x] += arr[threadIdx.x+1];
-    }
-    __syncthreads();
-    const unsigned int total = arr[0];
-    // Callers invoke this in a loop and refill arr right after it returns.
-    // Without this barrier a fast thread could overwrite arr[0] before a
-    // slower thread has read the total.
-    __syncthreads();
-    return total;
-}
-
-// Phase 1 kernel: block b counts the non-zero entries of dgValid in the range
-// [b*eltsPerBlock, min(len, (b+1)*eltsPerBlock)) and stores the count in
-// dgBlockCounts[b]. Uses a 128-entry shared array, one slot per thread.
-__global__ void countElts(unsigned int* dgBlockCounts,const unsigned int* dgValid,const size_t eltsPerBlock,const size_t len) {
-    __shared__ volatile unsigned int dsCount[128];
-    dsCount[threadIdx.x] = 0;
-    // Upper bound of this block's range, clamped to the array length
-    size_t ub;
-    ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
-    for (int base = blockIdx.x * eltsPerBlock; base < (blockIdx.x+1)*eltsPerBlock; base += blockDim.x) {
-        if ((base + threadIdx.x) < ub && dgValid[base+threadIdx.x])
-            dsCount[threadIdx.x]++;
-    }
-    __syncthreads();
-    // Combine the per-thread counts; thread 0 publishes the block total
-    unsigned int blockCount = sumReduce128(dsCount);
-    if (threadIdx.x == 0) dgBlockCounts[blockIdx.x] = blockCount;
-    return;
-}
-
-// Phase 2/3: Move valid elements using SIMD compaction (phase 2 is done implicitly at top of __global__ method)
-// Exclusive prefix scan over 128 elements
-// Assumes 128 threads
-// Taken from cuda SDK "scan" sample for naive scan, with small modifications
-// Exclusive prefix sum over the 128 values of `in`.
-//
-// Must be called by all 128 threads of the block. `outAndTemp` needs room for
-// 256 entries: it is used as a double buffer, and the scan result ends up in
-// its first 128 entries. Returns the sum of all 128 inputs to every thread.
-// On return the caller may overwrite `in` immediately.
-__device__ int exclusivePrescan128(const unsigned int* in,unsigned int* outAndTemp) {
-    const int n=128;
-    //TODO: this temp storage could be reduced since we write to shared memory in out anyway, and n is hardcoded
-    //__shared__ int temp[2*n];
-    unsigned int* temp = outAndTemp;
-    int pout = 1, pin = 0;
-
-    // load input into temp
-    // This is exclusive scan, so shift right by one and set first elt to 0
-    temp[pout*n + threadIdx.x] = (threadIdx.x > 0) ? in[threadIdx.x-1] : 0;
-    __syncthreads();
-
-    for (int offset = 1; offset < n; offset *= 2)
-    {
-        pout = 1 - pout; // swap double buffer indices
-        pin  = 1 - pout;
-        __syncthreads();
-        temp[pout*n+threadIdx.x] = temp[pin*n+threadIdx.x];
-        if (threadIdx.x >= offset)
-            temp[pout*n+threadIdx.x] += temp[pin*n+threadIdx.x - offset];
-    }
-
-    //out[threadIdx.x] = temp[pout*n+threadIdx.x]; // write output
-    __syncthreads();
-    // Sum of all elements = last exclusive prefix + last input
-    const int total = outAndTemp[127]+in[127];
-    // A caller that loops without its own barrier may refill `in` right after
-    // this returns; make sure every thread has read in[127] first.
-    __syncthreads();
-    return total;
-}
-// Compacts one batch of 128 elements: every thread whose dsValid entry is
-// non-zero writes its dsData element to dsCompact at the position given by the
-// exclusive prefix sum of dsValid. Returns the number of valid elements in the
-// batch.
-__device__ int compactSIMDPrefixSum(const T* dsData,const unsigned int* dsValid,T* dsCompact) {
-    // 256 entries: exclusivePrescan128 uses this as a double buffer
-    __shared__ unsigned int dsLocalIndex[256];
-    int numValid = exclusivePrescan128(dsValid,dsLocalIndex);
-    if (dsValid[threadIdx.x]) dsCompact[dsLocalIndex[threadIdx.x]] = dsData[threadIdx.x];
-    return numValid;
-}
-
-// Phase 2/3 kernel, staged variant. Each block first sums the counts of all
-// preceding blocks to find its output offset, then walks its input range in
-// batches of blockDim.x elements: every batch is compacted into the shared
-// compactBlock array and copied from there to dgCompact. The last block's
-// thread 0 writes the total number of valid elements to *dNumValidElements.
-__global__ void moveValidElementsStaged(const T* dgData,T* dgCompact,const unsigned int* dgValid,const unsigned int* dgBlockCounts,size_t eltsPerBlock,size_t len,size_t* dNumValidElements) {
-    __shared__ T inBlock[128];
-    __shared__ unsigned int validBlock[128];
-    __shared__ T compactBlock[128];
-    int blockOutOffset=0;
-    // Sum up the blockCounts before us to find our offset
-    // This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
-    // Paper implements this as a prefix sum kernel in phase II
-    // May still be faster than an extra kernel invocation?
-    for (int base = 0; base < blockIdx.x; base += blockDim.x) {
-        // Load up the count of valid elements for each block before us in batches of 128
-        if ((base + threadIdx.x) < blockIdx.x) {
-            validBlock[threadIdx.x] = dgBlockCounts[base+threadIdx.x];
-        } else {
-            validBlock[threadIdx.x] = 0;
-        }
-        __syncthreads();
-        // Parallel reduce these counts
-        // Accumulate in the final offset variable
-        blockOutOffset += sumReduce128(validBlock);
-    }
-
-    // Upper bound of this block's range, clamped to the array length
-    size_t ub;
-    ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
-    for (int base = blockIdx.x * eltsPerBlock; base < (blockIdx.x+1)*eltsPerBlock; base += blockDim.x) {
-        // Elements past the upper bound are treated as invalid
-        if ((base + threadIdx.x) < ub) {
-            validBlock[threadIdx.x] = dgValid[base+threadIdx.x];
-            inBlock[threadIdx.x] = dgData[base+threadIdx.x];
-        } else {
-            validBlock[threadIdx.x] = 0;
-        }
-        __syncthreads();
-        int numValidBlock = compactSIMDPrefixSum(inBlock,validBlock,compactBlock);
-        __syncthreads();
-        // Copy the compacted batch from shared memory to the output
-        if (threadIdx.x < numValidBlock) {
-            dgCompact[blockOutOffset + threadIdx.x] = compactBlock[threadIdx.x];
-        }
-        blockOutOffset += numValidBlock;
-    }
-    if (blockIdx.x == (gridDim.x-1) && threadIdx.x == 0) {
-        *dNumValidElements = blockOutOffset;
-    }
-}
-
-// Phase 2/3 kernel, scattered variant. Same structure as the staged variant,
-// except that each batch is compacted straight into dgCompact (through the
-// advancing compactBlock pointer) instead of going through a shared staging
-// array. The last block's thread 0 writes the total number of valid elements
-// to *dNumValidElements.
-__global__ void moveValidElementsScattered(const T* dgData,T* dgCompact,const unsigned int* dgValid,const unsigned int* dgBlockCounts,size_t eltsPerBlock,size_t len,size_t* dNumValidElements) {
-    __shared__ T inBlock[128];
-    __shared__ unsigned int validBlock[128];
-    T* compactBlock=dgCompact;
-    size_t blockOutOffset = 0;
-    // Sum up the blockCounts before us to find our offset
-    // This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
-    // Paper implements this as a prefix sum kernel in phase II
-    // May still be faster than an extra kernel invocation?
-    for (int base = 0; base < blockIdx.x; base += blockDim.x) {
-        // Load up the count of valid elements for each block before us in batches of 128
-        if ((base + threadIdx.x) < blockIdx.x) {
-            validBlock[threadIdx.x] = dgBlockCounts[base+threadIdx.x];
-        } else {
-            validBlock[threadIdx.x] = 0;
-        }
-        __syncthreads();
-        // Parallel reduce these counts
-        // Accumulate in the final offset variable
-        blockOutOffset += sumReduce128(validBlock);
-    }
-    // Point at this block's first output slot
-    compactBlock += blockOutOffset;
-    // Upper bound of this block's range, clamped to the array length
-    size_t ub;
-    ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
-    for (int base = blockIdx.x * eltsPerBlock; base < (blockIdx.x+1)*eltsPerBlock; base += blockDim.x) {
-        // Elements past the upper bound are treated as invalid
-        if ((base + threadIdx.x) < ub) {
-            validBlock[threadIdx.x] = dgValid[base+threadIdx.x];
-            inBlock[threadIdx.x] = dgData[base+threadIdx.x];
-        } else {
-            validBlock[threadIdx.x] = 0;
-        }
-        __syncthreads();
-        // Writes the valid elements of this batch directly to global memory
-        int numValidBlock = compactSIMDPrefixSum(inBlock,validBlock,compactBlock);
-        blockOutOffset += numValidBlock;
-        compactBlock += numValidBlock;
-    }
-    if (blockIdx.x == (gridDim.x-1) && threadIdx.x == 0) {
-        *dNumValidElements = blockOutOffset;
-    }
-}
-
-// Prepares a compaction plan for the current CUDA device.
-//
-// Chooses 16 thread blocks per multiprocessor and allocates the per-block
-// counter array on the device. d.valid is set to true only if every CUDA call
-// succeeded; on failure the plan is left invalid with no device memory
-// attached, so compactStream() returns -1 and destroyCompactionPlan() does
-// nothing.
-void OPENMMCUDA_EXPORT planCompaction(compactionPlan& d,bool stageOutput) {
-    // Start from a well-defined invalid state
-    d.valid = false;
-    d.dgBlockCounts = 0;
-    d.nThreadBlocks = 0;
-    d.stageOutput = stageOutput;
-
-    int device;
-    if (cudaGetDevice(&device) != cudaSuccess)
-        return;
-    cudaDeviceProp deviceProp;
-    if (cudaGetDeviceProperties(&deviceProp, device) != cudaSuccess)
-        return;
-    d.nThreadBlocks = 16*deviceProp.multiProcessorCount;
-    if (cudaMalloc((void**)&(d.dgBlockCounts), d.nThreadBlocks*sizeof(unsigned int)) != cudaSuccess) {
-        d.dgBlockCounts = 0;
-        return;
-    }
-    d.valid = true;
-}
-
-// Frees the device counter array of a plan; does nothing for a plan that is
-// not marked valid.
-void OPENMMCUDA_EXPORT destroyCompactionPlan(compactionPlan& d) {
-    if (!d.valid)
-        return;
-    cudaFree(d.dgBlockCounts);
-}
-
-// Compacts dIn into dOut, keeping the elements whose dValid entry is non-zero.
-//
-// All pointers refer to device memory. The number of elements written to dOut
-// is stored in *dNumValid (also on the device). Returns 0 on success and -1 if
-// the plan is not valid or the request cannot be carried out.
-int OPENMMCUDA_EXPORT compactStream(const compactionPlan& d,T* dOut,const T* dIn,const unsigned int* dValid,size_t len,size_t* dNumValid) {
-    if (!d.valid) {
-        return -1;
-    }
-    if (len == 0) {
-        // Nothing to compact. Report zero valid elements without launching
-        // any kernel; the block-count computation below would otherwise
-        // divide by zero.
-        return (cudaMemset(dNumValid, 0, sizeof(size_t)) == cudaSuccess) ? 0 : -1;
-    }
-    // Figure out # elements per block
-    unsigned int numBlocks = d.nThreadBlocks;
-    if (numBlocks*128 > len)
-        numBlocks = (len+127)/128;
-    if (numBlocks == 0) {
-        // A plan with no thread blocks cannot process any input
-        return -1;
-    }
-    const size_t eltsPerBlock = len/numBlocks + ((len % numBlocks) ? 1 : 0);
-
-    // TODO: implement loop over blocks of 10M
-    // Phase 1: Calculate number of valid elements per thread block
-    countElts<<<numBlocks,128>>>(d.dgBlockCounts,dValid,eltsPerBlock,len);
-
-    // Phase 2/3: Move valid elements using SIMD compaction
-    if (d.stageOutput) {
-        moveValidElementsStaged<<<numBlocks,128>>>(dIn,dOut,dValid,d.dgBlockCounts,eltsPerBlock,len,dNumValid);
-    } else {
-        moveValidElementsScattered<<<numBlocks,128>>>(dIn,dOut,dValid,d.dgBlockCounts,eltsPerBlock,len,dNumValid);
-    }
-    return 0;
-}
--- a/platforms/cuda-old/src/kernels/cudaCompact.h
+++ b/platforms/cuda-old/src/kernels/cudaCompact.h
-#ifndef __OPENMM_CUDACOMPACT_H__
-#define __OPENMM_CUDACOMPACT_H__
-
-/* Code for CUDA stream compaction. Roughly based on:
-    Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
-        High Performance Graphics 2009.
-
-    Notes:
-        - paper recommends 128 threads/block, so this is hard coded.
-        - I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
-          complicated and performs poorly on current hardware
-        - I only implement the scattered- and staged-write variants of phase III as they have reasonable
-          performance across most of the tested workloads in the paper. The selective variant is not
-          implemented.
-        - The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
-          manner. It is, however, done in a very easy to program manner, and integrated into the top of
-          phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
-          it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
-          a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
-          dgBlockCounts in stages.
-
-  Date:         23 Aug 2009
-  Author:       Imran Haque (ihaque@cs.stanford.edu)
-  Affiliation:  Stanford University
-  License:      Public Domain
-*/
-
-#include "windowsExportCuda.h"
-
-// State shared by the stream-compaction entry points declared below.
-struct compactionPlan {
-    // Checked by compactStream and destroyCompactionPlan before the plan is used
-    bool valid;
-    // Device array holding one valid-element count per thread block
-    unsigned int* dgBlockCounts;
-    // Number of thread blocks the plan was sized for
-    unsigned int nThreadBlocks;
-    // true selects the staged-write kernel, false the scattered-write kernel
-    bool stageOutput;
-};
-
-// Fills in `d` for the current device; stageOutput selects the staged-write
-// (true) or scattered-write (false) kernel.
-extern "C"
-void OPENMMCUDA_EXPORT planCompaction(compactionPlan& d,bool stageOutput=true);
-
-// Releases the device memory held by a valid plan.
-extern "C"
-void OPENMMCUDA_EXPORT destroyCompactionPlan(compactionPlan& d);
-
-// Copies the elements of dIn whose dValid entry is non-zero to dOut and stores
-// their number in *dNumValid. Returns -1 if the plan is not valid, else 0.
-extern "C"
-int OPENMMCUDA_EXPORT compactStream(const compactionPlan& d,unsigned int* dOut,const unsigned int* dIn,const unsigned int* dValid,size_t len,size_t* dNumValid);
-
-#endif // __OPENMM_CUDACOMPACT_H__
--- a/platforms/cuda-old/src/kernels/cudaKernels.h
+++ b/platforms/cuda-old/src/kernels/cudaKernels.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "gputypes.h"
-
-// Initialization
-extern void OPENMMCUDA_EXPORT kClearForces(gpuContext gpu);
-extern void kClearEnergy(gpuContext gpu);
-extern void kClearBornSumAndForces(gpuContext gpu);
-extern void kClearObcGbsaBornSum(gpuContext gpu);
-extern void OPENMMCUDA_EXPORT kCalculateObcGbsaBornSum(gpuContext gpu);
-extern void OPENMMCUDA_EXPORT kReduceObcGbsaBornSum(gpuContext gpu);
-extern void kCalculateGBVIBornSum(gpuContext gpu);
-extern void kReduceGBVIBornSum(gpuContext gpu);
-extern void kClearGBVIBornSum( gpuContext gpu );
-extern void kGenerateRandoms(gpuContext gpu);
-
-// Main loop
-extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
-extern void kCalculateCDLJGBVIForces1(gpuContext gpu);
-extern void kCalculateCDLJForces(gpuContext gpu);
-extern void kCalculateCMAPTorsionForces(gpuContext gpu, CUDAStream<float4>& coefficients, CUDAStream<int2>& mapPositions, CUDAStream<int4>& torsionIndices, CUDAStream<int>& torsionMaps);
-extern void kCalculateCustomBondForces(gpuContext gpu);
-extern void kCalculateCustomAngleForces(gpuContext gpu);
-extern void kCalculateCustomTorsionForces(gpuContext gpu);
-extern void kCalculateCustomExternalForces(gpuContext gpu);
-extern void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid);
-extern void kReduceObcGbsaBornForces(gpuContext gpu);
-extern void OPENMMCUDA_EXPORT kCalculateObcGbsaForces2(gpuContext gpu);
-extern void kCalculateGBVIForces2(gpuContext gpu);
-extern void kCalculateLocalForces(gpuContext gpu);
-extern void kCalculateAndersenThermostat(gpuContext gpu, CUDAStream<int>& atomGroups);
-extern void kReduceBornSumAndForces(gpuContext gpu);
-extern void kApplyShake(gpuContext gpu);
-extern void kApplyCCMA(gpuContext gpu);
-extern void kApplySettle(gpuContext gpu);
-extern void kLangevinUpdatePart1(gpuContext gpu);
-extern void kLangevinUpdatePart2(gpuContext gpu);
-extern void kSelectLangevinStepSize(gpuContext gpu, float maxTimeStep);
-extern void kSetVelocitiesFromPositions(gpuContext gpu);
-extern void kVerletUpdatePart1(gpuContext gpu);
-extern void kVerletUpdatePart2(gpuContext gpu);
-extern void kSelectVerletStepSize(gpuContext gpu, float maxTimeStep);
-extern void kBrownianUpdatePart1(gpuContext gpu);
-extern void kBrownianUpdatePart2(gpuContext gpu);
-extern void kScaleAtomCoordinates(gpuContext gpu, float scale, CUDAStream<int>& moleculeAtoms, CUDAStream<int>& moleculeStartIndex);
-extern void kApplyConstraints(gpuContext gpu);
-
-// Extras
-extern void OPENMMCUDA_EXPORT kReduceForces(gpuContext gpu);
-extern double kReduceEnergy(gpuContext gpu);
-
-// Initializers
-extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
-extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
-extern void SetCalculateCDLJForcesSim(gpuContext gpu);
-extern void GetCalculateCDLJForcesSim(gpuContext gpu);
-extern void SetCalculateCustomBondForcesSim(gpuContext gpu);
-extern void GetCalculateCustomBondForcesSim(gpuContext gpu);
-extern void SetCalculateCustomAngleForcesSim(gpuContext gpu);
-extern void GetCalculateCustomAngleForcesSim(gpuContext gpu);
-extern void SetCalculateCustomTorsionForcesSim(gpuContext gpu);
-extern void GetCalculateCustomTorsionForcesSim(gpuContext gpu);
-extern void SetCalculateCustomExternalForcesSim(gpuContext gpu);
-extern void GetCalculateCustomExternalForcesSim(gpuContext gpu);
-extern void SetCalculateCustomNonbondedForcesSim(gpuContext gpu);
-extern void GetCalculateCustomNonbondedForcesSim(gpuContext gpu);
-extern void SetCalculateLocalForcesSim(gpuContext gpu);
-extern void GetCalculateLocalForcesSim(gpuContext gpu);
-extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
-extern void GetCalculateObcGbsaBornSumSim(gpuContext gpu);
-extern void SetCalculateGBVIBornSumSim(gpuContext gpu);
-extern void GetCalculateGBVIBornSumSim(gpuContext gpu);
-extern void OPENMMCUDA_EXPORT SetCalculateObcGbsaForces2Sim(gpuContext gpu);
-extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
-extern void SetCalculateGBVIForces2Sim(gpuContext gpu);
-extern void GetCalculateGBVIForces2Sim(gpuContext gpu);
-extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
-extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
-extern void SetCalculatePMESim(gpuContext gpu);
-extern void GetCalculatePMESim(gpuContext gpu);
-extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);
-extern void GetForcesSim(gpuContext gpu);
-extern void SetShakeHSim(gpuContext gpu);
-extern void GetShakeHSim(gpuContext gpu);
-extern void SetLangevinUpdateSim(gpuContext gpu);
-extern void GetLangevinUpdateSim(gpuContext gpu);
-extern void SetSettleSim(gpuContext gpu);
-extern void GetSettleSim(gpuContext gpu);
-extern void SetCCMASim(gpuContext gpu);
-extern void GetCCMASim(gpuContext gpu);
-extern void SetVerletUpdateSim(gpuContext gpu);
-extern void GetVerletUpdateSim(gpuContext gpu);
-extern void SetBrownianUpdateSim(gpuContext gpu);
-extern void GetBrownianUpdateSim(gpuContext gpu);
-extern void SetRandomSim(gpuContext gpu);
-extern void GetRandomSim(gpuContext gpu);
-extern void SetCustomBondForceExpression(const Expression<256>& expression);
-extern void SetCustomBondEnergyExpression(const Expression<256>& expression);
-extern void SetCustomBondGlobalParams(const std::vector<float>&  paramValues);
-extern void SetCustomAngleForceExpression(const Expression<256>& expression);
-extern void SetCustomAngleEnergyExpression(const Expression<256>& expression);
-extern void SetCustomAngleGlobalParams(const std::vector<float>&  paramValues);
-extern void SetCustomTorsionForceExpression(const Expression<256>& expression);
-extern void SetCustomTorsionEnergyExpression(const Expression<256>& expression);
-extern void SetCustomTorsionGlobalParams(const std::vector<float>&  paramValues);
-extern void SetCustomExternalForceExpressions(const Expression<256>& expressionX, const Expression<256>& expressionY, const Expression<256>& expressionZ);
-extern void SetCustomExternalEnergyExpression(const Expression<256>& expression);
-extern void SetCustomExternalGlobalParams(const std::vector<float>& paramValues);
-extern void SetCustomNonbondedForceExpression(const Expression<256>& expression);
-extern void SetCustomNonbondedEnergyExpression(const Expression<256>& expression);
-extern void SetCustomNonbondedGlobalParams(const std::vector<float>& paramValues);
-
-extern void kPrintGBVI( gpuContext gpu, std::string callId, int call, FILE* log);
-extern void kPrintObc( gpuContext gpu, std::string callId, int call, FILE* log);
-
--- a/platforms/cuda-old/src/kernels/cudatypes.h
+++ b/platforms/cuda-old/src/kernels/cudatypes.h
--- a/platforms/cuda-old/src/kernels/gpu.cpp
+++ b/platforms/cuda-old/src/kernels/gpu.cpp
--- a/platforms/cuda-old/src/kernels/gputypes.h
+++ b/platforms/cuda-old/src/kernels/gputypes.h