Continuing implementation of Cuda platform

ff88ddad · Peter Eastman · 50090f90 · ff88ddad · ff88ddad · ff88ddad
Commit ff88ddad authored Aug 14, 2008 by Peter Eastman
12 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -63,7 +63,7 @@ ENDIF (NOT CMAKE_CXX_FLAGS_RELEASE)
-# Collect up information about the version of the simbody library we're building
+# Collect up information about the version of the OpenMM library we're building
 # and make it available to the code so it can be built into the binaries.
 SET(OPENMM_LIBRARY_NAME OpenMM)
@@ -206,7 +206,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
    SET(SOURCE_FILES         ${SOURCE_FILES}         ${src_files})   #append
    SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files})
-    ## Make sure we find these locally before looking in SimTK/core/include if
+    ## Make sure we find these locally before looking in OpenMM/include if
    ## OpenMM was previously installed there.
    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
 ENDFOREACH(subdir)

--- a/platforms/cuda/CMakeLists.txt
+++ b/platforms/cuda/CMakeLists.txt
+#---------------------------------------------------
+# OpenMM CUDA Platform
+#
+# Creates OpenMM library, base name=OpenMMCuda.
+# Default libraries are shared & optimized. Variants
+# are created for static (_static) and debug (_d).
+#
+# Windows:
+#   OpenMMCuda[_d].dll
+#   OpenMMCuda[_d].lib
+#   OpenMMCuda_static[_d].lib
+# Unix:
+#   libOpenMMCuda[_d].so
+#   libOpenMMCuda_static[_d].a
+#----------------------------------------------------
+# Also process the CMakeLists.txt found in the tests/ subdirectory.
+SUBDIRS (tests)
+# The source is organized into subdirectories, but we handle them all from
+# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
+# Here the only entry is ".", i.e. this directory's own include/ and src/.
+SET(OPENMM_SOURCE_SUBDIRS .)
+# Base name of the CUDA platform library. The shared and static target names
+# are derived from it: <name> for the shared library, <name>_static for the
+# static one.
+SET(OPENMMCUDA_LIBRARY_NAME OpenMMCuda)
+SET(SHARED_TARGET ${OPENMMCUDA_LIBRARY_NAME})
+SET(STATIC_TARGET ${OPENMMCUDA_LIBRARY_NAME}_static)
+# Ensure that debug libraries have "_d" appended to their names.
+# CMake gets this right on Windows automatically with this definition.
+IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+    SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
+ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+# But on Unix or Cygwin we have to add the suffix manually.
+# This is decided at configure time from CMAKE_BUILD_TYPE, so the target
+# names only change when the build type matches "Debug".
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(SHARED_TARGET ${SHARED_TARGET}_d)
+    SET(STATIC_TARGET ${STATIC_TARGET}_d)
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+# These are all the places to search for header files which are
+# to be part of the API: include/ and include/internal/ of every
+# source subdirectory, as absolute paths.
+SET(API_INCLUDE_DIRS) # start empty
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    # append
+    SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS}
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include 
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
+ENDFOREACH(subdir)
+# Collect every *.h file found directly in the API include directories.
+# Two lists are filled: API_ABS_INCLUDE_FILES and API_REL_INCLUDE_FILES.
+# NOTE(review): because API_INCLUDE_DIRS already holds absolute paths, the
+# entries appended to API_REL_INCLUDE_FILES (${dir}/${filename}) are absolute
+# too, so both lists end up with the same content. Confirm whether truly
+# relative names are needed by any consumer.
+SET(API_REL_INCLUDE_FILES)   # start these out empty
+SET(API_ABS_INCLUDE_FILES)
+FOREACH(dir ${API_INCLUDE_DIRS})
+    FILE(GLOB fullpaths ${dir}/*.h)	# returns full pathnames
+    SET(API_ABS_INCLUDE_FILES ${API_ABS_INCLUDE_FILES} ${fullpaths})
+    FOREACH(pathname ${fullpaths})
+        # Strip the directory part, then re-attach the include directory.
+        GET_FILENAME_COMPONENT(filename ${pathname} NAME)
+        SET(API_REL_INCLUDE_FILES ${API_REL_INCLUDE_FILES} ${dir}/${filename})
+    ENDFOREACH(pathname)
+ENDFOREACH(dir)
+# Collect up source files.
+# SOURCE_FILES receives the *.cpp files from src/ and from its immediate
+# subdirectories; SOURCE_INCLUDE_FILES receives the private *.h files that
+# live directly in src/. Only C++ sources are gathered here.
+SET(SOURCE_FILES) # empty
+SET(SOURCE_INCLUDE_FILES)
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    FILE(GLOB src_files  ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*/*.cpp)
+    FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
+    SET(SOURCE_FILES         ${SOURCE_FILES}         ${src_files})   #append
+    SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files})
+    # BEFORE puts this platform's own include/ ahead of directories that
+    # were added to the include path earlier.
+    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
+ENDFOREACH(subdir)
+# Private headers in src/ are searched first of all.
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+#
+# Include CUDA related files.
+#
+# FindCuda.cmake locates the toolkit and defines CUDA_INCLUDE and
+# CUDA_TARGET_LINK as well as the CUDA_INCLUDE_DIRECTORIES and
+# CUDA_ADD_LIBRARY macros used below.
+INCLUDE(${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake/FindCuda.cmake)
+INCLUDE_DIRECTORIES(${CUDA_INCLUDE})
+# NOTE(review): CUDA_TARGET_LINK is filled with the library files returned by
+# FIND_LIBRARY, not with directories; confirm whether this call is needed.
+LINK_DIRECTORIES(${CUDA_TARGET_LINK})
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+#    FILE(GLOB src_files  ${subdir}/src/*.cu  ${subdir}/src/*/*.cu)
+#    SET(SOURCE_FILES         ${SOURCE_FILES}         ${src_files})   #append
+    # CUDA_INCLUDE_DIRECTORIES turns every argument into a -I option for
+    # nvcc, so only directories may be passed: a BEFORE keyword here would
+    # end up on the nvcc command line as "-IBEFORE".
+    # NOTE(review): the host compiler include path above is based on
+    # CMAKE_CURRENT_SOURCE_DIR while this one uses PROJECT_SOURCE_DIR;
+    # confirm that the difference is intended.
+    CUDA_INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/${subdir}/include)
+ENDFOREACH(subdir)
+# Build the shared and the static flavour of the library from the same
+# sources. The SHARED / STATIC keyword is forwarded to ADD_LIBRARY by the
+# CUDA_ADD_LIBRARY macro.
+CUDA_ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
+CUDA_ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
--- a/platforms/cuda/cuda-cmake/CudaDependency.cmake
+++ b/platforms/cuda/cuda-cmake/CudaDependency.cmake
+#  For more information, please see: http://software.sci.utah.edu
+#
+#  The MIT License
+#
+#  Copyright (c) 2007
+#  Scientific Computing and Imaging Institute, University of Utah
+#
+#  License for the specific language governing rights and limitations under
+#  Permission is hereby granted, free of charge, to any person obtaining a
+#  copy of this software and associated documentation files (the "Software"),
+#  to deal in the Software without restriction, including without limitation
+#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+#  and/or sell copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be included
+#  in all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+#  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+#  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+#  DEALINGS IN THE SOFTWARE.
+# This code is based on the Manta swig/python wrapper dependency checking code.
+# -- Abe Stephens
+#####################################################################
+## CUDA_INCLUDE_NVCC_DEPENDENCIES
+##
+# So we want to try and include the dependency file if it exists.  If
+# it doesn't exist then we need to create an empty one, so we can
+# include it.
+# If it does exist, then we need to check to see if all the files it
+# depends on exist.  If they don't then we should clear the dependency
+# file and regenerate it later.  This covers the case where a header
+# file has disappeared or moved.
+MACRO(CUDA_INCLUDE_NVCC_DEPENDENCIES dependency_file)
+  # Loads the CMake-readable dependency list for one .cu file.
+  #   dependency_file -- path of the generated ".depend" file to include.
+  # Result (set in the caller's scope, since this is a macro):
+  #   CUDA_NVCC_DEPEND -- the files listed by dependency_file, or the
+  #                       dependency file itself when the list has to be
+  #                       regenerated.
+  SET(CUDA_NVCC_DEPEND)
+  SET(CUDA_NVCC_DEPEND_REGENERATE)
+  # Include the dependency file.  Create it first if it doesn't exist
+  # for make files except for IDEs (see below).  The INCLUDE puts a
+  # dependency that will force CMake to rerun and bring in the new info
+  # when it changes.  DO NOT REMOVE THIS (as I did and spent a few hours
+  # figuring out why it didn't work).
+  IF(${CMAKE_MAKE_PROGRAM} MATCHES "make")
+    IF(NOT EXISTS ${dependency_file})
+      CONFIGURE_FILE(
+        ${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake/empty.depend.in
+        ${dependency_file} IMMEDIATE)
+    ENDIF(NOT EXISTS ${dependency_file})
+    # Always include this file to force CMake to run again next
+    # invocation and rebuild the dependencies.
+    INCLUDE(${dependency_file})
+  ELSE(${CMAKE_MAKE_PROGRAM} MATCHES "make")
+    # For IDE generators like MS dev only include the depend files
+    # if they exist.  This is to prevent excessive reloading of
+    # workspaces after each build.  This also means
+    # that the depends will not be correct until cmake
+    # is run once after the build has completed once.
+    INCLUDE(${dependency_file} OPTIONAL)
+  ENDIF(${CMAKE_MAKE_PROGRAM} MATCHES "make")
+  # Now we need to verify the existence of all the included files
+  # here.  If they aren't there we need to just blank this variable and
+  # make the file regenerate again.
+  IF(CUDA_NVCC_DEPEND)
+    FOREACH(f ${CUDA_NVCC_DEPEND})
+      # The empty "then" branch means: nothing to do when the file exists.
+      IF(EXISTS ${f})
+      ELSE(EXISTS ${f})
+        SET(CUDA_NVCC_DEPEND_REGENERATE 1)
+      ENDIF(EXISTS ${f})
+    ENDFOREACH(f)
+  ELSE(CUDA_NVCC_DEPEND)
+    # No dependencies, so regenerate the file.
+    # NOTE(review): the variable set here is CABLE_NVCC_DEPEND_REGENERATE,
+    # but the check below tests CUDA_NVCC_DEPEND_REGENERATE, so as written
+    # this branch does not trigger the regeneration block. The CABLE_ prefix
+    # looks like a leftover. Confirm the intent before renaming: enabling
+    # regeneration here makes CUDA_NVCC_DEPEND point at the dependency file
+    # itself on every configure that starts from an empty dependency list.
+    SET(CABLE_NVCC_DEPEND_REGENERATE 1)
+  ENDIF(CUDA_NVCC_DEPEND)
+  # No incoming dependencies, so we need to generate them.  Make the
+  # output depend on the dependency file itself, which should cause the
+  # rule to re-run.
+  IF(CUDA_NVCC_DEPEND_REGENERATE)
+    SET(CUDA_NVCC_DEPEND ${dependency_file})
+    # Force CMake to run again next build
+    CONFIGURE_FILE(
+      ${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake/empty.depend.in
+      ${dependency_file} IMMEDIATE)
+  ENDIF(CUDA_NVCC_DEPEND_REGENERATE)
+ENDMACRO(CUDA_INCLUDE_NVCC_DEPENDENCIES)
\ No newline at end of file
--- a/platforms/cuda/cuda-cmake/FindCuda.cmake
+++ b/platforms/cuda/cuda-cmake/FindCuda.cmake
+###############################################################################
+#  For more information, please see: http://software.sci.utah.edu
+#
+#  The MIT License
+#
+#  Copyright (c) 2007-2008
+#  Scientific Computing and Imaging Institute, University of Utah
+#
+#  License for the specific language governing rights and limitations under
+#  Permission is hereby granted, free of charge, to any person obtaining a
+#  copy of this software and associated documentation files (the "Software"),
+#  to deal in the Software without restriction, including without limitation
+#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+#  and/or sell copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be included
+#  in all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+#  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+#  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+#  DEALINGS IN THE SOFTWARE.
+#
+# This script locates the Nvidia Compute Unified Device Architecture (CUDA) 
+# tools. It should work on both Linux and Windows, and should be reasonably 
+# up to date with CUDA releases.
+#
+# The script will prompt the user to specify CUDA_INSTALL_PREFIX if the 
+# prefix cannot be determined by the location of nvcc in the system path. To
+# use a different installed version of the toolkit set the environment variable
+# CUDA_BIN_PATH before running cmake (e.g. CUDA_BIN_PATH=/usr/local/cuda1.0 
+# instead of the default /usr/local/cuda).
+#
+# Set CUDA_BUILD_TYPE to "Device" or "Emulation" mode.
+# _DEVICEEMU is defined in "Emulation" mode.
+#
+# Set CUDA_BUILD_CUBIN to "ON" or "OFF" to enable an extra compilation pass
+# with the -cubin option in Device mode. Default ON.
+#
+# The output of that pass is parsed, and the register and shared memory usage
+# of each kernel is printed during the build.
+# 
+# The script creates the following macros:
+# CUDA_INCLUDE_DIRECTORIES( path0 path1 ... )
+# -- Sets the directories that should be passed to nvcc 
+#    (e.g. nvcc -Ipath0 -Ipath1 ... ). These paths usually contain other .cu 
+#    files.
+# 
+# CUDA_ADD_LIBRARY( cuda_target file0 file1 ... )
+# -- Creates a shared library "cuda_target" which contains all of the source 
+#    (*.c, *.cc, etc.) specified and all of the nvcc'ed .cu files specified.
+#    All of the specified source files and generated .c files are compiled 
+#    using the standard CMake compiler, so the normal INCLUDE_DIRECTORIES, 
+#    LINK_DIRECTORIES, and TARGET_LINK_LIBRARIES can be used to affect their
+#    build and link.
+#
+# CUDA_ADD_EXECUTABLE( cuda_target file0 file1 ... )
+# -- Same as CUDA_ADD_LIBRARY except that an executable is created.
+#
+# The script defines the following variables:
+#
+# ( Note: the CUDA_ADD_* macros set up the cuda library dependencies 
+# automatically. These variables are only needed if a cuda API call must be 
+# made from code in an outside library or executable. )
+#
+# CUDA_INCLUDE         -- Include directory for cuda headers.
+# CUDA_TARGET_LINK     -- Cuda RT library. 
+# CUDA_CUT_INCLUDE     -- Include directory for cuda SDK headers (cutil.h).   
+# CUDA_CUT_TARGET_LINK -- SDK libraries.
+# CUDA_NVCC_FLAGS      -- Additional NVCC command line arguments. NOTE: 
+#                         multiple arguments must be semi-colon delimited 
+#                         e.g. --compiler-options;-Wall
+#
+# It might be necessary to set CUDA_INSTALL_PREFIX manually on certain platforms,
+# or to use a cuda runtime not installed in the default location. In newer 
+# versions of the toolkit the cuda library is included with the graphics 
+# driver- be sure that the driver version matches what is needed by the cuda 
+# runtime version.
+# 
+# -- Abe Stephens SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
+###############################################################################
+# FindCuda.cmake
+SET(CMAKE_BACKWARDS_COMPATIBILITY 2.2)
+# Brings in the CUDA_INCLUDE_NVCC_DEPENDENCIES macro. The path is resolved
+# against the directory of the CMakeLists.txt that includes this script, so
+# the cuda-cmake/ directory has to sit next to that file.
+INCLUDE(${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake/CudaDependency.cmake)
+###############################################################################
+###############################################################################
+# Locate CUDA, Set Build Type, etc.
+###############################################################################
+###############################################################################
+# Parse CUDA build type. When nothing was chosen, "Emulation" is stored in
+# the cache as the default.
+IF (NOT CUDA_BUILD_TYPE)
+  SET(CUDA_BUILD_TYPE "Emulation" CACHE STRING "Cuda build type: Emulation or Device")
+ENDIF(NOT CUDA_BUILD_TYPE)
+# nvcc_flags holds the options that depend on the build type; it is appended
+# to every nvcc command line created by the macros further down.
+# Emulation if the card isn't present.
+IF (CUDA_BUILD_TYPE MATCHES "Emulation")
+  # Emulation.
+  SET(nvcc_flags --device-emulation -D_DEVICEEMU -g)
+ELSE(CUDA_BUILD_TYPE MATCHES "Emulation")
+  # Device present.
+  SET(nvcc_flags "")
+ENDIF(CUDA_BUILD_TYPE MATCHES "Emulation")
+# User-tunable cache entries: whether to run the extra -cubin pass, and
+# additional nvcc arguments.
+SET(CUDA_BUILD_CUBIN TRUE CACHE BOOL "Generate and parse .cubin files in Device mode.")
+SET(CUDA_NVCC_FLAGS "" CACHE STRING "Semi-colon delimit multiple arguments.")
+# Search for the cuda distribution.
+# CUDA_INSTALL_PREFIX ends up as the toolkit root (the directory that contains
+# bin/, include/ and lib/). A value supplied by the user is used untouched.
+IF(NOT CUDA_INSTALL_PREFIX)
+  # FIND_PATH returns the directory that holds nvcc, i.e. <prefix>/bin.
+  FIND_PATH(CUDA_INSTALL_PREFIX
+    NAMES nvcc
+    PATHS /usr/local/cuda
+    PATH_SUFFIXES bin
+    ENV CUDA_BIN_PATH
+    DOC "Toolkit location."
+    )
+  IF (CUDA_INSTALL_PREFIX) 
+    # Strip the trailing "bin" component to obtain the prefix itself.
+    STRING(REGEX REPLACE "[/\\\\]?bin[/\\\\]?$" "" CUDA_INSTALL_PREFIX "${CUDA_INSTALL_PREFIX}")
+    # FIND_PATH stored the ".../bin" directory in the cache, while the line
+    # above only changes a normal variable. Write the stripped value back,
+    # otherwise the next configure run skips this block and sees the cached
+    # ".../bin" path as the prefix.
+    SET(CUDA_INSTALL_PREFIX "${CUDA_INSTALL_PREFIX}" CACHE PATH "Toolkit location." FORCE)
+  ENDIF(CUDA_INSTALL_PREFIX)
+  # Quoted so that an empty value is still a well-formed EXISTS test.
+  IF (NOT EXISTS "${CUDA_INSTALL_PREFIX}")
+    MESSAGE(FATAL_ERROR "Specify CUDA_INSTALL_PREFIX")
+  ENDIF (NOT EXISTS "${CUDA_INSTALL_PREFIX}")
+ENDIF (NOT CUDA_INSTALL_PREFIX)
+# CUDA_NVCC
+# Full path of the nvcc compiler driver. The search is skipped when the
+# variable already holds a value.
+IF (NOT CUDA_NVCC)
+  FIND_PROGRAM(CUDA_NVCC
+    NAMES nvcc
+    PATHS ${CUDA_INSTALL_PREFIX}/bin $ENV{CUDA_BIN_PATH}
+    )
+  IF(CUDA_NVCC)
+    # Found: keep the entry out of the default cache view.
+    MARK_AS_ADVANCED(CUDA_NVCC)
+  ELSE(CUDA_NVCC)
+    MESSAGE(FATAL_ERROR "Could not find nvcc")
+  ENDIF(CUDA_NVCC)
+ENDIF(NOT CUDA_NVCC)
+# CUDA_NVCC_INCLUDE_ARGS
+# Locates the toolkit headers and derives two variables from the result:
+#   CUDA_INCLUDE           -- the header directory, for the host compiler.
+#   CUDA_NVCC_INCLUDE_ARGS -- the initial -I option list passed to nvcc.
+# There is no guard around this section, so both plain variables are set
+# again on every configure run.
+  FIND_PATH(FOUND_CUDA_NVCC_INCLUDE
+    device_functions.h # Header included in toolkit
+    PATHS ${CUDA_INSTALL_PREFIX}/include 
+          $ENV{CUDA_INC_PATH}
+    )
+  IF(NOT FOUND_CUDA_NVCC_INCLUDE)
+    MESSAGE(FATAL_ERROR "Could not find Cuda headers")
+  ELSE(NOT FOUND_CUDA_NVCC_INCLUDE)
+    # Set the initial include dir as one "-I<dir>" argument. Writing it as a
+    # single quoted string avoids depending on how the parser treats a quoted
+    # token that is immediately followed by an unquoted one.
+    SET (CUDA_NVCC_INCLUDE_ARGS "-I${FOUND_CUDA_NVCC_INCLUDE}")
+    SET (CUDA_INCLUDE ${FOUND_CUDA_NVCC_INCLUDE})
+    MARK_AS_ADVANCED(
+      FOUND_CUDA_NVCC_INCLUDE
+      CUDA_NVCC_INCLUDE_ARGS
+      )
+  ENDIF(NOT FOUND_CUDA_NVCC_INCLUDE)
+# CUDA_TARGET_LINK
+# List of libraries every CUDA target is linked against: the "cuda" library
+# when one is found in the toolkit's lib directory, followed by "cudart".
+IF (NOT CUDA_TARGET_LINK)
+  FIND_LIBRARY(FOUND_CUDART
+    cudart
+    PATHS ${CUDA_INSTALL_PREFIX}/lib $ENV{CUDA_LIB_PATH}
+    DOC "\"cudart\" library"
+    )
+  # Check to see if cudart library was found.
+  IF(NOT FOUND_CUDART)
+    MESSAGE(FATAL_ERROR "Could not find cudart library (cudart)")
+  ENDIF(NOT FOUND_CUDART)  
+  # 1.1 toolkit on linux doesn't appear to have a separate library on 
+  # some platforms.
+  # The NO_* options restrict this search to the toolkit's own lib directory.
+  FIND_LIBRARY(FOUND_CUDA
+    cuda
+    PATHS ${CUDA_INSTALL_PREFIX}/lib
+    DOC "\"cuda\" library (older versions only)."
+    NO_DEFAULT_PATH
+    NO_CMAKE_ENVIRONMENT_PATH
+    NO_CMAKE_PATH
+    NO_SYSTEM_ENVIRONMENT_PATH
+    NO_CMAKE_SYSTEM_PATH
+    )
+  # Add cuda library to the link line only if it is found.
+  IF (FOUND_CUDA)
+    SET(CUDA_TARGET_LINK ${FOUND_CUDA})
+  ENDIF(FOUND_CUDA)
+  # Always add cudart to the link line.
+  # NOTE(review): FOUND_CUDART was already checked above with a FATAL_ERROR,
+  # so the ELSE branch below looks like a second safety net only.
+  IF(FOUND_CUDART)
+    SET(CUDA_TARGET_LINK
+      ${CUDA_TARGET_LINK} ${FOUND_CUDART}
+      )
+    # NOTE(review): CUDA_LIB is not set anywhere in the visible script.
+    MARK_AS_ADVANCED(
+      CUDA_TARGET_LINK 
+      CUDA_LIB
+      FOUND_CUDA
+      FOUND_CUDART
+      )
+  ELSE(FOUND_CUDART)
+    MESSAGE(FATAL_ERROR "Could not find cuda libraries.")
+  ENDIF(FOUND_CUDART)
+ENDIF(NOT CUDA_TARGET_LINK)
+# CUDA_CUT_INCLUDE
+# Directory holding the SDK utility header cutil.h. Not finding it is not an
+# error: CUDA_CUT_INCLUDE simply stays unset.
+IF(NOT CUDA_CUT_INCLUDE)
+  FIND_PATH(FOUND_CUT_INCLUDE
+    NAMES cutil.h
+    PATHS ${CUDA_INSTALL_PREFIX}/local/NVSDK0.2/common/inc
+          ${CUDA_INSTALL_PREFIX}/NVSDK0.2/common/inc
+          ${CUDA_INSTALL_PREFIX}/NV_CUDA_SDK/common/inc
+          $ENV{HOME}/NVIDIA_CUDA_SDK/common/inc
+          $ENV{HOME}/NVIDIA_CUDA_SDK_MACOSX/common/inc
+          $ENV{NVSDKCUDA_ROOT}/common/inc
+    DOC "Location of cutil.h"
+    )
+  IF(FOUND_CUT_INCLUDE)
+    SET(CUDA_CUT_INCLUDE ${FOUND_CUT_INCLUDE})
+    MARK_AS_ADVANCED(FOUND_CUT_INCLUDE)
+  ENDIF(FOUND_CUT_INCLUDE)
+ENDIF(NOT CUDA_CUT_INCLUDE)
+# CUDA_CUT_TARGET_LINK
+# The SDK utility library (cutil or cutil32), searched only in the listed SDK
+# locations. Not finding it is not an error: CUDA_CUT_TARGET_LINK simply
+# stays unset.
+IF(NOT CUDA_CUT_TARGET_LINK)
+  FIND_LIBRARY(FOUND_CUT
+    NAMES cutil cutil32
+    PATHS ${CUDA_INSTALL_PREFIX}/local/NVSDK0.2/lib
+          ${CUDA_INSTALL_PREFIX}/NVSDK0.2/lib
+          ${CUDA_INSTALL_PREFIX}/NV_CUDA_SDK/lib
+          $ENV{HOME}/NVIDIA_CUDA_SDK/lib
+          $ENV{HOME}/NVIDIA_CUDA_SDK_MACOSX/lib
+          $ENV{NVSDKCUDA_ROOT}/common/lib
+    NO_DEFAULT_PATH
+    NO_CMAKE_ENVIRONMENT_PATH
+    NO_CMAKE_PATH
+    NO_SYSTEM_ENVIRONMENT_PATH
+    NO_CMAKE_SYSTEM_PATH
+    DOC "Location of cutil library"
+    )
+  IF(FOUND_CUT)
+    SET(CUDA_CUT_TARGET_LINK ${FOUND_CUT})
+    MARK_AS_ADVANCED(FOUND_CUT)
+  ENDIF(FOUND_CUT)
+ENDIF(NOT CUDA_CUT_TARGET_LINK)
+###############################################################################
+# CUDA_INCLUDE_DIRECTORIES( path0 path1 ... )
+# Appends one -I<path> option per argument to CUDA_NVCC_INCLUDE_ARGS, the
+# option list handed to every nvcc invocation. Every argument is taken as a
+# directory; no keywords are interpreted.
+MACRO(CUDA_INCLUDE_DIRECTORIES)
+  FOREACH(include_dir ${ARGN})
+    LIST(APPEND CUDA_NVCC_INCLUDE_ARGS -I${include_dir})
+  ENDFOREACH(include_dir)
+ENDMACRO(CUDA_INCLUDE_DIRECTORIES)
+##############################################################################
+##############################################################################
+# This helper macro populates the following variables and sets up custom commands and targets to
+# invoke the nvcc compiler. The compiler is invoked once with -M to generate a dependency file and
+# a second time with -cuda to generate a .c file.
+# ${target_srcs}
+# ${cuda_cu_sources}
+##############################################################################
+##############################################################################
+MACRO(CUDA_add_custom_commands cuda_target)
+  # Helper for CUDA_ADD_LIBRARY / CUDA_ADD_EXECUTABLE.
+  #   cuda_target -- name of the target being built; it becomes part of the
+  #                  generated file names.
+  #   ARGN        -- the remaining arguments of the calling macro.
+  # Results, visible to the caller because this is a macro:
+  #   target_srcs     -- every non-.cu argument unchanged, plus one generated
+  #                      .cc file for each .cu argument.
+  #   cuda_cu_sources -- the .cu arguments themselves.
+  SET(target_srcs "")
+  SET(cuda_cu_sources "")
+  # Iterate over the macro arguments and create custom
+  # commands for all the .cu files.
+  FOREACH(file ${ARGN})
+    IF(${file} MATCHES ".*\\.cu$")
+      # Name the generated file after the file-name component only. Sources
+      # are commonly given as absolute paths (FILE(GLOB) returns full path
+      # names), and embedding such a path below src/cuda would point into
+      # directories that are never created.
+      GET_FILENAME_COMPONENT(cuda_file_name ${file} NAME)
+      SET(generated_file "${CMAKE_BINARY_DIR}/src/cuda/${cuda_file_name}_${cuda_target}_generated.cc")
+      SET(generated_target "${file}_target")
+      FILE(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/src/cuda)
+      SET(source_file ${file})
+      # Note that -cuda generates a .c file not a c++ file.
+      SET_SOURCE_FILES_PROPERTIES(${source_file} PROPERTIES CPLUSPLUS ON)
+      # MESSAGE("${CUDA_NVCC} ${source_file} ${CUDA_NVCC_FLAGS} ${nvcc_flags} -cuda -o ${generated_file} ${CUDA_NVCC_INCLUDE_ARGS}")
+      # Bring in the dependencies.  Creates a variable CUDA_NVCC_DEPEND
+      SET(cmake_dependency_file "${generated_file}.depend")
+      CUDA_INCLUDE_NVCC_DEPENDENCIES(${cmake_dependency_file})
+      SET(NVCC_generated_dependency_file "${generated_file}.NVCC-depend")
+      # Optional extra pass: produce the .cubin text and print its resource
+      # usage through parse_cubin.cmake.
+      IF (CUDA_BUILD_TYPE MATCHES "Device" AND CUDA_BUILD_CUBIN)
+        SET(NVCC_generated_cubin_file "${generated_file}.NVCC-cubin.txt")
+        ADD_CUSTOM_COMMAND(
+          # Generate the .cubin output.
+          OUTPUT ${NVCC_generated_cubin_file}
+          COMMAND ${CUDA_NVCC}
+          ARGS ${source_file}
+          ${CUDA_NVCC_FLAGS}
+          ${nvcc_flags}
+          -DNVCC
+          -cubin
+          -o ${NVCC_generated_cubin_file}
+          ${CUDA_NVCC_INCLUDE_ARGS}
+          # Execute the parser script.
+          COMMAND  ${CMAKE_COMMAND}
+          ARGS
+          -D input_file="${NVCC_generated_cubin_file}"
+          -P "${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake/parse_cubin.cmake"
+          # MAIN_DEPENDENCY ${source_file}
+          DEPENDS ${source_file}
+          DEPENDS ${CUDA_NVCC_DEPEND}
+          COMMENT "Building NVCC -cubin File: ${NVCC_generated_cubin_file}\n"
+          )
+      ELSE (CUDA_BUILD_TYPE MATCHES "Device" AND CUDA_BUILD_CUBIN)
+        # Depend on something that will exist.
+        SET(NVCC_generated_cubin_file "${source_file}")
+      ENDIF (CUDA_BUILD_TYPE MATCHES "Device" AND CUDA_BUILD_CUBIN)
+      # Build the NVCC made dependency file
+      ADD_CUSTOM_COMMAND(
+        OUTPUT ${NVCC_generated_dependency_file}
+        COMMAND ${CUDA_NVCC}
+        ARGS ${source_file}
+             ${CUDA_NVCC_FLAGS}
+             ${nvcc_flags}
+             -DNVCC
+             -M
+             -o ${NVCC_generated_dependency_file}
+             ${CUDA_NVCC_INCLUDE_ARGS}
+        # MAIN_DEPENDENCY ${source_file}
+        DEPENDS ${source_file}
+        DEPENDS ${CUDA_NVCC_DEPEND}
+        COMMENT "Building NVCC Dependency File: ${NVCC_generated_dependency_file}\n"
+        )
+      # Build the CMake readable dependency file
+      ADD_CUSTOM_COMMAND(
+        OUTPUT ${cmake_dependency_file}
+        COMMAND ${CMAKE_COMMAND}
+        ARGS
+        -D input_file="${NVCC_generated_dependency_file}"
+        -D output_file="${cmake_dependency_file}"
+        -P "${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake/make2cmake.cmake"
+        MAIN_DEPENDENCY ${NVCC_generated_dependency_file}
+        COMMENT "Converting NVCC dependency to CMake (${cmake_dependency_file})"
+        )
+      # Translate the .cu file into the source that the host compiler builds.
+      ADD_CUSTOM_COMMAND(
+        OUTPUT ${generated_file}
+        MAIN_DEPENDENCY ${source_file}
+        DEPENDS ${CUDA_NVCC_DEPEND}
+        DEPENDS ${cmake_dependency_file}
+        DEPENDS ${NVCC_generated_cubin_file}
+        COMMAND ${CUDA_NVCC}
+        ARGS ${source_file}
+             ${CUDA_NVCC_FLAGS}
+             ${nvcc_flags}
+             -DNVCC
+             --keep
+             -cuda -o ${generated_file}
+             ${CUDA_NVCC_INCLUDE_ARGS}
+        COMMENT "Building NVCC ${source_file}: ${generated_file}\n"
+        )
+      SET(cuda_cu_sources ${cuda_cu_sources} ${source_file})
+      # Add the generated file name to the source list.
+      SET(target_srcs ${target_srcs} ${generated_file})
+    ELSE(${file} MATCHES ".*\\.cu$")
+      # Otherwise add the file name to the source list.
+      SET(target_srcs ${target_srcs} ${file})
+    ENDIF(${file} MATCHES ".*\\.cu$")
+  ENDFOREACH(file)
+ENDMACRO(CUDA_add_custom_commands)
+###############################################################################
+###############################################################################
+# ADD LIBRARY
+###############################################################################
+###############################################################################
+MACRO(CUDA_ADD_LIBRARY cuda_target)
+  # CUDA_ADD_LIBRARY( cuda_target arg0 arg1 ... )
+  # Sets up the nvcc rules for the .cu arguments, which fills target_srcs and
+  # cuda_cu_sources, then creates the library from both lists and links it
+  # against the libraries in CUDA_TARGET_LINK.
+  CUDA_add_custom_commands(${cuda_target} ${ARGN})
+  ADD_LIBRARY(${cuda_target} ${target_srcs} ${cuda_cu_sources})
+  TARGET_LINK_LIBRARIES(${cuda_target} ${CUDA_TARGET_LINK})
+ENDMACRO(CUDA_ADD_LIBRARY)
+###############################################################################
+###############################################################################
+# ADD EXECUTABLE
+###############################################################################
+###############################################################################
+MACRO(CUDA_ADD_EXECUTABLE cuda_target)
+  # CUDA_ADD_EXECUTABLE( cuda_target arg0 arg1 ... )
+  # Sets up the nvcc rules for the .cu arguments, which fills target_srcs and
+  # cuda_cu_sources, then creates the executable from both lists and links it
+  # against the libraries in CUDA_TARGET_LINK.
+  CUDA_add_custom_commands(${cuda_target} ${ARGN})
+  ADD_EXECUTABLE(${cuda_target} ${target_srcs} ${cuda_cu_sources})
+  TARGET_LINK_LIBRARIES(${cuda_target} ${CUDA_TARGET_LINK})
+ENDMACRO(CUDA_ADD_EXECUTABLE)
--- a/platforms/cuda/cuda-cmake/empty.depend.in
+++ b/platforms/cuda/cuda-cmake/empty.depend.in
--- a/platforms/cuda/cuda-cmake/make2cmake.cmake
+++ b/platforms/cuda/cuda-cmake/make2cmake.cmake
+#  For more information, please see: http://software.sci.utah.edu
+#
+#  The MIT License
+#
+#  Copyright (c) 2007
+#  Scientific Computing and Imaging Institute, University of Utah
+#
+#  License for the specific language governing rights and limitations under
+#  Permission is hereby granted, free of charge, to any person obtaining a
+#  copy of this software and associated documentation files (the "Software"),
+#  to deal in the Software without restriction, including without limitation
+#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+#  and/or sell copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be included
+#  in all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+#  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+#  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+#  DEALINGS IN THE SOFTWARE.
+# Make2cmake CMake Script
+# Abe Stephens and James Bigler
+# (c) 2007 Scientific Computing and Imaging Institute, University of Utah
+# Note that the REGEX expressions may need to be tweaked for different dependency generators.
+# Script inputs (passed with -D on the cmake -P command line):
+#   input_file  -- make-style dependency file written by "nvcc -M".
+#   output_file -- CMake file to write; it defines CUDA_NVCC_DEPEND.
+FILE(READ ${input_file} depend_text)
+# Start from an empty list so that an empty input still produces a valid
+# SET(CUDA_NVCC_DEPEND) statement in the output.
+SET(cuda_nvcc_depend "")
+# The text is expanded inside quotes everywhere below: unquoted, an empty
+# file would leave the IF without a left operand, and any semicolon in the
+# text would split it into several arguments.
+IF ("${depend_text}" MATCHES ".+")
+  # MESSAGE("FOUND DEPENDS")
+  # Remember, four backslashes is escaped to one backslash in the string.
+  STRING(REGEX REPLACE "\\\\ " " " depend_text "${depend_text}")
+  # This works for the nvcc -M generated dependency files.
+  # Drop the "<target> : " prefix, then turn the line continuations into
+  # list separators so that every dependency becomes one list element.
+  STRING(REGEX REPLACE "^.* : " "" depend_text "${depend_text}")
+  STRING(REGEX REPLACE "[ \\\\]*\n" ";" depend_text "${depend_text}")
+  FOREACH(file ${depend_text})
+    # Remove the indentation in front of the file name.
+    STRING(REGEX REPLACE "^ +" "" file "${file}")
+    # IF (EXISTS ${file})
+    #   MESSAGE("DEPEND = ${file}")
+    # ELSE (EXISTS ${file})
+    #   MESSAGE("ERROR = ${file}")
+    # ENDIF(EXISTS ${file})
+    SET(cuda_nvcc_depend "${cuda_nvcc_depend} \"${file}\"\n")
+  ENDFOREACH(file) 
+ELSE("${depend_text}" MATCHES ".+") 
+  # MESSAGE("FOUND NO DEPENDS")
+ENDIF("${depend_text}" MATCHES ".+")
+FILE(WRITE ${output_file} "# Generated by: make2cmake.cmake\nSET(CUDA_NVCC_DEPEND\n ${cuda_nvcc_depend})\n\n")
--- a/platforms/cuda/cuda-cmake/parse_cubin.cmake
+++ b/platforms/cuda/cuda-cmake/parse_cubin.cmake
+#  For more information, please see: http://software.sci.utah.edu
+#
+#  The MIT License
+#
+#  Copyright (c) 2007
+#  Scientific Computing and Imaging Institute, University of Utah
+#
+#  License for the specific language governing rights and limitations under
+#  Permission is hereby granted, free of charge, to any person obtaining a
+#  copy of this software and associated documentation files (the "Software"),
+#  to deal in the Software without restriction, including without limitation
+#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+#  and/or sell copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be included
+#  in all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+#  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+#  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+#  DEALINGS IN THE SOFTWARE.
+# .cubin Parsing CMake Script
+# Abe Stephens
+# (c) 2007 Scientific Computing and Imaging Institute, University of Utah
+# Script input (passed with -D on the cmake -P command line):
+#   input_file -- text .cubin file written by "nvcc -cubin".
+# For every "code" block the kernel name and its register, local memory and
+# shared memory entries are printed with MESSAGE.
+FILE(READ ${input_file} file_text)
+# Quoted so that an empty file still gives the IF a left operand.
+IF ("${file_text}" MATCHES ".+")
+  # Remember, four backslashes is escaped to one backslash in the string.
+  STRING(REGEX REPLACE ";" "\\\\;" file_text ${file_text})
+  # Make every "code" block start a new list element.
+  STRING(REGEX REPLACE "\ncode" ";code" file_text ${file_text})
+  LIST(LENGTH file_text len)
+  FOREACH(line ${file_text})
+    # Only look at "code { }" blocks.
+    IF(line MATCHES "^code")
+      # Break into individual lines.
+      STRING(REGEX REPLACE "\n" ";" line ${line})
+      FOREACH(entry ${line})
+        # Extract kernel names.
+        IF (${entry} MATCHES "[^g]name = ([^ ]+)")
+          STRING(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry ${entry})
+          # Check to see if the kernel name starts with "_"
+          SET(skip FALSE)
+          # IF (${entry} MATCHES "^_")
+            # Skip the rest of this block.
+            # MESSAGE("Skipping ${entry}")
+            # SET(skip TRUE)
+          # ELSE (${entry} MATCHES "^_")
+            MESSAGE("Kernel:    ${entry}")  
+          # ENDIF (${entry} MATCHES "^_")
+        ENDIF(${entry} MATCHES "[^g]name = ([^ ]+)")
+        # Skip the rest of the block if necessary
+        IF(NOT skip)
+          # Registers
+          IF (${entry} MATCHES "reg = ([^ ]+)")
+            STRING(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry ${entry})
+            MESSAGE("Registers: ${entry}")
+          ENDIF(${entry} MATCHES "reg = ([^ ]+)")
+          # Local memory
+          IF (${entry} MATCHES "lmem = ([^ ]+)")
+            STRING(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry ${entry})
+            MESSAGE("Local:     ${entry}")
+          ENDIF(${entry} MATCHES "lmem = ([^ ]+)")
+          # Shared memory
+          IF (${entry} MATCHES "smem = ([^ ]+)")
+            STRING(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry ${entry})
+            MESSAGE("Shared:    ${entry}")
+          ENDIF(${entry} MATCHES "smem = ([^ ]+)")
+          # A closing brace ends the block: print a separator line.
+          IF (${entry} MATCHES "^}")
+            MESSAGE("")
+          ENDIF(${entry} MATCHES "^}")
+        ENDIF(NOT skip)
+      ENDFOREACH(entry)
+    ENDIF(line MATCHES "^code")
+  ENDFOREACH(line) 
+# The ELSE / ENDIF arguments repeat the IF condition on file_text so that the
+# block opens and closes with matching arguments.
+ELSE("${file_text}" MATCHES ".+") 
+  # Empty input file: nothing to report.
+ENDIF("${file_text}" MATCHES ".+")
--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -31,12 +31,14 @@
 #include "CudaKernelFactory.h"
 #include "CudaKernels.h"
+#include "internal/OpenMMContextImpl.h"
 using namespace OpenMM;
 KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform& platform, OpenMMContextImpl& context) const {
-//    if (name == CalcStandardMMForceFieldKernel::Name())
+    _gpuContext* gpu = static_cast<_gpuContext*>(context.getPlatformData());
-//        return new CudaCalcStandardMMForceFieldKernel(name, platform);
+    if (name == CalcStandardMMForceFieldKernel::Name())
+        return new CudaCalcStandardMMForceFieldKernel(name, platform, gpu);
 //    if (name == CalcGBSAOBCForceFieldKernel::Name())
 //        return new CudaCalcGBSAOBCForceFieldKernel(name, platform);
 //    if (name == IntegrateVerletStepKernel::Name())

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "CudaKernels.h"
+#include "CudaStreamImpl.h"
+#include "kernels/gputypes.h"
+#include <cmath>
+extern "C" int gpuSetConstants( gpuContext gpu );
+using namespace OpenMM;
+using namespace std;
+CudaCalcStandardMMForceFieldKernel::~CudaCalcStandardMMForceFieldKernel() { // Intentionally empty: the streams created in initialize() are stored on the gpu context, not on this object
+} // NOTE(review): confirm that gpuShutDown() frees those CUDAStream objects, otherwise they leak
+void CudaCalcStandardMMForceFieldKernel::initialize(const vector<vector<int> >& bondIndices, const vector<vector<double> >& bondParameters, // Copies every force-field term into streams attached to the gpu context and uploads them
+        const vector<vector<int> >& angleIndices, const vector<vector<double> >& angleParameters,
+        const vector<vector<int> >& periodicTorsionIndices, const vector<vector<double> >& periodicTorsionParameters,
+        const vector<vector<int> >& rbTorsionIndices, const vector<vector<double> >& rbTorsionParameters,
+        const vector<vector<int> >& bonded14Indices, double lj14Scale, double coulomb14Scale,
+        const vector<set<int> >& exclusions, const vector<vector<double> >& nonbondedParameters,
+        NonbondedMethod nonbondedMethod, double nonbondedCutoff, double periodicBoxSize[3]) { // NOTE(review): nonbondedMethod, nonbondedCutoff and periodicBoxSize are not read anywhere in this body yet
+    numAtoms = nonbondedParameters.size(); // one nonbondedParameters entry per atom
+    numBonds = bondIndices.size();
+    numAngles = angleIndices.size();
+    numPeriodicTorsions = periodicTorsionIndices.size();
+    numRBTorsions = rbTorsionIndices.size();
+    num14 = bonded14Indices.size();
+    // Initialize bonds: one int4 (atom1, atom2, slot1, slot2) and one float2 of parameters per bond.
+    gpu->sim.bonds                      = numBonds;
+    CUDAStream<int4>* psBondID          = new CUDAStream<int4>(numBonds, 1);
+    gpu->psBondID                       = psBondID; // the gpu context keeps the pointer; this kernel never deletes it
+    gpu->sim.pBondID                    = psBondID->_pDevStream[0];
+    CUDAStream<float2>* psBondParameter = new CUDAStream<float2>(numBonds, 1);
+    gpu->psBondParameter                = psBondParameter;
+    gpu->sim.pBondParameter             = psBondParameter->_pDevStream[0];
+    for (int i = 0; i < numBonds; i++ ) {
+        psBondID->_pSysStream[0][i].x        = bondIndices[i][0];
+        psBondID->_pSysStream[0][i].y        = bondIndices[i][1];
+        psBondID->_pSysStream[0][i].z        = gpu->pOutputBufferCounter[psBondID->_pSysStream[0][i].x]++; // per-atom counter hands out the next slot index; presumably an output-buffer slot for this atom - confirm
+        psBondID->_pSysStream[0][i].w        = gpu->pOutputBufferCounter[psBondID->_pSysStream[0][i].y]++; // order matters: if both indices were equal, z would get the lower slot
+        psBondParameter->_pSysStream[0][i].x = bondParameters[i][0];
+        psBondParameter->_pSysStream[0][i].y = bondParameters[i][1];
+    }
+    psBondID->Upload(); // presumably copies the host-side _pSysStream data to the device - confirm against CUDAStream
+    psBondParameter->Upload();
+    // Initialize angles: the three atoms plus the first slot go in ID1, the other two slots in ID2.
+    gpu->sim.bond_angles                     = numAngles;
+    CUDAStream<int4>* psBondAngleID1         = new CUDAStream<int4>(numAngles, 1);
+    gpu->psBondAngleID1                      = psBondAngleID1;
+    gpu->sim.pBondAngleID1                   = psBondAngleID1->_pDevStream[0];
+    CUDAStream<int2>* psBondAngleID2         = new CUDAStream<int2>(numAngles, 1);
+    gpu->psBondAngleID2                      = psBondAngleID2;
+    gpu->sim.pBondAngleID2                   = psBondAngleID2->_pDevStream[0];
+    CUDAStream<float2>* psBondAngleParameter = new CUDAStream<float2>(numAngles, 1);
+    gpu->psBondAngleParameter                = psBondAngleParameter;
+    gpu->sim.pBondAngleParameter             = psBondAngleParameter->_pDevStream[0];
+    for (int i = 0; i < numAngles; i++) {
+        psBondAngleID1->_pSysStream[0][i].x         = angleIndices[i][0];
+        psBondAngleID1->_pSysStream[0][i].y         = angleIndices[i][1];
+        psBondAngleID1->_pSysStream[0][i].z         = angleIndices[i][2];
+        psBondAngleID1->_pSysStream[0][i].w         = gpu->pOutputBufferCounter[psBondAngleID1->_pSysStream[0][i].x]++;
+        psBondAngleID2->_pSysStream[0][i].x         = gpu->pOutputBufferCounter[psBondAngleID1->_pSysStream[0][i].y]++;
+        psBondAngleID2->_pSysStream[0][i].y         = gpu->pOutputBufferCounter[psBondAngleID1->_pSysStream[0][i].z]++;
+        psBondAngleParameter->_pSysStream[0][i].x   = angleParameters[i][0]*180.0/M_PI; // scaled by 180/pi; presumably radians -> degrees for the GPU kernel - confirm
+        psBondAngleParameter->_pSysStream[0][i].y   = angleParameters[i][1];
+    }
+    psBondAngleID1->Upload();
+    psBondAngleID2->Upload();
+    psBondAngleParameter->Upload();
+    // Initialize periodic torsions: four atoms in ID1, their four slots in ID2, three parameters in a float4.
+    gpu->sim.dihedrals = numPeriodicTorsions;
+    CUDAStream<int4>* psDihedralID1             = new CUDAStream<int4>(numPeriodicTorsions, 1);
+    gpu->psDihedralID1                          = psDihedralID1;
+    gpu->sim.pDihedralID1                       = psDihedralID1->_pDevStream[0];
+    CUDAStream<int4>* psDihedralID2             = new CUDAStream<int4>(numPeriodicTorsions, 1);
+    gpu->psDihedralID2                          = psDihedralID2;
+    gpu->sim.pDihedralID2                       = psDihedralID2->_pDevStream[0];
+    CUDAStream<float4>* psDihedralParameter     = new CUDAStream<float4>(numPeriodicTorsions, 1);
+    gpu->psDihedralParameter                    = psDihedralParameter;
+    gpu->sim.pDihedralParameter                 = psDihedralParameter->_pDevStream[0];
+    for (int i = 0; i < numPeriodicTorsions; i++) {
+        psDihedralID1->_pSysStream[0][i].x              = periodicTorsionIndices[i][0];
+        psDihedralID1->_pSysStream[0][i].y              = periodicTorsionIndices[i][1];
+        psDihedralID1->_pSysStream[0][i].z              = periodicTorsionIndices[i][2];
+        psDihedralID1->_pSysStream[0][i].w              = periodicTorsionIndices[i][3];
+        psDihedralID2->_pSysStream[0][i].x              = gpu->pOutputBufferCounter[psDihedralID1->_pSysStream[0][i].x]++;
+        psDihedralID2->_pSysStream[0][i].y              = gpu->pOutputBufferCounter[psDihedralID1->_pSysStream[0][i].y]++;
+        psDihedralID2->_pSysStream[0][i].z              = gpu->pOutputBufferCounter[psDihedralID1->_pSysStream[0][i].z]++;
+        psDihedralID2->_pSysStream[0][i].w              = gpu->pOutputBufferCounter[psDihedralID1->_pSysStream[0][i].w]++;
+        psDihedralParameter->_pSysStream[0][i].x        = periodicTorsionParameters[i][0];
+        psDihedralParameter->_pSysStream[0][i].y        = periodicTorsionParameters[i][1];
+        psDihedralParameter->_pSysStream[0][i].z        = periodicTorsionParameters[i][2];
+        psDihedralParameter->_pSysStream[0][i].w        = 0.0f; // only three parameters are supplied; the fourth component is zeroed
+    }
+    psDihedralID1->Upload();
+    psDihedralID2->Upload();
+    psDihedralParameter->Upload();
+    // Initialize Ryckaert-Bellemans torsions: the six coefficients are split across a float4 (0-3) and a float2 (4-5).
+    gpu->sim.rb_dihedrals = numRBTorsions;
+    CUDAStream<int4>* psRbDihedralID1           = new CUDAStream<int4>(numRBTorsions, 1);
+    gpu->psRbDihedralID1                        = psRbDihedralID1;
+    gpu->sim.pRbDihedralID1                     = psRbDihedralID1->_pDevStream[0];
+    CUDAStream<int4>* psRbDihedralID2           = new CUDAStream<int4>(numRBTorsions, 1);
+    gpu->psRbDihedralID2                        = psRbDihedralID2;
+    gpu->sim.pRbDihedralID2                     = psRbDihedralID2->_pDevStream[0];
+    CUDAStream<float4>* psRbDihedralParameter1  = new CUDAStream<float4>(numRBTorsions, 1);
+    gpu->psRbDihedralParameter1                 = psRbDihedralParameter1;
+    gpu->sim.pRbDihedralParameter1              = psRbDihedralParameter1->_pDevStream[0];
+    CUDAStream<float2>* psRbDihedralParameter2  = new CUDAStream<float2>(numRBTorsions, 1);	
+    gpu->psRbDihedralParameter2                 = psRbDihedralParameter2;
+    gpu->sim.pRbDihedralParameter2              = psRbDihedralParameter2->_pDevStream[0];
+    for (int i = 0; i < numRBTorsions; i++) {
+        psRbDihedralID1->_pSysStream[0][i].x            = rbTorsionIndices[i][0];
+        psRbDihedralID1->_pSysStream[0][i].y            = rbTorsionIndices[i][1];
+        psRbDihedralID1->_pSysStream[0][i].z            = rbTorsionIndices[i][2];
+        psRbDihedralID1->_pSysStream[0][i].w            = rbTorsionIndices[i][3];
+        psRbDihedralID2->_pSysStream[0][i].x            = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysStream[0][i].x]++;
+        psRbDihedralID2->_pSysStream[0][i].y            = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysStream[0][i].y]++;
+        psRbDihedralID2->_pSysStream[0][i].z            = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysStream[0][i].z]++;
+        psRbDihedralID2->_pSysStream[0][i].w            = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysStream[0][i].w]++;
+        psRbDihedralParameter1->_pSysStream[0][i].x     = rbTorsionParameters[i][0];
+        psRbDihedralParameter1->_pSysStream[0][i].y     = rbTorsionParameters[i][1];
+        psRbDihedralParameter1->_pSysStream[0][i].z     = rbTorsionParameters[i][2];
+        psRbDihedralParameter1->_pSysStream[0][i].w     = rbTorsionParameters[i][3];
+        psRbDihedralParameter2->_pSysStream[0][i].x     = rbTorsionParameters[i][4];
+        psRbDihedralParameter2->_pSysStream[0][i].y     = rbTorsionParameters[i][5];
+    }
+    psRbDihedralID1->Upload();
+    psRbDihedralID2->Upload();
+    psRbDihedralParameter1->Upload();
+    psRbDihedralParameter2->Upload();
+    // Initialize nonbonded interactions: writes into streams that already exist on the gpu context (nothing is allocated here).
+    for (int i = 0; i < numAtoms; i++) {
+        gpu->psPosq4->_pSysStream[0][i].w = nonbondedParameters[i][0]; // element 0 goes in the w component of the position stream
+        gpu->psSigEps2->_pSysStream[0][i].x = nonbondedParameters[i][1];
+        gpu->psSigEps2->_pSysStream[0][i].y = nonbondedParameters[i][2];
+    }
+    gpu->psPosq4->Upload();
+    gpu->psSigEps2->Upload();
+    // Initialize 1-4 nonbonded interactions: per pair, the two atoms and their slots plus combined parameters.
+    gpu->sim.LJ14s                              = num14;
+    CUDAStream<int4>* psLJ14ID                  = new CUDAStream<int4>(num14, 1);
+    gpu->psLJ14ID                               = psLJ14ID;
+    gpu->sim.pLJ14ID                            = psLJ14ID->_pDevStream[0];
+    CUDAStream<float4>* psLJ14Parameter         = new CUDAStream<float4>(num14, 1);
+    gpu->psLJ14Parameter                        = psLJ14Parameter;
+    gpu->sim.pLJ14Parameter                     = psLJ14Parameter->_pDevStream[0];
+    double sqrtEps = std::sqrt(138.935485); // NOTE(review): 138.935485 looks like the Coulomb constant in kJ*nm/(mol*e^2) - confirm units
+    for (int i = 0; i < num14; i++) {
+        int atom1 = bonded14Indices[i][0];
+        int atom2 = bonded14Indices[i][1];
+        double atom1params[] = {0.5*nonbondedParameters[atom1][1], 2.0*sqrt(nonbondedParameters[atom1][2]), nonbondedParameters[atom1][0]*sqrtEps}; // {param[1]/2, 2*sqrt(param[2]), param[0]*sqrtEps}
+        double atom2params[] = {0.5*nonbondedParameters[atom2][1], 2.0*sqrt(nonbondedParameters[atom2][2]), nonbondedParameters[atom2][0]*sqrtEps};
+        psLJ14ID->_pSysStream[0][i].x          = atom1;
+        psLJ14ID->_pSysStream[0][i].y          = atom2;
+        psLJ14ID->_pSysStream[0][i].z          = gpu->pOutputBufferCounter[psLJ14ID->_pSysStream[0][i].x]++;
+        psLJ14ID->_pSysStream[0][i].w          = gpu->pOutputBufferCounter[psLJ14ID->_pSysStream[0][i].y]++;
+        psLJ14Parameter->_pSysStream[0][i].x   = atom1params[0]+atom2params[0]; // sum of the two halved values, i.e. their arithmetic mean
+        psLJ14Parameter->_pSysStream[0][i].y   = lj14Scale*(atom1params[1]*atom2params[1]);
+        psLJ14Parameter->_pSysStream[0][i].z   = coulomb14Scale*(atom1params[2]*atom2params[2]); // NOTE(review): the .w component is never written before Upload() - confirm the kernel ignores it
+    }
+    psLJ14ID->Upload();
+    psLJ14Parameter->Upload();
+    // Initialize exclusions.
+    // TODO (the exclusions argument is not used yet)
+    // Finish initialization by handing the populated context to gpuSetConstants.
+    gpuSetConstants(gpu);
+}
+void CudaCalcStandardMMForceFieldKernel::executeForces(const Stream& positions, Stream& forces) { // TODO: not implemented yet; currently a no-op that leaves 'forces' untouched
+}
+/**
+ * Placeholder for the potential-energy calculation on the GPU.
+ *
+ * @param positions   the atom positions (currently ignored)
+ * @return always 0.0 until the CUDA energy kernel is implemented
+ */
+double CudaCalcStandardMMForceFieldKernel::executeEnergy(const Stream& positions) {
+    // TODO: compute the energy on the GPU.  Until then return a defined value:
+    // flowing off the end of a non-void function is undefined behaviour, so the
+    // previous empty body handed callers an indeterminate result.
+    return 0.0;
+}
+//CudaCalcGBSAOBCForceFieldKernel::~CudaCalcGBSAOBCForceFieldKernel() {
+//}
+//
+//void CudaCalcGBSAOBCForceFieldKernel::initialize(const vector<vector<double> >& atomParameters, double solventDielectric, double soluteDielectric) {
+//}
+//
+//void CudaCalcGBSAOBCForceFieldKernel::executeForces(const Stream& positions, Stream& forces) {
+//}
+//
+//double CudaCalcGBSAOBCForceFieldKernel::executeEnergy(const Stream& positions) {
+//}
+//
+//CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() {
+//}
+//
+//void CudaIntegrateVerletStepKernel::initialize(const vector<double>& masses, const vector<vector<int> >& constraintIndices,
+//        const vector<double>& constraintLengths) {
+//}
+//
+//void CudaIntegrateVerletStepKernel::execute(Stream& positions, Stream& velocities, const Stream& forces, double stepSize) {
+//}
+//
+//CudaIntegrateLangevinStepKernel::~CudaIntegrateLangevinStepKernel() {
+//}
+//
+//void CudaIntegrateLangevinStepKernel::initialize(const vector<double>& masses, const vector<vector<int> >& constraintIndices,
+//        const vector<double>& constraintLengths) {
+//}
+//
+//void CudaIntegrateLangevinStepKernel::execute(Stream& positions, Stream& velocities, const Stream& forces, double temperature, double friction, double stepSize) {
+//}
+//
+//CudaIntegrateBrownianStepKernel::~CudaIntegrateBrownianStepKernel() {
+//}
+//
+//void CudaIntegrateBrownianStepKernel::initialize(const vector<double>& masses, const vector<vector<int> >& constraintIndices,
+//        const vector<double>& constraintLengths) {
+//}
+//
+//void CudaIntegrateBrownianStepKernel::execute(Stream& positions, Stream& velocities, const Stream& forces, double temperature, double friction, double stepSize) {
+//}
+//
+//CudaApplyAndersenThermostatKernel::~CudaApplyAndersenThermostatKernel() {
+//}
+//
+//void CudaApplyAndersenThermostatKernel::initialize(const vector<double>& masses) {
+//}
+//
+//void CudaApplyAndersenThermostatKernel::execute(Stream& velocities, double temperature, double collisionFrequency, double stepSize) {
+//}
+//
+//void CudaCalcKineticEnergyKernel::initialize(const vector<double>& masses) {
+//}
+//
+//double CudaCalcKineticEnergyKernel::execute(const Stream& velocities) {
+//}
+//
+//void CudaRemoveCMMotionKernel::initialize(const vector<double>& masses) {
+//}
+//
+//void CudaRemoveCMMotionKernel::execute(Stream& velocities) {
+//}
--- a/platforms/cuda/src/CudaKernels.h
+++ b/platforms/cuda/src/CudaKernels.h
@@ -33,6 +33,7 @@
 * -------------------------------------------------------------------------- */
 #include "kernels.h"
+#include "kernels/gpuTypes.h"
 class CudaAndersenThermostat;
 class CudaBrownianDynamics;
@@ -42,61 +43,63 @@ class CudaVerletDynamics;
 namespace OpenMM {
-///**
+/**
-// * This kernel is invoked by StandardMMForceField to calculate the forces acting on the system.
+ * This kernel is invoked by StandardMMForceField to calculate the forces acting on the system.
-// */
+ */
-//class CudaCalcStandardMMForceFieldKernel : public CalcStandardMMForceFieldKernel {
+class CudaCalcStandardMMForceFieldKernel : public CalcStandardMMForceFieldKernel {
-//public:
+public:
-//    CudaCalcStandardMMForceFieldKernel(std::string name, const Platform& platform) : CalcStandardMMForceFieldKernel(name, platform) {
+    CudaCalcStandardMMForceFieldKernel(std::string name, const Platform& platform, _gpuContext* gpu) : CalcStandardMMForceFieldKernel(name, platform), gpu(gpu) {
-//    }
+    }
-//    ~CudaCalcStandardMMForceFieldKernel();
+    ~CudaCalcStandardMMForceFieldKernel();
-//    /**
+    /**
-//     * Initialize the kernel, setting up the values of all the force field parameters.
+     * Initialize the kernel, setting up the values of all the force field parameters.
-//     * 
+     * 
-//     * @param bondIndices               the two atoms connected by each bond term
+     * @param bondIndices               the two atoms connected by each bond term
-//     * @param bondParameters            the force parameters (length, k) for each bond term
+     * @param bondParameters            the force parameters (length, k) for each bond term
-//     * @param angleIndices              the three atoms connected by each angle term
+     * @param angleIndices              the three atoms connected by each angle term
-//     * @param angleParameters           the force parameters (angle, k) for each angle term
+     * @param angleParameters           the force parameters (angle, k) for each angle term
-//     * @param periodicTorsionIndices    the four atoms connected by each periodic torsion term
+     * @param periodicTorsionIndices    the four atoms connected by each periodic torsion term
-//     * @param periodicTorsionParameters the force parameters (k, phase, periodicity) for each periodic torsion term
+     * @param periodicTorsionParameters the force parameters (k, phase, periodicity) for each periodic torsion term
-//     * @param rbTorsionIndices          the four atoms connected by each Ryckaert-Bellemans torsion term
+     * @param rbTorsionIndices          the four atoms connected by each Ryckaert-Bellemans torsion term
-//     * @param rbTorsionParameters       the coefficients (in order of increasing powers) for each Ryckaert-Bellemans torsion term
+     * @param rbTorsionParameters       the coefficients (in order of increasing powers) for each Ryckaert-Bellemans torsion term
-//     * @param bonded14Indices           each element contains the indices of two atoms whose nonbonded interactions should be reduced since
+     * @param bonded14Indices           each element contains the indices of two atoms whose nonbonded interactions should be reduced since
-//     *                                  they form a bonded 1-4 pair
+     *                                  they form a bonded 1-4 pair
-//     * @param lj14Scale                 the factor by which van der Waals interactions should be reduced for bonded 1-4 pairs
+     * @param lj14Scale                 the factor by which van der Waals interactions should be reduced for bonded 1-4 pairs
-//     * @param coulomb14Scale            the factor by which Coulomb interactions should be reduced for bonded 1-4 pairs
+     * @param coulomb14Scale            the factor by which Coulomb interactions should be reduced for bonded 1-4 pairs
-//     * @param exclusions                the i'th element lists the indices of all atoms with which the i'th atom should not interact through
+     * @param exclusions                the i'th element lists the indices of all atoms with which the i'th atom should not interact through
-//     *                                  nonbonded forces.  Bonded 1-4 pairs are also included in this list, since they should be omitted from
+     *                                  nonbonded forces.  Bonded 1-4 pairs are also included in this list, since they should be omitted from
-//     *                                  the standard nonbonded calculation.
+     *                                  the standard nonbonded calculation.
-//     * @param nonbondedParameters       the nonbonded force parameters (charge, sigma, epsilon) for each atom
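+     * @param nonbondedParameters       the nonbonded force parameters (charge, sigma, epsilon) for each atom
+     * @param nonbondedMethod           the NonbondedMethod selected for nonbonded interactions (not yet read by the CUDA implementation)
+     * @param nonbondedCutoff           the cutoff distance passed for nonbonded interactions (not yet read by the CUDA implementation)
+     * @param periodicBoxSize           three-element array giving the periodic box size (not yet read by the CUDA implementation)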
-//     */
+     */
-//    void initialize(const std::vector<std::vector<int> >& bondIndices, const std::vector<std::vector<double> >& bondParameters,
+    void initialize(const std::vector<std::vector<int> >& bondIndices, const std::vector<std::vector<double> >& bondParameters,
-//            const std::vector<std::vector<int> >& angleIndices, const std::vector<std::vector<double> >& angleParameters,
+            const std::vector<std::vector<int> >& angleIndices, const std::vector<std::vector<double> >& angleParameters,
-//            const std::vector<std::vector<int> >& periodicTorsionIndices, const std::vector<std::vector<double> >& periodicTorsionParameters,
+            const std::vector<std::vector<int> >& periodicTorsionIndices, const std::vector<std::vector<double> >& periodicTorsionParameters,
-//            const std::vector<std::vector<int> >& rbTorsionIndices, const std::vector<std::vector<double> >& rbTorsionParameters,
+            const std::vector<std::vector<int> >& rbTorsionIndices, const std::vector<std::vector<double> >& rbTorsionParameters,
-//            const std::vector<std::vector<int> >& bonded14Indices, double lj14Scale, double coulomb14Scale,
+            const std::vector<std::vector<int> >& bonded14Indices, double lj14Scale, double coulomb14Scale,
-//            const std::vector<std::set<int> >& exclusions, const std::vector<std::vector<double> >& nonbondedParameters);
+            const std::vector<std::set<int> >& exclusions, const std::vector<std::vector<double> >& nonbondedParameters,
-//    /**
+            NonbondedMethod nonbondedMethod, double nonbondedCutoff, double periodicBoxSize[3]);
-//     * Execute the kernel to calculate the forces.
+    /**
-//     * 
+     * Execute the kernel to calculate the forces.
-//     * @param positions   a Stream of type Double3 containing the position (x, y, z) of each atom
+     * 
-//     * @param forces      a Stream of type Double3 containing the force (x, y, z) on each atom.  On entry, this contains the forces that
+     * @param positions   a Stream of type Double3 containing the position (x, y, z) of each atom
-//     *                    have been calculated so far.  The kernel should add its own forces to the values already in the stream.
+     * @param forces      a Stream of type Double3 containing the force (x, y, z) on each atom.  On entry, this contains the forces that
-//     */
+     *                    have been calculated so far.  The kernel should add its own forces to the values already in the stream.
-//    void executeForces(const Stream& positions, Stream& forces);
+     */
-//    /**
+    void executeForces(const Stream& positions, Stream& forces);
-//     * Execute the kernel to calculate the energy.
+    /**
-//     * 
+     * Execute the kernel to calculate the energy.
-//     * @param positions   a Stream of type Double3 containing the position (x, y, z) of each atom
+     * 
-//     * @return the potential energy due to the StandardMMForceField
+     * @param positions   a Stream of type Double3 containing the position (x, y, z) of each atom
-//     */
+     * @return the potential energy due to the StandardMMForceField
-//    double executeEnergy(const Stream& positions);
+     */
-//private:
+    double executeEnergy(const Stream& positions);
-//    int numAtoms, numBonds, numAngles, numPeriodicTorsions, numRBTorsions, num14;
+private:
+    _gpuContext* gpu;
+    int numAtoms, numBonds, numAngles, numPeriodicTorsions, numRBTorsions, num14;
 //    int **bondIndexArray, **angleIndexArray, **periodicTorsionIndexArray, **rbTorsionIndexArray, **exclusionArray, **bonded14IndexArray;
 //    RealOpenMM **bondParamArray, **angleParamArray, **periodicTorsionParamArray, **rbTorsionParamArray, **atomParamArray, **bonded14ParamArray;
-//};
+};
-//
 ///**
 // * This kernel is invoked by GBSAOBCForceField to calculate the forces acting on the system.
 // */

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -34,12 +34,23 @@
 #include "CudaKernels.h"
 #include "internal/OpenMMContextImpl.h"
 #include "kernels/gpuTypes.h"
+#include "System.h"
+extern "C" gpuContext
+gpuInit( 
+        int natoms,
+        int atomstrwidth,
+        int testmode,
+        FILE *log
+        );
+extern "C"
+void gpuShutDown(gpuContext gpu);
 using namespace OpenMM;
 CudaPlatform::CudaPlatform() {
    CudaKernelFactory* factory = new CudaKernelFactory();
-//    registerKernelFactory(CalcStandardMMForceFieldKernel::Name(), factory);
+    registerKernelFactory(CalcStandardMMForceFieldKernel::Name(), factory);
 //    registerKernelFactory(CalcGBSAOBCForceFieldKernel::Name(), factory);
 //    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
 //    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
@@ -58,10 +69,12 @@ const StreamFactory& CudaPlatform::getDefaultStreamFactory() const {
 }
 void CudaPlatform::contextCreated(OpenMMContextImpl& context) const {
-    context.setPlatformData(new _gpuContext());
+    int numAtoms = context.getSystem().getNumAtoms(); // the GPU context is sized from the System's atom count
+    _gpuContext* gpu = gpuInit(numAtoms, 0 /* ignored? */, 0, stdout); // arguments in order: natoms, atomstrwidth, testmode, log; NOTE(review): the result is not checked before use
+    context.setPlatformData(gpu); // stored on the context; CudaKernelFactory and contextDestroyed read it back via getPlatformData()
 }
 void CudaPlatform::contextDestroyed(OpenMMContextImpl& context) const {
     _gpuContext* data = reinterpret_cast<_gpuContext*>(context.getPlatformData());
-    delete data;
+    gpuShutDown(data); // the context now comes from gpuInit() rather than new, so it is released through gpuShutDown() instead of delete
 }
--- a/platforms/reference/tests/TestReferenceRandom.cpp
+++ b/platforms/reference/tests/TestReferenceRandom.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the reference implementation of random number generation.
+ */
+#include "../../../tests/AssertionUtilities.h"
+#include "../src/SimTKUtilities/SimTKOpenMMUtilities.h"
+#include <iostream>
+using namespace OpenMM;
+using namespace std;
+/**
+ * Draw a large sample from getNormallyDistributedRandomNumber() and check that
+ * its first four cumulants match those of a standard normal distribution
+ * (mean 0, variance 1, third and fourth cumulants 0), each to within 0.01.
+ */
+void testGaussian() {
+    mt_init(0);
+    const int numValues = 10000000;
+    // Running sums of value^1 .. value^4; dividing by numValues below turns
+    // them into the raw (about-zero) moments of the sample.
+    double mean = 0.0;
+    double var = 0.0;
+    double skew = 0.0;
+    double kurtosis = 0.0;
+    for (int i = 0; i < numValues; i++) {
+        double value = SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
+        mean += value;
+        var += value*value;
+        skew += value*value*value;
+        kurtosis += value*value*value*value;
+    }
+    mean /= numValues;
+    var /= numValues;
+    skew /= numValues;
+    kurtosis /= numValues;
+    // Convert raw moments to cumulants (c2 is the variance); c3 and c4 both
+    // vanish for a Gaussian.
+    double c2 = var-mean*mean;
+    double c3 = skew-3*var*mean+2*mean*mean*mean;
+    double c4 = kurtosis-4*skew*mean-3*var*var+12*var*mean*mean-6*mean*mean*mean*mean;
+    ASSERT_EQUAL_TOL(0.0, mean, 0.01);
+    ASSERT_EQUAL_TOL(1.0, c2, 0.01);
+    ASSERT_EQUAL_TOL(0.0, c3, 0.01);
+    ASSERT_EQUAL_TOL(0.0, c4, 0.01);
+}
+/**
+ * Test driver: runs the Gaussian test, prints "Done" and exits with 0 on
+ * success, or prints the exception message and exits with 1 on failure.
+ */
+int main() {
+    bool passed = true;
+    try {
+        testGaussian();
+    }
+    catch(const exception& failure) {
+        cout << "exception: " << failure.what() << endl;
+        passed = false;
+    }
+    if (!passed)
+        return 1;
+    cout << "Done" << endl;
+    return 0;
+}