Merge pull request #807 from peastman/compiler

Support CUDA runtime compilation

Merge pull request #807 from peastman/compiler
Support CUDA runtime compilation
e9b7c563 · peastman · 97ce448c · 8fa08870 · e9b7c563 · e9b7c563
Commit e9b7c563 authored Feb 13, 2015 by peastman
12 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -460,6 +460,21 @@ IF(OPENMM_BUILD_PME_PLUGIN)
   ADD_SUBDIRECTORY(plugins/cpupme)
 ENDIF(OPENMM_BUILD_PME_PLUGIN)

+# CUDA compiler plugin
+
+GET_FILENAME_COMPONENT(CUDA_LIB_DIR "${CUDA_cufft_LIBRARY}" PATH)
+FIND_LIBRARY(CUDA_nvrtc_LIBRARY nvrtc "${CUDA_LIB_DIR}")
+IF(CUDA_nvrtc_LIBRARY)
+    SET(OPENMM_BUILD_CUDA_COMPILER_PLUGIN ON CACHE BOOL "Build CUDA runtime compiler plugin")
+ELSE(CUDA_nvrtc_LIBRARY)
+    SET(OPENMM_BUILD_CUDA_COMPILER_PLUGIN OFF CACHE BOOL "Build CUDA runtime compiler plugin")
+ENDIF(CUDA_nvrtc_LIBRARY)
+SET(OPENMM_BUILD_CUDACOMPILER_PATH)
+IF(OPENMM_BUILD_CUDA_COMPILER_PLUGIN)
+   SET(OPENMM_BUILD_CUDACOMPILER_PATH ${CMAKE_CURRENT_SOURCE_DIR}/plugins/cudacompiler)
+   ADD_SUBDIRECTORY(plugins/cudacompiler)
+ENDIF(OPENMM_BUILD_CUDA_COMPILER_PLUGIN)
+
 IF(OPENMM_BUILD_SHARED_LIB)
    INSTALL_TARGETS(/lib RUNTIME_DIRECTORY /lib ${SHARED_TARGET})
 ENDIF(OPENMM_BUILD_SHARED_LIB)

--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -41,6 +41,7 @@
 #include <vector_functions.h>
 #include "windowsExportCuda.h"
 #include "CudaPlatform.h"
+#include "openmm/Kernel.h"

 typedef unsigned int tileflags;

@@ -565,7 +566,7 @@ private:
    int paddedNumAtoms;
    int numAtomBlocks;
    int numThreadBlocks;
-    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered, boxIsTriclinic;
+    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered, boxIsTriclinic, hasCompilerKernel;
    std::string compiler, tempDir, cacheDir, gpuArchitecture;
    float4 periodicBoxVecXFloat, periodicBoxVecYFloat, periodicBoxVecZFloat, periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
    double4 periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, periodicBoxSize, invPeriodicBoxSize;
@@ -602,6 +603,7 @@ private:
    CudaBondedUtilities* bonded;
    CudaNonbondedUtilities* nonbonded;
    WorkThread* thread;
+    Kernel compilerKernel;
 };

 struct CudaContext::Molecule {

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -38,6 +38,27 @@

 namespace OpenMM {

+/**
+ * This abstract class defines an interface for code that can compile CUDA kernels.  This allows a plugin to take advantage of runtime compilation
+ * when running on recent versions of CUDA.
+ */
+class CudaCompilerKernel : public KernelImpl {
+public:
+    static std::string Name() {
+        return "CudaCompilerKernel";
+    }
+    CudaCompilerKernel(std::string name, const Platform& platform) : KernelImpl(name, platform) {
+    }
+    /**
+     * Compile a kernel to PTX and return the PTX as a string.
+     *
+     * @param source     the source code for the kernel
+     * @param flags      the flags to be passed to the compiler
+     * @param cu         the CudaContext for which the kernel is being compiled
+     */
+    virtual std::string createModule(const std::string& source, const std::string& flags, CudaContext& cu) = 0;
+};
+
 /**
 * This kernel is invoked at the beginning and end of force and energy computations.  It gives the
 * Platform a chance to clear buffers and do other initialization at the beginning, and to do any
@@ -591,9 +612,9 @@ private:
        int getKeySize() const {return 4;}
        const char* getDataType() const {return "int2";}
        const char* getKeyType() const {return "int";}
-        const char* getMinKey() const {return "INT_MIN";}
-        const char* getMaxKey() const {return "INT_MAX";}
-        const char* getMaxValue() const {return "make_int2(INT_MAX, INT_MAX)";}
+        const char* getMinKey() const {return "(-2147483647-1)";}
+        const char* getMaxKey() const {return "2147483647";}
+        const char* getMaxValue() const {return "make_int2(2147483647, 2147483647)";}
        const char* getSortKey() const {return "value.y";}
    };
    class PmeIO;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -33,6 +33,7 @@
 #include "CudaBondedUtilities.h"
 #include "CudaForceInfo.h"
 #include "CudaIntegrationUtilities.h"
+#include "CudaKernels.h"
 #include "CudaKernelSources.h"
 #include "CudaNonbondedUtilities.h"
 #include "SHA1.h"
@@ -73,9 +74,16 @@ bool CudaContext::hasInitializedCuda = false;

 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system), currentStream(0),
-        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
-        posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
+        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false),
+        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    this->compiler = "\""+compiler+"\"";
+    try {
+        compilerKernel = platformData.context->getPlatform().createKernel(CudaCompilerKernel::Name(), *platformData.context);
+        hasCompilerKernel = true;
+    }
+    catch (...) {
+        // The runtime compiler plugin isn't available.
+    }
    if (hostCompiler.size() > 0)
        this->compiler = compiler+" --compiler-bindir "+hostCompiler;
    if (!hasInitializedCuda) {
@@ -508,7 +516,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
        return module;
    
-    // Write out the source to a temporary file.
+    // Select names for the various temporary files.
    
    stringstream tempFileName;
    tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
@@ -520,20 +528,51 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    string inputFile = (tempDir+tempFileName.str()+".cu");
    string outputFile = (tempDir+tempFileName.str()+".ptx");
    string logFile = (tempDir+tempFileName.str()+".log");
-    ofstream out(inputFile.c_str());
-    out << src.str();
-    out.close();
+    int res = 0;
+
+    // If the runtime compiler plugin is available, use it.
+    
+    if (hasCompilerKernel) {
+        string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this);
+        
+        // If possible, write the PTX out to a temporary file so we can cache it for later use.
+        
+        bool wroteCache = false;
+        try {
+            ofstream out(outputFile.c_str());
+            out << ptx;
+            out.close();
+            if (!out.fail())
+                wroteCache = true;
+        }
+        catch (...) {
+            // Ignore.
+        }
+        if (!wroteCache) {
+            // An error occurred.  Possibly we don't have permission to write to the temp directory.  Just try to load the module directly (c_str() is always NUL terminated, even for an empty string).
+            
+            CHECK_RESULT2(cuModuleLoadDataEx(&module, ptx.c_str(), 0, NULL, NULL), "Error loading CUDA module");
+            return module;
+        }
+    }
+    else {
+        // Write out the source to a temporary file.
+
+        ofstream out(inputFile.c_str());
+        out << src.str();
+        out.close();
 #ifdef WIN32
 #ifdef _DEBUG
-    string command = compiler+" --ptx -G -g --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
+        string command = compiler+" --ptx -G -g --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
 #else
-    string command = compiler+" --ptx -lineinfo --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
+        string command = compiler+" --ptx -lineinfo --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
 #endif
-    int res = compileInWindows(command);
+        res = compileInWindows(command); // Assign the outer res: redeclaring it in this scope would hide compiler failures from the check below.
 #else
-    string command = compiler+" --ptx --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o \""+outputFile+"\" "+options+" \""+inputFile+"\" 2> \""+logFile+"\"";
-    int res = std::system(command.c_str());
+        string command = compiler+" --ptx --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o \""+outputFile+"\" "+options+" \""+inputFile+"\" 2> \""+logFile+"\"";
+        res = std::system(command.c_str());
 #endif
+    }
    try {
        if (res != 0) {
            // Load the error log.

--- a/plugins/cudacompiler/CMakeLists.txt
+++ b/plugins/cudacompiler/CMakeLists.txt
+#---------------------------------------------------
+# OpenMM CUDA runtime compiler
+#
+# Creates OpenMMCudaCompiler library.
+#
+# Windows:
+#   OpenMMCudaCompiler.dll
+#   OpenMMCudaCompiler.lib
+# Unix:
+#   libOpenMMCudaCompiler.so
+#----------------------------------------------------
+
+# The source is organized into subdirectories, but we handle them all from
+# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
+SET(OPENMM_SOURCE_SUBDIRS .)
+
+
+# Collect up information about the version of the OpenMM library we're building
+# and make it available to the code so it can be built into the binaries.
+
+SET(OPENMMCUDACOMPILER_LIBRARY_NAME OpenMMCudaCompiler)
+
+# Target names for the two flavors of this plugin.  STATIC_TARGET must be set
+# here: the static build section below uses it, and nothing else in this file
+# defines it, so it would otherwise be empty or inherited from an enclosing scope.
+SET(SHARED_TARGET ${OPENMMCUDACOMPILER_LIBRARY_NAME})
+SET(STATIC_TARGET ${OPENMMCUDACOMPILER_LIBRARY_NAME}_static)
+
+# These are all the places to search for header files which are
+# to be part of the API.
+SET(API_INCLUDE_DIRS) # start empty
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    # append
+    SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS}
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
+ENDFOREACH(subdir)
+
+# We'll need both *relative* path names, starting with their API_INCLUDE_DIRS,
+# and absolute pathnames.
+SET(API_REL_INCLUDE_FILES)   # start these out empty
+SET(API_ABS_INCLUDE_FILES)
+
+FOREACH(dir ${API_INCLUDE_DIRS})
+    FILE(GLOB fullpaths ${dir}/*.h)	# returns full pathnames
+    SET(API_ABS_INCLUDE_FILES ${API_ABS_INCLUDE_FILES} ${fullpaths})
+
+    FOREACH(pathname ${fullpaths})
+        GET_FILENAME_COMPONENT(filename ${pathname} NAME)
+        SET(API_REL_INCLUDE_FILES ${API_REL_INCLUDE_FILES} ${dir}/${filename})
+    ENDFOREACH(pathname)
+ENDFOREACH(dir)
+
+# collect up source files
+SET(SOURCE_FILES) # empty
+SET(SOURCE_INCLUDE_FILES)
+
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    FILE(GLOB_RECURSE src_files  ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.c)
+    FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
+    SET(SOURCE_FILES         ${SOURCE_FILES}         ${src_files})   #append
+    SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files})
+    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
+ENDFOREACH(subdir)
+
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/include)
+INCLUDE_DIRECTORIES(${CUDA_TOOLKIT_INCLUDE})
+
+# Build the shared plugin library.
+
+IF (OPENMM_BUILD_SHARED_LIB)
+    ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_INCLUDE_FILES})
+
+    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${CUDA_nvrtc_LIBRARY})
+    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME}CUDA)
+    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CUDACOMPILER_BUILDING_SHARED_LIBRARY")
+    IF (APPLE)
+        SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
+    ELSE (APPLE)
+        SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    ENDIF (APPLE)
+
+    INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
+ENDIF (OPENMM_BUILD_SHARED_LIB)
+
+# Build the static plugin library.
+
+IF(OPENMM_BUILD_STATIC_LIB)
+    ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_INCLUDE_FILES})
+
+    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME}_static ${CUDA_nvrtc_LIBRARY})
+    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME}CUDA)
+    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CUDACOMPILER_BUILDING_STATIC_LIBRARY")
+    IF (APPLE)
+        SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
+    ELSE (APPLE)
+        SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    ENDIF (APPLE)
+
+    INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
+ENDIF(OPENMM_BUILD_STATIC_LIB)
+
+# The shared target only exists when OPENMM_BUILD_SHARED_LIB is on, so both the
+# install rule and the install-name fixup that refers to it must be guarded.
+IF (OPENMM_BUILD_SHARED_LIB)
+    INSTALL(TARGETS ${SHARED_TARGET} DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/plugins)
+    # Ensure that links to the main CUDA library will be resolved.
+    IF (APPLE)
+        SET(CUDA_LIBRARY libOpenMMCUDA.dylib)
+        INSTALL(CODE "EXECUTE_PROCESS(COMMAND install_name_tool -change ${CUDA_LIBRARY} @loader_path/${CUDA_LIBRARY} ${CMAKE_INSTALL_PREFIX}/lib/plugins/lib${SHARED_TARGET}.dylib)")
+    ENDIF (APPLE)
+ENDIF (OPENMM_BUILD_SHARED_LIB)
+
+if(OPENMM_BUILD_CUDA_TESTS)
+    SUBDIRS (tests)
+endif(OPENMM_BUILD_CUDA_TESTS)
--- a/plugins/cudacompiler/include/internal/windowsExportCudaCompiler.h
+++ b/plugins/cudacompiler/include/internal/windowsExportCudaCompiler.h
+#ifndef OPENMM_WINDOWSEXPORTCUDACOMPILER_H_
+#define OPENMM_WINDOWSEXPORTCUDACOMPILER_H_
+
+/*
+ * Shared libraries are messy in Visual Studio. We have to distinguish three
+ * cases:
+ *   (1) this header is being used to build the OpenMM shared library
+ *       (dllexport)
+ *   (2) this header is being used by a *client* of the OpenMM shared
+ *       library (dllimport)
+ *   (3) we are building the OpenMM static library, or the client is
+ *       being compiled with the expectation of linking with the
+ *       OpenMM static library (nothing special needed)
+ * In the CMake script for building this library, we define one of the symbols
+ *     OPENMM_CUDACOMPILER_BUILDING_{SHARED|STATIC}_LIBRARY
+ * Client code normally has no special symbol defined, in which case we'll
+ * assume it wants to use the shared library. However, if the client defines
+ * the symbol OPENMM_CUDACOMPILER_USE_STATIC_LIBRARIES we'll suppress the
+ * dllimport so that the client code can be linked with static libraries.
+ * Note that the library symbols affect only this library, meaning that
+ * other libraries can be clients of this one. However, we are assuming
+ * all-static or all-shared.
+ */
+
+#ifdef _MSC_VER
+    // We don't want to hear about how sprintf is "unsafe".
+    #pragma warning(disable:4996)
+    // Keep MS VC++ quiet about lack of dll export of private members.
+    #pragma warning(disable:4251)
+    #if defined(OPENMM_CUDACOMPILER_BUILDING_SHARED_LIBRARY)
+        #define OPENMM_EXPORT_CUDACOMPILER __declspec(dllexport)
+    #elif defined(OPENMM_CUDACOMPILER_BUILDING_STATIC_LIBRARY) || defined(OPENMM_CUDACOMPILER_USE_STATIC_LIBRARIES)
+        #define OPENMM_EXPORT_CUDACOMPILER
+    #else
+        #define OPENMM_EXPORT_CUDACOMPILER __declspec(dllimport)   // i.e., a client of a shared library
+    #endif
+#else
+    #define OPENMM_EXPORT_CUDACOMPILER // Linux, Mac
+#endif
+
+#endif // OPENMM_WINDOWSEXPORTCUDACOMPILER_H_
--- a/plugins/cudacompiler/src/CudaCompilerKernelFactory.cpp
+++ b/plugins/cudacompiler/src/CudaCompilerKernelFactory.cpp
+/* -------------------------------------------------------------------------- *
+ *                           OpenMMCudaCompiler                               *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaCompilerKernelFactory.h"
+#include "CudaCompilerKernels.h"
+#include "internal/windowsExportCudaCompiler.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/OpenMMException.h"
+
+using namespace OpenMM;
+
+/**
+ * Exported C function that registers the runtime compiler kernel factory with
+ * the platform named "CUDA".  Nothing is registered when the driver version
+ * cannot be queried, when it is older than 7.0, or when looking up the platform
+ * or registering the factory throws a std::exception.
+ */
+extern "C" OPENMM_EXPORT_CUDACOMPILER void registerKernelFactories() {
+    try {
+        // Make sure this is at least CUDA 7.0.  If the query itself fails we cannot
+        // tell, so treat that the same as an unsupported version.
+        
+        int driverVersion = 0;
+        if (cuDriverGetVersion(&driverVersion) != CUDA_SUCCESS)
+            return;
+        if (driverVersion >= 7000) {
+            Platform& platform = Platform::getPlatformByName("CUDA");
+            CudaCompilerKernelFactory* factory = new CudaCompilerKernelFactory();
+            platform.registerKernelFactory(CudaCompilerKernel::Name(), factory);
+        }
+    }
+    catch (const std::exception&) {
+        // Ignore.  Catching by const reference avoids slicing the exception object.
+    }
+}
+
+// Uniquely named alias for registerKernelFactories().  In a static build only this
+// alias is defined here, without the export macro; otherwise it is exported, and an
+// empty registerPlatforms() is exported as well.
+// NOTE(review): presumably the plugin loader looks up registerPlatforms() in every
+// shared plugin, and this plugin adds no platform of its own - confirm.
+#ifdef OPENMM_CUDACOMPILER_BUILDING_STATIC_LIBRARY
+extern "C" void registerCudaCompilerKernelFactories() {
+    registerKernelFactories();
+}
+#else
+extern "C" OPENMM_EXPORT_CUDACOMPILER void registerCudaCompilerKernelFactories() {
+    registerKernelFactories();
+}
+extern "C" OPENMM_EXPORT_CUDACOMPILER void registerPlatforms() {
+}
+#endif
+
+/**
+ * Create the implementation for the requested kernel.  The only name this factory
+ * accepts is CudaCompilerKernel::Name(); any other name raises an OpenMMException.
+ */
+KernelImpl* CudaCompilerKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
+    if (name != CudaCompilerKernel::Name()) {
+        const std::string message = std::string("Tried to create kernel with illegal kernel name '")+name+"'";
+        throw OpenMMException(message.c_str());
+    }
+    return new CudaRuntimeCompilerKernel(name, platform);
+}
--- a/plugins/cudacompiler/src/CudaCompilerKernelFactory.h
+++ b/plugins/cudacompiler/src/CudaCompilerKernelFactory.h
+#ifndef OPENMM_CPUCUDACOMPILERKERNELFACTORY_H_
+#define OPENMM_CPUCUDACOMPILERKERNELFACTORY_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/KernelFactory.h"
+
+namespace OpenMM {
+
+/**
+ * This KernelFactory creates kernels for the CUDA runtime compiler.
+ */
+
+class CudaCompilerKernelFactory : public KernelFactory {
+public:
+    /**
+     * Create the implementation of a kernel.
+     *
+     * @param name       the name of the kernel to create; only CudaCompilerKernel::Name() is accepted
+     * @param platform   the Platform the kernel is created for
+     * @param context    the context the kernel is created for (not used by the implementation)
+     * @return a newly allocated CudaRuntimeCompilerKernel
+     * @throws OpenMMException if name is not a kernel this factory can create
+     */
+    KernelImpl* createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CPUCUDACOMPILERKERNELFACTORY_H_*/
--- a/plugins/cudacompiler/src/CudaCompilerKernels.cpp
+++ b/plugins/cudacompiler/src/CudaCompilerKernels.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaCompilerKernels.h"
+#include "openmm/OpenMMException.h"
+#include <sstream>
+#include <nvrtc.h>
+
+using namespace OpenMM;
+using namespace std;
+
+#define CHECK_RESULT(result, prefix) \
+    if (result != NVRTC_SUCCESS) { \
+        stringstream m; \
+        m<<prefix<<": "<<getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    }
+
+/**
+ * Get a printable name for an NVRTC result code.
+ *
+ * This asks NVRTC itself for the name rather than keeping a hand-written table,
+ * so result codes added by newer toolkits are reported too.
+ *
+ * @param result    the result code returned by an NVRTC call
+ * @return the name NVRTC reports for the code, or "NVRTC error" if it reports none
+ */
+static string getErrorString(nvrtcResult result) {
+    const char* text = nvrtcGetErrorString(result);
+    if (text == NULL)
+        return "NVRTC error";
+    return text;
+}
+
+/**
+ * Compile CUDA source code to PTX with NVRTC.
+ *
+ * @param source    the source code for the kernel
+ * @param flags     whitespace separated flags to be passed to the compiler
+ * @param cu        the CudaContext for which the kernel is being compiled (not used here)
+ * @return the generated PTX
+ * @throws OpenMMException if the program cannot be created or compiled, or the PTX cannot be retrieved
+ */
+string CudaRuntimeCompilerKernel::createModule(const string& source, const string& flags, CudaContext& cu) {
+    // Split the command line flags into an array of options.
+    
+    stringstream flagsStream(flags);
+    string flag;
+    vector<string> splitFlags;
+    while (flagsStream >> flag)
+        splitFlags.push_back(flag);
+    vector<const char*> options;
+    for (size_t i = 0; i < splitFlags.size(); i++)
+        options.push_back(splitFlags[i].c_str());
+    
+    // Indexing an empty vector is undefined, so only take the array address when there are options.
+    
+    const char** optionArray = NULL;
+    if (!options.empty())
+        optionArray = &options[0];
+    
+    // Compile the program to PTX.
+    
+    nvrtcProgram program;
+    CHECK_RESULT(nvrtcCreateProgram(&program, source.c_str(), "", 0, NULL, NULL), "Error creating program");
+    string ptx;
+    try {
+        nvrtcResult result = nvrtcCompileProgram(program, (int) options.size(), optionArray);
+        if (result != NVRTC_SUCCESS) {
+            // Retrieve the compiler log if we can.  The buffer has one extra zero byte so it is
+            // never empty and always holds a terminated C string.
+            
+            string log;
+            size_t logSize = 0;
+            if (nvrtcGetProgramLogSize(program, &logSize) == NVRTC_SUCCESS) {
+                vector<char> logBuffer(logSize+1, '\0');
+                if (nvrtcGetProgramLog(program, &logBuffer[0]) == NVRTC_SUCCESS)
+                    log = &logBuffer[0];
+            }
+            if (log.empty())
+                log = getErrorString(result); // No log available: at least report the result code.
+            throw OpenMMException("Error compiling program: "+log);
+        }
+        size_t ptxSize = 0;
+        CHECK_RESULT(nvrtcGetPTXSize(program, &ptxSize), "Error getting PTX size");
+        vector<char> ptxBuffer(ptxSize+1, '\0');
+        CHECK_RESULT(nvrtcGetPTX(program, &ptxBuffer[0]), "Error getting PTX");
+        ptx = &ptxBuffer[0];
+    }
+    catch (...) {
+        nvrtcDestroyProgram(&program);
+        throw;
+    }
+    
+    // Destroy the program exactly once on the success path; nothing after this point is inside the try block.
+    
+    nvrtcDestroyProgram(&program);
+    return ptx;
+}
--- a/plugins/cudacompiler/src/CudaCompilerKernels.h
+++ b/plugins/cudacompiler/src/CudaCompilerKernels.h
+#ifndef OPENMM_CUDACOMPILER_KERNELS_H_
+#define OPENMM_CUDACOMPILER_KERNELS_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "internal/windowsExportCudaCompiler.h"
+#include "CudaKernels.h"
+#include <string>
+
+namespace OpenMM {
+
+/**
+ * This kernel is a compiler for CUDA kernels based on the runtime compilation feature
+ * introduced in CUDA 7.
+ */
+class OPENMM_EXPORT_CUDACOMPILER CudaRuntimeCompilerKernel : public CudaCompilerKernel {
+public:
+    CudaRuntimeCompilerKernel(std::string name, const Platform& platform) : CudaCompilerKernel(name, platform) {
+    }
+    /**
+     * Compile a kernel to PTX.
+     *
+     * @param source     the source code for the kernel
+     * @param flags      the flags to be passed to the compiler
+     * @param cu         the CudaContext for which the kernel is being compiled
+     * @return the PTX generated for the kernel
+     */
+    std::string createModule(const std::string& source, const std::string& flags, CudaContext& cu);
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CUDACOMPILER_KERNELS_H_*/
--- a/plugins/cudacompiler/tests/CMakeLists.txt
+++ b/plugins/cudacompiler/tests/CMakeLists.txt
+#
+# Testing
+#
+
+ENABLE_TESTING()
+
+INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIR})
+
+# Build and register one test for every source file in this directory matching "*Test*.cpp".
+FILE(GLOB TEST_PROGS "*Test*.cpp")
+FOREACH(TEST_PROG ${TEST_PROGS})
+    # The executable and the test are both named after the source file (directory and extension removed).
+    GET_FILENAME_COMPONENT(TEST_ROOT ${TEST_PROG} NAME_WE)
+
+    # Linker flags: the common extra flags, plus the CUDA framework when building on Apple platforms.
+    SET(TEST_LINK_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    IF(APPLE)
+        SET(TEST_LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
+    ENDIF(APPLE)
+
+    # Link with shared library
+    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
+    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET})
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES
+        LINK_FLAGS "${TEST_LINK_FLAGS}"
+        COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
+
+ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/plugins/cudacompiler/tests/TestCudaCompiler.cpp
+++ b/plugins/cudacompiler/tests/TestCudaCompiler.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests using the CUDA runtime compiler plugin to compile kernels.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "ReferencePlatform.h"
+#include "openmm/HarmonicBondForce.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/LangevinIntegrator.h"
+#include "openmm/VerletIntegrator.h"
+#include "openmm/internal/ContextImpl.h"
+#include "CudaArray.h"
+#include "CudaNonbondedUtilities.h"
+#include "SimTKOpenMMRealType.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+CudaPlatform platform; // Shared platform instance: registered in main() and used by testCoulomb() to create its Context.
+
+extern "C" void registerCudaCompilerKernelFactories(); // Defined outside this file; main() calls it before running the test.
+
+/**
+ * Coulomb check borrowed from the NonbondedForce test suite.  Two particles are
+ * placed a distance of 2 apart along the x axis, and the computed forces and
+ * potential energy are compared with the analytic values.  The purpose is to
+ * confirm the result is still correct when the runtime compiler is in use.
+ */
+void testCoulomb() {
+    // Build a two-particle system.
+    System system;
+    for (int i = 0; i < 2; i++)
+        system.addParticle(1.0);
+
+    // The first argument of each addParticle() call is taken to be the charge:
+    // the product 0.5 * -1.5 = -0.75 is what appears in the expected values below.
+    NonbondedForce* nonbonded = new NonbondedForce();
+    nonbonded->addParticle(0.5, 1, 0);
+    nonbonded->addParticle(-1.5, 1, 0);
+    system.addForce(nonbonded);
+
+    VerletIntegrator integrator(0.01);
+    Context context(system, integrator, platform);
+
+    // One particle at the origin, the other at x = 2.
+    vector<Vec3> coords;
+    coords.push_back(Vec3(0, 0, 0));
+    coords.push_back(Vec3(2, 0, 0));
+    context.setPositions(coords);
+
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& computedForces = state.getForces();
+
+    // Analytic reference values: the force uses r^2 = 4, the energy uses r = 2.
+    const double expectedForce = ONE_4PI_EPS0*(-0.75)/4.0;
+    const double expectedEnergy = ONE_4PI_EPS0*(-0.75)/2.0;
+    ASSERT_EQUAL_VEC(Vec3(-expectedForce, 0, 0), computedForces[0], 1e-5);
+    ASSERT_EQUAL_VEC(Vec3(expectedForce, 0, 0), computedForces[1], 1e-5);
+    ASSERT_EQUAL_TOL(expectedEnergy, state.getPotentialEnergy(), 1e-5);
+}
+
+/**
+ * Test driver.  Registers the CUDA platform and the compiler kernel factories,
+ * runs the test, and reports the outcome on standard output.
+ *
+ * @return 0 if the test ran without throwing a std::exception, 1 otherwise
+ */
+int main(int argc, char* argv[]) {
+    int exitCode = 0;
+    try {
+        Platform::registerPlatform(&platform);
+        registerCudaCompilerKernelFactories();
+        // Point the temp directory property at a path that does not exist, so
+        // that we won't use cached kernels.
+        platform.setPropertyDefaultValue(CudaPlatform::CudaTempDirectory(), "this does not exist");
+        testCoulomb();
+    }
+    catch (const exception& failure) {
+        cout << "exception: " << failure.what() << endl;
+        exitCode = 1;
+    }
+    // Only announce completion when nothing was caught above.
+    if (exitCode == 0)
+        cout << "Done" << endl;
+    return exitCode;
+}