Improve selection of architecture to compile for (#3017)

* Improve selection of architecture to compile for
* Fixed compilation errors on older CUDA versions

Improve selection of architecture to compile for (#3017)
* Improve selection of architecture to compile for
* Fixed compilation errors on older CUDA versions
483239ef · Peter Eastman · GitHub · ebef35a4 · 483239ef · 483239ef
Unverified Commit 483239ef authored Feb 22, 2021 by Peter Eastman Committed by GitHub Feb 22, 2021
5 changed files
--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -530,9 +530,10 @@ private:
    int contextIndex;
    int numAtomBlocks;
    int numThreadBlocks;
+    int gpuArchitecture;
    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, boxIsTriclinic, hasCompilerKernel, isNvccAvailable, hasAssignedPosqCharges;
    bool isLinkedContext;
-    std::string compiler, tempDir, cacheDir, gpuArchitecture;
+    std::string compiler, tempDir, cacheDir;
    float4 periodicBoxVecXFloat, periodicBoxVecYFloat, periodicBoxVecZFloat, periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
    double4 periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, periodicBoxSize, invPeriodicBoxSize;
    std::string defaultOptimizationOptions;

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -62,6 +62,10 @@ public:
     * @param cu         the CudaContext for which the kernel is being compiled
     */
    virtual std::string createModule(const std::string& source, const std::string& flags, CudaContext& cu) = 0;
+    /**
+     * Get the maximum architecture version the compiler supports, encoded as 10*major+minor (e.g. 75 for 7.5).
+     */
+    virtual int getMaxSupportedArchitecture() const = 0;
 };

 /**

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -227,7 +227,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
            minor = 3;
        }
    }
-    gpuArchitecture = intToString(major)+intToString(minor);
+    gpuArchitecture = 10*major+minor;
    computeCapability = major+0.1*minor;

    contextIsValid = true;
@@ -531,6 +531,16 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    if (!defines.empty())
        src << endl;
    src << source << endl;
+    
+    // Determine what architecture to compile for.
+    
+    string compileArchitecture;
+    if (hasCompilerKernel) {
+        int maxCompilerArchitecture = compilerKernel.getAs<CudaCompilerKernel>().getMaxSupportedArchitecture();
+        compileArchitecture = intToString(min(gpuArchitecture, maxCompilerArchitecture));
+    }
+    else
+        compileArchitecture = intToString(gpuArchitecture);

    // See whether we already have PTX for this kernel cached.

@@ -544,7 +554,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    cacheFile.flags(ios::hex);
    for (int i = 0; i < 20; i++)
        cacheFile << setw(2) << setfill('0') << (int) hash[i];
-    cacheFile << '_' << gpuArchitecture << '_' << bits;
+    cacheFile << '_' << compileArchitecture << '_' << bits;
    CUmodule module;
    if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
        return module;
@@ -566,7 +576,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    // If the runtime compiler plugin is available, use it.

    if (hasCompilerKernel) {
-        string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this);
+        string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+compileArchitecture+" "+options, *this);

        // If possible, write the PTX out to a temporary file so we can cache it for later use.

@@ -596,13 +606,13 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
        out.close();
 #ifdef WIN32
 #ifdef _DEBUG
-        string command = compiler+" --ptx -G -g --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
+        string command = compiler+" --ptx -G -g --machine "+bits+" -arch=sm_"+compileArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
 #else
-        string command = compiler+" --ptx -lineinfo --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
+        string command = compiler+" --ptx -lineinfo --machine "+bits+" -arch=sm_"+compileArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
 #endif
        res = executeInWindows(command);
 #else
-        string command = compiler+" --ptx --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o \""+outputFile+"\" "+options+" \""+inputFile+"\" 2> \""+logFile+"\"";
+        string command = compiler+" --ptx --machine "+bits+" -arch=sm_"+compileArchitecture+" -o \""+outputFile+"\" "+options+" \""+inputFile+"\" 2> \""+logFile+"\"";
        res = std::system(command.c_str());
 #endif
    }

--- a/plugins/cudacompiler/src/CudaCompilerKernels.cpp
+++ b/plugins/cudacompiler/src/CudaCompilerKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2015-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -48,6 +48,22 @@ static string getErrorString(nvrtcResult result) {
    return nvrtcGetErrorString(result);
 }

+CudaRuntimeCompilerKernel::CudaRuntimeCompilerKernel(const std::string& name, const Platform& platform) : CudaCompilerKernel(name, platform) {
+    // Record the highest architecture (10*major+minor) NVRTC can compile for. It cannot be queried before CUDA 11.
+#if CUDA_VERSION < 10000
+    maxSupportedArchitecture = 70; // CUDA 9.x tops out at Volta (70); Turing (75) needs CUDA 10.0 or later.
+#elif CUDA_VERSION < 11000
+    maxSupportedArchitecture = 75;
+#else
+    int numArchs = 0;
+    CHECK_RESULT(nvrtcGetNumSupportedArchs(&numArchs), "Error querying supported architectures");
+    vector<int> archs(numArchs);
+    CHECK_RESULT(nvrtcGetSupportedArchs(archs.data()), "Error querying supported architectures");
+    // NVRTC returns the list in ascending order. back() on an empty vector is undefined, so in that
+    // unexpected case fall back to 80, which every CUDA 11.0+ compiler can target.
+    maxSupportedArchitecture = (archs.empty() ? 80 : archs.back());
+#endif
+}
+
 string CudaRuntimeCompilerKernel::createModule(const string& source, const string& flags, CudaContext& cu) {
    // Split the command line flags into an array of options.
    

--- a/plugins/cudacompiler/src/CudaCompilerKernels.h
+++ b/plugins/cudacompiler/src/CudaCompilerKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2015-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -44,8 +44,7 @@ namespace OpenMM {
 */
 class OPENMM_EXPORT_CUDACOMPILER CudaRuntimeCompilerKernel : public CudaCompilerKernel {
 public:
-    CudaRuntimeCompilerKernel(const std::string& name, const Platform& platform) : CudaCompilerKernel(name, platform) {
-    }
+    CudaRuntimeCompilerKernel(const std::string& name, const Platform& platform);
    /**
     * Compile a kernel to PTX.
     *
@@ -54,6 +53,14 @@ public:
     * @param cu         the CudaContext for which the kernel is being compiled
     */
    std::string createModule(const std::string& source, const std::string& flags, CudaContext& cu);
+    /**
+     * Get the maximum architecture version the compiler supports (the value stored by the constructor).
+     */
+    int getMaxSupportedArchitecture() const {
+        return maxSupportedArchitecture;
+    }
+private:
+    int maxSupportedArchitecture;
 };

 } // namespace OpenMM