Ensure context can be created when selecting CUDA device

This change is intended to improve the behavior when multiple CUDA contexts are created on a node with multiple CUDA GPUs, each of which is set to process-exclusive mode.

Ensure context can be created when selecting CUDA device
This change is intended to improve the behavior when multiple CUDA contexts are created on a node with multiple CUDA GPUs, each of which is set to process-exclusive mode.
43926c46 · Robert T. McGibbon · Robert McGibbon · b00edc8e · 43926c46 · 43926c46
Commit 43926c46 authored Oct 23, 2015 by Robert T. McGibbon Committed by Robert McGibbon Oct 24, 2015
Hide whitespace changes
Inline Side-by-side

Showing with 99 additions and 56 deletions

platforms/cuda/include/CudaContext.h platforms/cuda/include/CudaContext.h +6 -0

platforms/cuda/src/CudaContext.cpp platforms/cuda/src/CudaContext.cpp +93 -56

No files found.
--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -30,6 +30,7 @@
 #include <map>
 #include <queue>
 #include <string>
+#include <utility>
 #define __CL_ENABLE_EXCEPTIONS
 #ifdef _MSC_VER
    // Prevent Windows from defining macros that interfere with other code.
@@ -538,6 +539,11 @@ public:
     */
    void invalidateMolecules();
 private:
+    /**
+     * Compute a sorted list of device indices in decreasing order of desirability.
+     *
+     * Devices reporting compute capability 1.0 or 1.1 are left out.  The remaining
+     * devices are ranked by major compute capability (higher first), then by clock
+     * rate multiplied by multiprocessor count (higher first), and finally by device
+     * index (lower first).
+     *
+     * @return the device indices, most desirable first
+     */
+    std::vector<int> getDevicePrecedence();
    struct Molecule;
    struct MoleculeGroup;
    class VirtualSiteInfo;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    CHECK_RESULT(cuDeviceGetCount(&numDevices));
    if (deviceIndex < -1 || deviceIndex >= numDevices)
        throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex));
+    vector<int> devicePrecedence;
    if (deviceIndex == -1) {
-        // Try to figure out which device is the fastest.
+        devicePrecedence = getDevicePrecedence();
+    } else {
-        int bestSpeed = -1;
+        devicePrecedence.push_back(deviceIndex);
-        int bestCompute = -1;
+    }
-        for (int i = 0; i < numDevices; i++) {
-            CHECK_RESULT(cuDeviceGet(&device, i));
+    this->deviceIndex = -1;
-            int major, minor, clock, multiprocessors;
+    for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
-            CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
+        deviceIndex = devicePrecedence[i];
-            if (major == 1 && minor < 2)
+        CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
-                continue; // 1.0 and 1.1 are not supported
+        defaultOptimizationOptions = "--use_fast_math";
-            CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
+        unsigned int flags = CU_CTX_MAP_HOST;
-            CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
+        if (useBlockingSync)
-            int speed = clock*multiprocessors;
+            flags += CU_CTX_SCHED_BLOCKING_SYNC;
-            if (major > bestCompute || (major == bestCompute && speed > bestSpeed)) {
+        else
-                deviceIndex = i;
+            flags += CU_CTX_SCHED_SPIN;
-                bestSpeed = speed;
-                bestCompute = major;
+        if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
-            }
+            this->deviceIndex = deviceIndex;
+            break;
        }
    }
-    if (deviceIndex == -1)
+    if (this->deviceIndex == -1)
-        throw OpenMMException("No compatible CUDA device is available");
+        if (devicePrecedence.size() == 1)
-    CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
+            throw OpenMMException("The requested CUDA device could not be loaded");
-    this->deviceIndex = deviceIndex;
+        else
+            throw OpenMMException("No compatible CUDA device is available");
    int major, minor;
    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
-    // This is a workaround to support GTX 980 with CUDA 6.5.  It reports its compute capability
+#if __CUDA_API_VERSION < 7000
-    // as 5.2, but the compiler doesn't support anything beyond 5.0.  We can remove this once
+        // This is a workaround to support GTX 980 with CUDA 6.5.  It reports
-    // CUDA 7.0 is released.
+        // its compute capability as 5.2, but the compiler doesn't support
-    if (major == 5)
+        // anything beyond 5.0.
-        minor = 0;
+        if (major == 5)
+            minor = 0;
+#endif
    gpuArchitecture = intToString(major)+intToString(minor);
    computeCapability = major+0.1*minor;
-    if ((useDoublePrecision || useMixedPrecision) && computeCapability < 1.3)
-        throw OpenMMException("This device does not support double precision");
-    defaultOptimizationOptions = "--use_fast_math";
-    unsigned int flags = CU_CTX_MAP_HOST;
-    if (useBlockingSync)
-        flags += CU_CTX_SCHED_BLOCKING_SYNC;
-    else
-        flags += CU_CTX_SCHED_SPIN;
-    CHECK_RESULT(cuCtxCreate(&context, flags, device));
    contextIsValid = true;
    CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
    if (contextIndex > 0) {
@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
    compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
    compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
    // Set defines for applying periodic boundary conditions.
    Vec3 boxVectors[3];
    system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
    boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    }
    // Create the work thread used for parallelization when running on multiple devices.
    thread = new WorkThread();
    // Create utilities objects.
    bonded = new CudaBondedUtilities(*this);
    nonbonded = new CudaNonbondedUtilities(*this);
    integration = new CudaIntegrationUtilities(*this, system);
@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri
            if (index != result.npos) {
                if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
                    // We have found a complete symbol, not part of a longer symbol.
                    result.replace(index, size, iter->second);
                    index += iter->second.size();
                }
@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) {
        return -1;
    }
    WaitForSingleObject(pi.hProcess, INFINITE);
-    DWORD exitCode = -1;  
+    DWORD exitCode = -1;
    if(!GetExitCodeProcess(pi.hProcess, &exitCode)) {
        throw(OpenMMException("Could not get nvcc.exe's exit code\n"));
    } else {
-        if(exitCode == 0) 
+        if(exitCode == 0)
            return 0;
        else
            return -1;
@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    if (!defines.empty())
        src << endl;
    src << source << endl;
    // See whether we already have PTX for this kernel cached.
    CSHA1 sha1;
    sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
    sha1.Final();
@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    CUmodule module;
    if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
        return module;
    // Select names for the various temporary files.
    stringstream tempFileName;
    tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
 #ifdef WIN32
@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    int res = 0;
    // If the runtime compiler plugin is available, use it.
    if (hasCompilerKernel) {
        string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this);
        // If possible, write the PTX out to a temporary file so we can cache it for later use.
        bool wroteCache = false;
        try {
            ofstream out(outputFile.c_str());
@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
        }
        if (!wroteCache) {
            // An error occurred.  Possibly we don't have permission to write to the temp directory.  Just try to load the module directly.
            CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module");
            return module;
        }
@@ -883,7 +882,7 @@ private:
 void CudaContext::findMoleculeGroups() {
    // The first time this is called, we need to identify all the molecules in the system.
    if (moleculeGroups.size() == 0) {
        // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() {
                    if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
                        identical = false;
            }
            // See if the constraints are identical.
            for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() {
    }
    if (valid)
        return;
    // The list of which molecules are identical is no longer valid.  We need to restore the
    // atoms to their original order, rebuild the list of identical molecules, and sort them
    // again.
    vector<int4> newCellOffsets(numAtoms);
    if (useDoublePrecision) {
        vector<double4> oldPosq(paddedNumAtoms);
@@ -1393,3 +1392,41 @@ void CudaContext::WorkThread::flush() {
       pthread_cond_wait(&queueEmptyCondition, &queueLock);
    pthread_mutex_unlock(&queueLock);
 }
+/**
+ * Build the list of CUDA device indices to try, most desirable first.
+ *
+ * A device is skipped when its compute capability is 1.0 or 1.1, or when
+ * double or mixed precision is in use and its compute capability is below 1.3.
+ * The remaining devices are ranked by major compute capability (higher first),
+ * then by clock rate multiplied by multiprocessor count (higher first), and
+ * finally by device index (lower first).
+ *
+ * @return the indices of the usable devices in decreasing order of
+ *         desirability; empty if no device qualifies
+ */
+vector<int> CudaContext::getDevicePrecedence() {
+    int numDevices;
+    CUdevice thisDevice;
+    // NOTE(review): not referenced directly in this function; presumably read
+    // by the CHECK_RESULT macro -- confirm before removing.
+    string errorMessage = "Error initializing Context";
+    // Each entry is ((major compute capability, speed), -device index).
+    vector<pair<pair<int, int>, int> > devices;
+
+    CHECK_RESULT(cuDeviceGetCount(&numDevices));
+    for (int i = 0; i < numDevices; i++) {
+        CHECK_RESULT(cuDeviceGet(&thisDevice, i));
+        int major, minor, clock, multiprocessors, speed;
+        CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, thisDevice));
+        if (major == 1 && minor < 2)
+            continue; // 1.0 and 1.1 are not supported
+        // Double precision needs compute capability 1.3 or higher.  Compare the
+        // integer parts directly: the capability is major.minor, so forming it
+        // as a product (major*0.1*minor) wrongly rejects every x.0 device, and
+        // an integer test also avoids floating point rounding at the boundary.
+        if ((useDoublePrecision || useMixedPrecision) && (major == 1 && minor < 3))
+            continue;
+
+        CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, thisDevice));
+        CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, thisDevice));
+        speed = clock*multiprocessors;
+        pair<int, int> deviceProperties = std::make_pair(major, speed);
+        // The index is stored negated so that, after the descending sort below,
+        // devices that tie on capability and speed come out lowest index first.
+        devices.push_back(std::make_pair(deviceProperties, -i));
+    }
+
+    // sort first by compute capability (higher is better), then speed
+    // (higher is better), and finally device index (lower is better)
+    std::sort(devices.begin(), devices.end());
+    std::reverse(devices.begin(), devices.end());
+
+    vector<int> precedence;
+    precedence.reserve(devices.size());
+    for (int i = 0; i < static_cast<int>(devices.size()); i++) {
+        precedence.push_back(-devices[i].second);
+    }
+    return precedence;
+}