Ensure context can be created when selecting CUDA device

This change is intended to improve the behavior when multiple CUDA contexts are created on a node with multiple CUDA GPUs, each of which is set to process-exclusive mode.

Ensure context can be created when selecting CUDA device
This change is intended to improve the behavior when multiple CUDA contexts are created on a node with multiple CUDA GPUs, each of which is set to process-exclusive mode.
43926c46 · Robert T. McGibbon · Robert McGibbon · b00edc8e · 43926c46 · 43926c46
Commit 43926c46 authored Oct 23, 2015 by Robert T. McGibbon Committed by Robert McGibbon Oct 24, 2015
Show whitespace changes
Inline Side-by-side

Showing with 99 additions and 56 deletions

platforms/cuda/include/CudaContext.h platforms/cuda/include/CudaContext.h +6 -0

platforms/cuda/src/CudaContext.cpp platforms/cuda/src/CudaContext.cpp +93 -56

No files found.
--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -30,6 +30,7 @@
 #include <map>
 #include <queue>
 #include <string>
+#include <utility>
 #define __CL_ENABLE_EXCEPTIONS
 #ifdef _MSC_VER
    // Prevent Windows from defining macros that interfere with other code.
@@ -538,6 +539,11 @@ public:
     */
    void invalidateMolecules();
 private:
+    /**
+     * Compute a sorted list of device indices in decreasing order of desirability
+     */
+    std::vector<int> getDevicePrecedence();
+
    struct Molecule;
    struct MoleculeGroup;
    class VirtualSiteInfo;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    CHECK_RESULT(cuDeviceGetCount(&numDevices));
    if (deviceIndex < -1 || deviceIndex >= numDevices)
        throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex));
-    if (deviceIndex == -1) {
-        // Try to figure out which device is the fastest.

-        int bestSpeed = -1;
-        int bestCompute = -1;
-        for (int i = 0; i < numDevices; i++) {
-            CHECK_RESULT(cuDeviceGet(&device, i));
-            int major, minor, clock, multiprocessors;
-            CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
-            if (major == 1 && minor < 2)
-                continue; // 1.0 and 1.1 are not supported
-            CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
-            CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
-            int speed = clock*multiprocessors;
-            if (major > bestCompute || (major == bestCompute && speed > bestSpeed)) {
-                deviceIndex = i;
-                bestSpeed = speed;
-                bestCompute = major;
+    vector<int> devicePrecedence;
+    if (deviceIndex == -1) {
+        devicePrecedence = getDevicePrecedence();
+    } else {
+        devicePrecedence.push_back(deviceIndex);
    }
+
+    this->deviceIndex = -1;
+    for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
+        deviceIndex = devicePrecedence[i];
+        CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
+        defaultOptimizationOptions = "--use_fast_math";
+        unsigned int flags = CU_CTX_MAP_HOST;
+        if (useBlockingSync)
+            flags += CU_CTX_SCHED_BLOCKING_SYNC;
+        else
+            flags += CU_CTX_SCHED_SPIN;
+
+        if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
+            this->deviceIndex = deviceIndex;
+            break;
        }
    }
-    if (deviceIndex == -1)
+    if (this->deviceIndex == -1)
+        if (devicePrecedence.size() == 1)
+            throw OpenMMException("The requested CUDA device could not be loaded");
+        else
            throw OpenMMException("No compatible CUDA device is available");
-    CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
-    this->deviceIndex = deviceIndex;
+
    int major, minor;
    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
-    // This is a workaround to support GTX 980 with CUDA 6.5.  It reports its compute capability
-    // as 5.2, but the compiler doesn't support anything beyond 5.0.  We can remove this once
-    // CUDA 7.0 is released.
+#if __CUDA_API_VERSION < 7000
+        // This is a workaround to support GTX 980 with CUDA 6.5.  It reports
+        // its compute capability as 5.2, but the compiler doesn't support
+        // anything beyond 5.0.
        if (major == 5)
            minor = 0;
+#endif
    gpuArchitecture = intToString(major)+intToString(minor);
    computeCapability = major+0.1*minor;
-    if ((useDoublePrecision || useMixedPrecision) && computeCapability < 1.3)
-        throw OpenMMException("This device does not support double precision");
-    defaultOptimizationOptions = "--use_fast_math";
-    unsigned int flags = CU_CTX_MAP_HOST;
-    if (useBlockingSync)
-        flags += CU_CTX_SCHED_BLOCKING_SYNC;
-    else
-        flags += CU_CTX_SCHED_SPIN;
-    CHECK_RESULT(cuCtxCreate(&context, flags, device));
+
    contextIsValid = true;
    CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
    if (contextIndex > 0) {
@@ -1393,3 +1392,41 @@ void CudaContext::WorkThread::flush() {
       pthread_cond_wait(&queueEmptyCondition, &queueLock);
    pthread_mutex_unlock(&queueLock);
 }
+
+
+/**
+ * Build the list of candidate CUDA device indices, best device first.
+ *
+ * Devices with compute capability 1.0 or 1.1 are never listed.  When double
+ * or mixed precision has been requested, devices below compute capability
+ * 1.3 are excluded as well.  The remaining devices are ordered by major
+ * compute capability (higher first), then by clock rate multiplied by
+ * multiprocessor count (higher first), then by device index (lower first).
+ *
+ * @return the device indices in decreasing order of desirability; empty if
+ *         no device passes the filters
+ */
+vector<int> CudaContext::getDevicePrecedence() {
+    int numDevices;
+    CUdevice thisDevice;
+    // NOTE(review): not referenced by name below; presumably the CHECK_RESULT
+    // macro (defined outside this view) reads it when reporting a failure.
+    string errorMessage = "Error initializing Context";
+    vector<pair<pair<int, int>, int> > devices;
+
+    CHECK_RESULT(cuDeviceGetCount(&numDevices));
+    for (int i = 0; i < numDevices; i++) {
+        CHECK_RESULT(cuDeviceGet(&thisDevice, i));
+        int major, minor, clock, multiprocessors, speed;
+        CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, thisDevice));
+        if (major == 1 && minor < 2)
+            continue; // 1.0 and 1.1 are not supported
+
+        // Double and mixed precision need compute capability 1.3 or better.
+        // The test is done on the integer parts: multiplying them
+        // (major*0.1*minor) yields 0.0 for 2.0 and 1.05 for 3.5, which would
+        // wrongly reject capable devices, and an integer comparison also
+        // avoids floating-point rounding at exactly 1.3.
+        if ((useDoublePrecision || useMixedPrecision) && (major == 1 && minor < 3))
+            continue;
+
+        CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, thisDevice));
+        CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, thisDevice));
+        speed = clock*multiprocessors;
+        pair<int, int> deviceProperties = std::make_pair(major, speed);
+        // The index is stored negated so that, after the descending sort
+        // below, ties on (major, speed) resolve in favor of the lower index.
+        devices.push_back(std::make_pair(deviceProperties, -i));
+    }
+
+    // sort first by compute capability (higher is better), then speed
+    // (higher is better), and finally device index (lower is better)
+    std::sort(devices.begin(), devices.end());
+    std::reverse(devices.begin(), devices.end());
+
+    vector<int> precedence;
+    for (int i = 0; i < static_cast<int>(devices.size()); i++) {
+        // Undo the negation applied when the entry was stored.
+        precedence.push_back(-devices[i].second);
+    }
+
+    return precedence;
+}