Merge pull request #1205 from rmcgibbo/which-device

Ensure context can be created when selecting CUDA device

Merge pull request #1205 from rmcgibbo/which-device
Ensure context can be created when selecting CUDA device
433ca1ea · peastman · dddc9e45 · eb5a680d · 433ca1ea · 433ca1ea
Commit 433ca1ea authored Oct 27, 2015 by peastman
Show whitespace changes
Inline Side-by-side

Showing with 99 additions and 56 deletions

platforms/cuda/include/CudaContext.h platforms/cuda/include/CudaContext.h +6 -0

platforms/cuda/src/CudaContext.cpp platforms/cuda/src/CudaContext.cpp +93 -56

No files found.
--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -30,6 +30,7 @@
 #include <map>
 #include <queue>
 #include <string>
+#include <utility>
 #define __CL_ENABLE_EXCEPTIONS
 #ifdef _MSC_VER
    // Prevent Windows from defining macros that interfere with other code.
@@ -538,6 +539,11 @@ public:
     */
    void invalidateMolecules();
 private:
+    /**
+     * Compute a sorted list of device indices in decreasing order of desirability.
+     * Devices are ranked by major compute capability version (higher first), then by
+     * clock rate times multiprocessor count (higher first), then by device index
+     * (lower first). Devices with compute capability 1.0 or 1.1 are left out, as are
+     * devices below compute capability 1.3 when double or mixed precision is in use.
+     *
+     * @return the indices of the usable devices, most desirable first; empty if
+     *         no device qualifies
+     */
+    std::vector<int> getDevicePrecedence();
    struct Molecule;
    struct MoleculeGroup;
    class VirtualSiteInfo;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    CHECK_RESULT(cuDeviceGetCount(&numDevices));
    if (deviceIndex < -1 || deviceIndex >= numDevices)
        throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex));
-    if (deviceIndex == -1) {
-        // Try to figure out which device is the fastest.
-        int bestSpeed = -1;
+    vector<int> devicePrecedence;
-        int bestCompute = -1;
+    if (deviceIndex == -1) {
-        for (int i = 0; i < numDevices; i++) {
+        devicePrecedence = getDevicePrecedence();
-            CHECK_RESULT(cuDeviceGet(&device, i));
+    } else {
-            int major, minor, clock, multiprocessors;
+        devicePrecedence.push_back(deviceIndex);
-            CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
-            if (major == 1 && minor < 2)
-                continue; // 1.0 and 1.1 are not supported
-            CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
-            CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
-            int speed = clock*multiprocessors;
-            if (major > bestCompute || (major == bestCompute && speed > bestSpeed)) {
-                deviceIndex = i;
-                bestSpeed = speed;
-                bestCompute = major;
    }
+    this->deviceIndex = -1;
+    for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
+        int trialDeviceIndex = devicePrecedence[i];
+        CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
+        defaultOptimizationOptions = "--use_fast_math";
+        unsigned int flags = CU_CTX_MAP_HOST;
+        if (useBlockingSync)
+            flags += CU_CTX_SCHED_BLOCKING_SYNC;
+        else
+            flags += CU_CTX_SCHED_SPIN;
+        if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
+            this->deviceIndex = trialDeviceIndex;
+            break;
        }
    }
-    if (deviceIndex == -1)
+    if (this->deviceIndex == -1)
+        if (deviceIndex != -1)
+            throw OpenMMException("The requested CUDA device could not be loaded");
+        else
            throw OpenMMException("No compatible CUDA device is available");
-    CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
-    this->deviceIndex = deviceIndex;
    int major, minor;
    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
-    // This is a workaround to support GTX 980 with CUDA 6.5.  It reports its compute capability
+#if __CUDA_API_VERSION < 7000
-    // as 5.2, but the compiler doesn't support anything beyond 5.0.  We can remove this once
+        // This is a workaround to support GTX 980 with CUDA 6.5.  It reports
-    // CUDA 7.0 is released.
+        // its compute capability as 5.2, but the compiler doesn't support
+        // anything beyond 5.0.
        if (major == 5)
            minor = 0;
+#endif
    gpuArchitecture = intToString(major)+intToString(minor);
    computeCapability = major+0.1*minor;
-    if ((useDoublePrecision || useMixedPrecision) && computeCapability < 1.3)
-        throw OpenMMException("This device does not support double precision");
-    defaultOptimizationOptions = "--use_fast_math";
-    unsigned int flags = CU_CTX_MAP_HOST;
-    if (useBlockingSync)
-        flags += CU_CTX_SCHED_BLOCKING_SYNC;
-    else
-        flags += CU_CTX_SCHED_SPIN;
-    CHECK_RESULT(cuCtxCreate(&context, flags, device));
    contextIsValid = true;
    CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
    if (contextIndex > 0) {
@@ -1393,3 +1392,41 @@ void CudaContext::WorkThread::flush() {
       pthread_cond_wait(&queueEmptyCondition, &queueLock);
    pthread_mutex_unlock(&queueLock);
 }
+vector<int> CudaContext::getDevicePrecedence() { // Returns the indices of the usable CUDA devices, most desirable first.
+    int numDevices;
+    CUdevice thisDevice;
+    string errorMessage = "Error initializing Context"; // NOTE(review): presumably read by the CHECK_RESULT macro, which is defined outside this diff -- confirm
+    vector<pair<pair<int, int>, int> > devices; // one entry per usable device: ((major version, speed), -device index)
+    CHECK_RESULT(cuDeviceGetCount(&numDevices));
+    for (int i = 0; i < numDevices; i++) {
+        CHECK_RESULT(cuDeviceGet(&thisDevice, i));
+        int major, minor, clock, multiprocessors, speed;
+        CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, thisDevice));
+        if (major == 1 && minor < 2)
+            continue; // skip compute capability 1.0 and 1.1
+        if ((useDoublePrecision || useMixedPrecision) && (major+0.1*minor < 1.3))
+            continue; // skip devices below compute capability 1.3 when double or mixed precision is in use
+        CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, thisDevice));
+        CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, thisDevice));
+        speed = clock*multiprocessors; // clock rate times multiprocessor count serves as the speed score
+        pair<int, int> deviceProperties = std::make_pair(major, speed); // minor is only used for the filters above, not for ranking
+        devices.push_back(std::make_pair(deviceProperties, -i)); // index is negated so that lower indices win ties once the sorted list is reversed
+    }
+    // sort first by major compute capability version (higher is better), then speed
+    // (higher is better), and finally device index (lower is better)
+    std::sort(devices.begin(), devices.end()); // ascending order on ((major, speed), -index)
+    std::reverse(devices.begin(), devices.end()); // flip so the best device comes first
+    vector<int> precedence;
+    for (int i = 0; i < static_cast<int>(devices.size()); i++) {
+        precedence.push_back(-devices[i].second); // undo the negation to recover the real device index
+    }
+    return precedence;
+}