Commit 43926c46 authored by Robert T. McGibbon's avatar Robert T. McGibbon Committed by Robert McGibbon
Browse files

Ensure context can be created when selecting CUDA device

This change is intended to improve the behavior when multiple
CUDA contexts are created on a node with multiple CUDA GPUs,
each of which is set to process-exclusive mode.
parent b00edc8e
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include <map> #include <map>
#include <queue> #include <queue>
#include <string> #include <string>
#include <utility>
#define __CL_ENABLE_EXCEPTIONS #define __CL_ENABLE_EXCEPTIONS
#ifdef _MSC_VER #ifdef _MSC_VER
// Prevent Windows from defining macros that interfere with other code. // Prevent Windows from defining macros that interfere with other code.
...@@ -538,6 +539,11 @@ public: ...@@ -538,6 +539,11 @@ public:
*/ */
void invalidateMolecules(); void invalidateMolecules();
private: private:
/**
 * Compute a sorted list of device indices in decreasing order of desirability.
 *
 * Devices are ranked by compute capability major version first, then by
 * clock rate multiplied by multiprocessor count, and ties go to the lower
 * device index.  Devices with compute capability 1.0 or 1.1 are left out,
 * as are devices rejected by the precision check when double or mixed
 * precision is in use.
 *
 * @return the indices of the usable devices, most desirable first.  The
 *         list is empty if no device passes the filters.
 */
std::vector<int> getDevicePrecedence();
struct Molecule; struct Molecule;
struct MoleculeGroup; struct MoleculeGroup;
class VirtualSiteInfo; class VirtualSiteInfo;
......
...@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
CHECK_RESULT(cuDeviceGetCount(&numDevices)); CHECK_RESULT(cuDeviceGetCount(&numDevices));
if (deviceIndex < -1 || deviceIndex >= numDevices) if (deviceIndex < -1 || deviceIndex >= numDevices)
throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex)); throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex));
vector<int> devicePrecedence;
if (deviceIndex == -1) { if (deviceIndex == -1) {
// Try to figure out which device is the fastest. devicePrecedence = getDevicePrecedence();
} else {
int bestSpeed = -1; devicePrecedence.push_back(deviceIndex);
int bestCompute = -1; }
for (int i = 0; i < numDevices; i++) {
CHECK_RESULT(cuDeviceGet(&device, i)); this->deviceIndex = -1;
int major, minor, clock, multiprocessors; for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device)); deviceIndex = devicePrecedence[i];
if (major == 1 && minor < 2) CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
continue; // 1.0 and 1.1 are not supported defaultOptimizationOptions = "--use_fast_math";
CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)); unsigned int flags = CU_CTX_MAP_HOST;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); if (useBlockingSync)
int speed = clock*multiprocessors; flags += CU_CTX_SCHED_BLOCKING_SYNC;
if (major > bestCompute || (major == bestCompute && speed > bestSpeed)) { else
deviceIndex = i; flags += CU_CTX_SCHED_SPIN;
bestSpeed = speed;
bestCompute = major; if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
} this->deviceIndex = deviceIndex;
break;
} }
} }
if (deviceIndex == -1) if (this->deviceIndex == -1)
throw OpenMMException("No compatible CUDA device is available"); if (devicePrecedence.size() == 1)
CHECK_RESULT(cuDeviceGet(&device, deviceIndex)); throw OpenMMException("The requested CUDA device could not be loaded");
this->deviceIndex = deviceIndex; else
throw OpenMMException("No compatible CUDA device is available");
int major, minor; int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device)); CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
// This is a workaround to support GTX 980 with CUDA 6.5. It reports its compute capability #if __CUDA_API_VERSION < 7000
// as 5.2, but the compiler doesn't support anything beyond 5.0. We can remove this once // This is a workaround to support GTX 980 with CUDA 6.5. It reports
// CUDA 7.0 is released. // its compute capability as 5.2, but the compiler doesn't support
if (major == 5) // anything beyond 5.0.
minor = 0; if (major == 5)
minor = 0;
#endif
gpuArchitecture = intToString(major)+intToString(minor); gpuArchitecture = intToString(major)+intToString(minor);
computeCapability = major+0.1*minor; computeCapability = major+0.1*minor;
if ((useDoublePrecision || useMixedPrecision) && computeCapability < 1.3)
throw OpenMMException("This device does not support double precision");
defaultOptimizationOptions = "--use_fast_math";
unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync)
flags += CU_CTX_SCHED_BLOCKING_SYNC;
else
flags += CU_CTX_SCHED_SPIN;
CHECK_RESULT(cuCtxCreate(&context, flags, device));
contextIsValid = true; contextIsValid = true;
CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED)); CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
if (contextIndex > 0) { if (contextIndex > 0) {
...@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf"; compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff"; compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf"; compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
// Set defines for applying periodic boundary conditions. // Set defines for applying periodic boundary conditions.
Vec3 boxVectors[3]; Vec3 boxVectors[3];
system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]); system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 || boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
...@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
} }
// Create the work thread used for parallelization when running on multiple devices. // Create the work thread used for parallelization when running on multiple devices.
thread = new WorkThread(); thread = new WorkThread();
// Create utilities objects. // Create utilities objects.
bonded = new CudaBondedUtilities(*this); bonded = new CudaBondedUtilities(*this);
nonbonded = new CudaNonbondedUtilities(*this); nonbonded = new CudaNonbondedUtilities(*this);
integration = new CudaIntegrationUtilities(*this, system); integration = new CudaIntegrationUtilities(*this, system);
...@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri ...@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri
if (index != result.npos) { if (index != result.npos) {
if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) { if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
// We have found a complete symbol, not part of a longer symbol. // We have found a complete symbol, not part of a longer symbol.
result.replace(index, size, iter->second); result.replace(index, size, iter->second);
index += iter->second.size(); index += iter->second.size();
} }
...@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) { ...@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) {
return -1; return -1;
} }
WaitForSingleObject(pi.hProcess, INFINITE); WaitForSingleObject(pi.hProcess, INFINITE);
DWORD exitCode = -1; DWORD exitCode = -1;
if(!GetExitCodeProcess(pi.hProcess, &exitCode)) { if(!GetExitCodeProcess(pi.hProcess, &exitCode)) {
throw(OpenMMException("Could not get nvcc.exe's exit code\n")); throw(OpenMMException("Could not get nvcc.exe's exit code\n"));
} else { } else {
if(exitCode == 0) if(exitCode == 0)
return 0; return 0;
else else
return -1; return -1;
...@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
if (!defines.empty()) if (!defines.empty())
src << endl; src << endl;
src << source << endl; src << source << endl;
// See whether we already have PTX for this kernel cached. // See whether we already have PTX for this kernel cached.
CSHA1 sha1; CSHA1 sha1;
sha1.Update((const UINT_8*) src.str().c_str(), src.str().size()); sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
sha1.Final(); sha1.Final();
...@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
CUmodule module; CUmodule module;
if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS) if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
return module; return module;
// Select names for the various temporary files. // Select names for the various temporary files.
stringstream tempFileName; stringstream tempFileName;
tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions. tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
#ifdef WIN32 #ifdef WIN32
...@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
int res = 0; int res = 0;
// If the runtime compiler plugin is available, use it. // If the runtime compiler plugin is available, use it.
if (hasCompilerKernel) { if (hasCompilerKernel) {
string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this); string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this);
// If possible, write the PTX out to a temporary file so we can cache it for later use. // If possible, write the PTX out to a temporary file so we can cache it for later use.
bool wroteCache = false; bool wroteCache = false;
try { try {
ofstream out(outputFile.c_str()); ofstream out(outputFile.c_str());
...@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
} }
if (!wroteCache) { if (!wroteCache) {
// An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly. // An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly.
CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module"); CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module");
return module; return module;
} }
...@@ -883,7 +882,7 @@ private: ...@@ -883,7 +882,7 @@ private:
void CudaContext::findMoleculeGroups() { void CudaContext::findMoleculeGroups() {
// The first time this is called, we need to identify all the molecules in the system. // The first time this is called, we need to identify all the molecules in the system.
if (moleculeGroups.size() == 0) { if (moleculeGroups.size() == 0) {
// Add a ForceInfo that makes sure reordering doesn't break virtual sites. // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
...@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() { ...@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() {
if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i])) if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
identical = false; identical = false;
} }
// See if the constraints are identical. // See if the constraints are identical.
for (int i = 0; i < (int) mol.constraints.size() && identical; i++) { for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
...@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() { ...@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() {
} }
if (valid) if (valid)
return; return;
// The list of which molecules are identical is no longer valid. We need to restore the // The list of which molecules are identical is no longer valid. We need to restore the
// atoms to their original order, rebuild the list of identical molecules, and sort them // atoms to their original order, rebuild the list of identical molecules, and sort them
// again. // again.
vector<int4> newCellOffsets(numAtoms); vector<int4> newCellOffsets(numAtoms);
if (useDoublePrecision) { if (useDoublePrecision) {
vector<double4> oldPosq(paddedNumAtoms); vector<double4> oldPosq(paddedNumAtoms);
...@@ -1393,3 +1392,41 @@ void CudaContext::WorkThread::flush() { ...@@ -1393,3 +1392,41 @@ void CudaContext::WorkThread::flush() {
pthread_cond_wait(&queueEmptyCondition, &queueLock); pthread_cond_wait(&queueEmptyCondition, &queueLock);
pthread_mutex_unlock(&queueLock); pthread_mutex_unlock(&queueLock);
} }
/**
 * Build the list of CUDA device indices to try, most desirable first.
 *
 * Devices are ranked by compute capability major version (higher is better),
 * then by clock rate times multiprocessor count (higher is better), and
 * finally by device index (lower is better).  Devices with compute
 * capability 1.0 or 1.1 are omitted.  When double or mixed precision is in
 * use, devices with compute capability below 1.3 are omitted as well.
 *
 * @return the indices of the usable devices in decreasing order of
 *         desirability; empty if no device qualifies.
 * @throws whatever CHECK_RESULT raises when a CUDA driver call fails.
 */
vector<int> CudaContext::getDevicePrecedence() {
    int numDevices;
    CUdevice thisDevice;
    // NOTE(review): presumably read by the CHECK_RESULT macro to build its
    // error text -- confirm before removing.
    string errorMessage = "Error initializing Context";
    const bool needsDoublePrecision = (useDoublePrecision || useMixedPrecision);

    // Each entry is ((major, speed), -index).  Storing the negated index lets
    // one descending sort prefer the lower index when the rest is equal.
    vector<pair<pair<int, int>, int> > devices;

    CHECK_RESULT(cuDeviceGetCount(&numDevices));
    for (int i = 0; i < numDevices; i++) {
        CHECK_RESULT(cuDeviceGet(&thisDevice, i));
        int major, minor, clock, multiprocessors;
        CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, thisDevice));
        if (major == 1 && minor < 2)
            continue; // 1.0 and 1.1 are not supported

        // Double precision needs compute capability 1.3 or later.  Compare
        // the integer parts directly: the previous expression multiplied
        // major by 0.1*minor instead of adding, which rejected nearly every
        // device (e.g. 3.0 -> 0, 5.2 -> 1.04).
        if (needsDoublePrecision && (major < 1 || (major == 1 && minor < 3)))
            continue;

        CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, thisDevice));
        CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, thisDevice));
        const int speed = clock*multiprocessors;
        devices.push_back(std::make_pair(std::make_pair(major, speed), -i));
    }

    // Sort in descending order: first by compute capability (higher is
    // better), then speed (higher is better), and finally device index
    // (lower is better, hence the negation above).
    std::sort(devices.rbegin(), devices.rend());

    vector<int> precedence;
    precedence.reserve(devices.size());
    for (int i = 0; i < static_cast<int>(devices.size()); i++)
        precedence.push_back(-devices[i].second);
    return precedence;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment