Commit 433ca1ea authored by peastman
Browse files

Merge pull request #1205 from rmcgibbo/which-device

Ensure context can be created when selecting CUDA device
parents dddc9e45 eb5a680d
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include <map> #include <map>
#include <queue> #include <queue>
#include <string> #include <string>
#include <utility>
#define __CL_ENABLE_EXCEPTIONS #define __CL_ENABLE_EXCEPTIONS
#ifdef _MSC_VER #ifdef _MSC_VER
// Prevent Windows from defining macros that interfere with other code. // Prevent Windows from defining macros that interfere with other code.
...@@ -538,6 +539,11 @@ public: ...@@ -538,6 +539,11 @@ public:
*/ */
void invalidateMolecules(); void invalidateMolecules();
private: private:
/**
 * Compute a sorted list of device indices in decreasing order of desirability.
 * Devices with compute capability 1.0 or 1.1 are omitted, as are devices below
 * 1.3 when double or mixed precision is in use. The remaining devices are
 * ordered by major compute capability, then by clock rate times multiprocessor
 * count, with ties going to the lower device index.
 *
 * @return the indices of the acceptable devices, most desirable first
 */
std::vector<int> getDevicePrecedence();
struct Molecule; struct Molecule;
struct MoleculeGroup; struct MoleculeGroup;
class VirtualSiteInfo; class VirtualSiteInfo;
......
...@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
CHECK_RESULT(cuDeviceGetCount(&numDevices)); CHECK_RESULT(cuDeviceGetCount(&numDevices));
if (deviceIndex < -1 || deviceIndex >= numDevices) if (deviceIndex < -1 || deviceIndex >= numDevices)
throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex)); throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex));
vector<int> devicePrecedence;
if (deviceIndex == -1) { if (deviceIndex == -1) {
// Try to figure out which device is the fastest. devicePrecedence = getDevicePrecedence();
} else {
int bestSpeed = -1; devicePrecedence.push_back(deviceIndex);
int bestCompute = -1; }
for (int i = 0; i < numDevices; i++) {
CHECK_RESULT(cuDeviceGet(&device, i)); this->deviceIndex = -1;
int major, minor, clock, multiprocessors; for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device)); int trialDeviceIndex = devicePrecedence[i];
if (major == 1 && minor < 2) CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
continue; // 1.0 and 1.1 are not supported defaultOptimizationOptions = "--use_fast_math";
CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)); unsigned int flags = CU_CTX_MAP_HOST;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); if (useBlockingSync)
int speed = clock*multiprocessors; flags += CU_CTX_SCHED_BLOCKING_SYNC;
if (major > bestCompute || (major == bestCompute && speed > bestSpeed)) { else
deviceIndex = i; flags += CU_CTX_SCHED_SPIN;
bestSpeed = speed;
bestCompute = major; if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
} this->deviceIndex = trialDeviceIndex;
break;
} }
} }
if (deviceIndex == -1) if (this->deviceIndex == -1)
throw OpenMMException("No compatible CUDA device is available"); if (deviceIndex != -1)
CHECK_RESULT(cuDeviceGet(&device, deviceIndex)); throw OpenMMException("The requested CUDA device could not be loaded");
this->deviceIndex = deviceIndex; else
throw OpenMMException("No compatible CUDA device is available");
int major, minor; int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device)); CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
// This is a workaround to support GTX 980 with CUDA 6.5. It reports its compute capability #if __CUDA_API_VERSION < 7000
// as 5.2, but the compiler doesn't support anything beyond 5.0. We can remove this once // This is a workaround to support GTX 980 with CUDA 6.5. It reports
// CUDA 7.0 is released. // its compute capability as 5.2, but the compiler doesn't support
if (major == 5) // anything beyond 5.0.
minor = 0; if (major == 5)
minor = 0;
#endif
gpuArchitecture = intToString(major)+intToString(minor); gpuArchitecture = intToString(major)+intToString(minor);
computeCapability = major+0.1*minor; computeCapability = major+0.1*minor;
if ((useDoublePrecision || useMixedPrecision) && computeCapability < 1.3)
throw OpenMMException("This device does not support double precision");
defaultOptimizationOptions = "--use_fast_math";
unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync)
flags += CU_CTX_SCHED_BLOCKING_SYNC;
else
flags += CU_CTX_SCHED_SPIN;
CHECK_RESULT(cuCtxCreate(&context, flags, device));
contextIsValid = true; contextIsValid = true;
CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED)); CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
if (contextIndex > 0) { if (contextIndex > 0) {
...@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf"; compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff"; compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf"; compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
// Set defines for applying periodic boundary conditions. // Set defines for applying periodic boundary conditions.
Vec3 boxVectors[3]; Vec3 boxVectors[3];
system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]); system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 || boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
...@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
} }
// Create the work thread used for parallelization when running on multiple devices. // Create the work thread used for parallelization when running on multiple devices.
thread = new WorkThread(); thread = new WorkThread();
// Create utilities objects. // Create utilities objects.
bonded = new CudaBondedUtilities(*this); bonded = new CudaBondedUtilities(*this);
nonbonded = new CudaNonbondedUtilities(*this); nonbonded = new CudaNonbondedUtilities(*this);
integration = new CudaIntegrationUtilities(*this, system); integration = new CudaIntegrationUtilities(*this, system);
...@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri ...@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri
if (index != result.npos) { if (index != result.npos) {
if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) { if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
// We have found a complete symbol, not part of a longer symbol. // We have found a complete symbol, not part of a longer symbol.
result.replace(index, size, iter->second); result.replace(index, size, iter->second);
index += iter->second.size(); index += iter->second.size();
} }
...@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) { ...@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) {
return -1; return -1;
} }
WaitForSingleObject(pi.hProcess, INFINITE); WaitForSingleObject(pi.hProcess, INFINITE);
DWORD exitCode = -1; DWORD exitCode = -1;
if(!GetExitCodeProcess(pi.hProcess, &exitCode)) { if(!GetExitCodeProcess(pi.hProcess, &exitCode)) {
throw(OpenMMException("Could not get nvcc.exe's exit code\n")); throw(OpenMMException("Could not get nvcc.exe's exit code\n"));
} else { } else {
if(exitCode == 0) if(exitCode == 0)
return 0; return 0;
else else
return -1; return -1;
...@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
if (!defines.empty()) if (!defines.empty())
src << endl; src << endl;
src << source << endl; src << source << endl;
// See whether we already have PTX for this kernel cached. // See whether we already have PTX for this kernel cached.
CSHA1 sha1; CSHA1 sha1;
sha1.Update((const UINT_8*) src.str().c_str(), src.str().size()); sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
sha1.Final(); sha1.Final();
...@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
CUmodule module; CUmodule module;
if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS) if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
return module; return module;
// Select names for the various temporary files. // Select names for the various temporary files.
stringstream tempFileName; stringstream tempFileName;
tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions. tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
#ifdef WIN32 #ifdef WIN32
...@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
int res = 0; int res = 0;
// If the runtime compiler plugin is available, use it. // If the runtime compiler plugin is available, use it.
if (hasCompilerKernel) { if (hasCompilerKernel) {
string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this); string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this);
// If possible, write the PTX out to a temporary file so we can cache it for later use. // If possible, write the PTX out to a temporary file so we can cache it for later use.
bool wroteCache = false; bool wroteCache = false;
try { try {
ofstream out(outputFile.c_str()); ofstream out(outputFile.c_str());
...@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
} }
if (!wroteCache) { if (!wroteCache) {
// An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly. // An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly.
CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module"); CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module");
return module; return module;
} }
...@@ -883,7 +882,7 @@ private: ...@@ -883,7 +882,7 @@ private:
void CudaContext::findMoleculeGroups() { void CudaContext::findMoleculeGroups() {
// The first time this is called, we need to identify all the molecules in the system. // The first time this is called, we need to identify all the molecules in the system.
if (moleculeGroups.size() == 0) { if (moleculeGroups.size() == 0) {
// Add a ForceInfo that makes sure reordering doesn't break virtual sites. // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
...@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() { ...@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() {
if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i])) if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
identical = false; identical = false;
} }
// See if the constraints are identical. // See if the constraints are identical.
for (int i = 0; i < (int) mol.constraints.size() && identical; i++) { for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
...@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() { ...@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() {
} }
if (valid) if (valid)
return; return;
// The list of which molecules are identical is no longer valid. We need to restore the // The list of which molecules are identical is no longer valid. We need to restore the
// atoms to their original order, rebuild the list of identical molecules, and sort them // atoms to their original order, rebuild the list of identical molecules, and sort them
// again. // again.
vector<int4> newCellOffsets(numAtoms); vector<int4> newCellOffsets(numAtoms);
if (useDoublePrecision) { if (useDoublePrecision) {
vector<double4> oldPosq(paddedNumAtoms); vector<double4> oldPosq(paddedNumAtoms);
...@@ -1393,3 +1392,41 @@ void CudaContext::WorkThread::flush() { ...@@ -1393,3 +1392,41 @@ void CudaContext::WorkThread::flush() {
pthread_cond_wait(&queueEmptyCondition, &queueLock); pthread_cond_wait(&queueEmptyCondition, &queueLock);
pthread_mutex_unlock(&queueLock); pthread_mutex_unlock(&queueLock);
} }
/**
 * Build the list of usable CUDA device indices, best candidate first.
 *
 * Devices with compute capability 1.0 or 1.1 are left out, and so are devices
 * below 1.3 when double or mixed precision is in use. The survivors are ranked
 * by major compute capability, then by clock rate times multiprocessor count,
 * and ties go to the lower device index.
 *
 * @return the indices of the acceptable devices in decreasing order of
 *         preference; empty if no device qualifies
 */
vector<int> CudaContext::getDevicePrecedence() {
    // NOTE(review): errorMessage looks unused, but CHECK_RESULT presumably
    // picks it up by name -- confirm against the macro before removing it.
    string errorMessage = "Error initializing Context";

    int deviceCount;
    CHECK_RESULT(cuDeviceGetCount(&deviceCount));

    // Each entry is ((major compute capability, speed estimate), -index).
    // Storing the negated index makes a lower index compare as "greater",
    // so a single descending sort yields the desired tie-breaking.
    vector<pair<pair<int, int>, int> > ranked;
    for (int index = 0; index < deviceCount; index++) {
        CUdevice candidate;
        CHECK_RESULT(cuDeviceGet(&candidate, index));

        int major, minor;
        CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, candidate));
        const bool tooOld = (major == 1 && minor < 2);
        const bool lacksDouble = (useDoublePrecision || useMixedPrecision) && (major+0.1*minor < 1.3);
        if (tooOld || lacksDouble)
            continue;

        int clockRate, multiprocessorCount;
        CHECK_RESULT(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, candidate));
        CHECK_RESULT(cuDeviceGetAttribute(&multiprocessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, candidate));
        ranked.push_back(std::make_pair(std::make_pair(major, clockRate*multiprocessorCount), -index));
    }

    // Sorting through reverse iterators leaves the vector in descending order.
    std::sort(ranked.rbegin(), ranked.rend());

    vector<int> precedence;
    for (vector<pair<pair<int, int>, int> >::const_iterator entry = ranked.begin(); entry != ranked.end(); ++entry)
        precedence.push_back(-entry->second);
    return precedence;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment