Commit 433ca1ea authored by peastman's avatar peastman
Browse files

Merge pull request #1205 from rmcgibbo/which-device

Ensure context can be created when selecting CUDA device
parents dddc9e45 eb5a680d
......@@ -30,6 +30,7 @@
#include <map>
#include <queue>
#include <string>
#include <utility>
#define __CL_ENABLE_EXCEPTIONS
#ifdef _MSC_VER
// Prevent Windows from defining macros that interfere with other code.
......@@ -538,6 +539,11 @@ public:
*/
void invalidateMolecules();
private:
/**
* Compute a sorted list of device indices in decreasing order of desirability
*/
std::vector<int> getDevicePrecedence();
struct Molecule;
struct MoleculeGroup;
class VirtualSiteInfo;
......
......@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
CHECK_RESULT(cuDeviceGetCount(&numDevices));
if (deviceIndex < -1 || deviceIndex >= numDevices)
throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex));
vector<int> devicePrecedence;
if (deviceIndex == -1) {
// Try to figure out which device is the fastest.
int bestSpeed = -1;
int bestCompute = -1;
for (int i = 0; i < numDevices; i++) {
CHECK_RESULT(cuDeviceGet(&device, i));
int major, minor, clock, multiprocessors;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
if (major == 1 && minor < 2)
continue; // 1.0 and 1.1 are not supported
CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
int speed = clock*multiprocessors;
if (major > bestCompute || (major == bestCompute && speed > bestSpeed)) {
deviceIndex = i;
bestSpeed = speed;
bestCompute = major;
}
devicePrecedence = getDevicePrecedence();
} else {
devicePrecedence.push_back(deviceIndex);
}
this->deviceIndex = -1;
for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
int trialDeviceIndex = devicePrecedence[i];
CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
defaultOptimizationOptions = "--use_fast_math";
unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync)
flags += CU_CTX_SCHED_BLOCKING_SYNC;
else
flags += CU_CTX_SCHED_SPIN;
if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
this->deviceIndex = trialDeviceIndex;
break;
}
}
if (deviceIndex == -1)
throw OpenMMException("No compatible CUDA device is available");
CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
this->deviceIndex = deviceIndex;
if (this->deviceIndex == -1)
if (deviceIndex != -1)
throw OpenMMException("The requested CUDA device could not be loaded");
else
throw OpenMMException("No compatible CUDA device is available");
int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
// This is a workaround to support GTX 980 with CUDA 6.5. It reports its compute capability
// as 5.2, but the compiler doesn't support anything beyond 5.0. We can remove this once
// CUDA 7.0 is released.
if (major == 5)
minor = 0;
#if __CUDA_API_VERSION < 7000
// This is a workaround to support GTX 980 with CUDA 6.5. It reports
// its compute capability as 5.2, but the compiler doesn't support
// anything beyond 5.0.
if (major == 5)
minor = 0;
#endif
gpuArchitecture = intToString(major)+intToString(minor);
computeCapability = major+0.1*minor;
if ((useDoublePrecision || useMixedPrecision) && computeCapability < 1.3)
throw OpenMMException("This device does not support double precision");
defaultOptimizationOptions = "--use_fast_math";
unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync)
flags += CU_CTX_SCHED_BLOCKING_SYNC;
else
flags += CU_CTX_SCHED_SPIN;
CHECK_RESULT(cuCtxCreate(&context, flags, device));
contextIsValid = true;
CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
if (contextIndex > 0) {
......@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
// Set defines for applying periodic boundary conditions.
Vec3 boxVectors[3];
system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
......@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
}
// Create the work thread used for parallelization when running on multiple devices.
thread = new WorkThread();
// Create utilities objects.
bonded = new CudaBondedUtilities(*this);
nonbonded = new CudaNonbondedUtilities(*this);
integration = new CudaIntegrationUtilities(*this, system);
......@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri
if (index != result.npos) {
if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
// We have found a complete symbol, not part of a longer symbol.
result.replace(index, size, iter->second);
index += iter->second.size();
}
......@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) {
return -1;
}
WaitForSingleObject(pi.hProcess, INFINITE);
DWORD exitCode = -1;
DWORD exitCode = -1;
if(!GetExitCodeProcess(pi.hProcess, &exitCode)) {
throw(OpenMMException("Could not get nvcc.exe's exit code\n"));
} else {
if(exitCode == 0)
if(exitCode == 0)
return 0;
else
return -1;
......@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
if (!defines.empty())
src << endl;
src << source << endl;
// See whether we already have PTX for this kernel cached.
CSHA1 sha1;
sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
sha1.Final();
......@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
CUmodule module;
if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
return module;
// Select names for the various temporary files.
stringstream tempFileName;
tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
#ifdef WIN32
......@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
int res = 0;
// If the runtime compiler plugin is available, use it.
if (hasCompilerKernel) {
string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this);
// If possible, write the PTX out to a temporary file so we can cache it for later use.
bool wroteCache = false;
try {
ofstream out(outputFile.c_str());
......@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
}
if (!wroteCache) {
// An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly.
CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module");
return module;
}
......@@ -883,7 +882,7 @@ private:
void CudaContext::findMoleculeGroups() {
// The first time this is called, we need to identify all the molecules in the system.
if (moleculeGroups.size() == 0) {
// Add a ForceInfo that makes sure reordering doesn't break virtual sites.
......@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() {
if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
identical = false;
}
// See if the constraints are identical.
for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
......@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() {
}
if (valid)
return;
// The list of which molecules are identical is no longer valid. We need to restore the
// atoms to their original order, rebuild the list of identical molecules, and sort them
// again.
vector<int4> newCellOffsets(numAtoms);
if (useDoublePrecision) {
vector<double4> oldPosq(paddedNumAtoms);
......@@ -1393,3 +1392,41 @@ void CudaContext::WorkThread::flush() {
pthread_cond_wait(&queueEmptyCondition, &queueLock);
pthread_mutex_unlock(&queueLock);
}
vector<int> CudaContext::getDevicePrecedence() {
int numDevices;
CUdevice thisDevice;
string errorMessage = "Error initializing Context";
vector<pair<pair<int, int>, int> > devices;
CHECK_RESULT(cuDeviceGetCount(&numDevices));
for (int i = 0; i < numDevices; i++) {
CHECK_RESULT(cuDeviceGet(&thisDevice, i));
int major, minor, clock, multiprocessors, speed;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, thisDevice));
if (major == 1 && minor < 2)
continue;
if ((useDoublePrecision || useMixedPrecision) && (major+0.1*minor < 1.3))
continue;
CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, thisDevice));
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, thisDevice));
speed = clock*multiprocessors;
pair<int, int> deviceProperties = std::make_pair(major, speed);
devices.push_back(std::make_pair(deviceProperties, -i));
}
// sort first by compute capability (higher is better), then speed
// (higher is better), and finally device index (lower is better)
std::sort(devices.begin(), devices.end());
std::reverse(devices.begin(), devices.end());
vector<int> precedence;
for (int i = 0; i < static_cast<int>(devices.size()); i++) {
precedence.push_back(-devices[i].second);
}
return precedence;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment