Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
433ca1ea
Commit
433ca1ea
authored
Oct 27, 2015
by
peastman
Browse files
Merge pull request #1205 from rmcgibbo/which-device
Ensure context can be created when selecting CUDA device
parents
dddc9e45
eb5a680d
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
99 additions
and
56 deletions
+99
-56
platforms/cuda/include/CudaContext.h
platforms/cuda/include/CudaContext.h
+6
-0
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+93
-56
No files found.
platforms/cuda/include/CudaContext.h
View file @
433ca1ea
...
...
@@ -30,6 +30,7 @@
#include <map>
#include <queue>
#include <string>
#include <utility>
#define __CL_ENABLE_EXCEPTIONS
#ifdef _MSC_VER
// Prevent Windows from defining macros that interfere with other code.
...
...
@@ -538,6 +539,11 @@ public:
*/
void
invalidateMolecules
();
private:
/**
* Compute a sorted list of device indices in decreasing order of desirability
*/
std
::
vector
<
int
>
getDevicePrecedence
();
struct
Molecule
;
struct
MoleculeGroup
;
class
VirtualSiteInfo
;
...
...
platforms/cuda/src/CudaContext.cpp
View file @
433ca1ea
...
...
@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
CHECK_RESULT
(
cuDeviceGetCount
(
&
numDevices
));
if
(
deviceIndex
<
-
1
||
deviceIndex
>=
numDevices
)
throw
OpenMMException
(
"Illegal value for CudaDeviceIndex: "
+
intToString
(
deviceIndex
));
vector
<
int
>
devicePrecedence
;
if
(
deviceIndex
==
-
1
)
{
// Try to figure out which device is the fastest.
int
bestSpeed
=
-
1
;
int
bestCompute
=
-
1
;
for
(
int
i
=
0
;
i
<
numDevices
;
i
++
)
{
CHECK_RESULT
(
cuDeviceGet
(
&
device
,
i
));
int
major
,
minor
,
clock
,
multiprocessors
;
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
device
));
if
(
major
==
1
&&
minor
<
2
)
continue
;
// 1.0 and 1.1 are not supported
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
clock
,
CU_DEVICE_ATTRIBUTE_CLOCK_RATE
,
device
));
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
device
));
int
speed
=
clock
*
multiprocessors
;
if
(
major
>
bestCompute
||
(
major
==
bestCompute
&&
speed
>
bestSpeed
))
{
deviceIndex
=
i
;
bestSpeed
=
speed
;
bestCompute
=
major
;
}
devicePrecedence
=
getDevicePrecedence
();
}
else
{
devicePrecedence
.
push_back
(
deviceIndex
);
}
this
->
deviceIndex
=
-
1
;
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
devicePrecedence
.
size
());
i
++
)
{
int
trialDeviceIndex
=
devicePrecedence
[
i
];
CHECK_RESULT
(
cuDeviceGet
(
&
device
,
trialDeviceIndex
));
defaultOptimizationOptions
=
"--use_fast_math"
;
unsigned
int
flags
=
CU_CTX_MAP_HOST
;
if
(
useBlockingSync
)
flags
+=
CU_CTX_SCHED_BLOCKING_SYNC
;
else
flags
+=
CU_CTX_SCHED_SPIN
;
if
(
cuCtxCreate
(
&
context
,
flags
,
device
)
==
CUDA_SUCCESS
)
{
this
->
deviceIndex
=
trialDeviceIndex
;
break
;
}
}
if
(
deviceIndex
==
-
1
)
throw
OpenMMException
(
"No compatible CUDA device is available"
);
CHECK_RESULT
(
cuDeviceGet
(
&
device
,
deviceIndex
));
this
->
deviceIndex
=
deviceIndex
;
if
(
this
->
deviceIndex
==
-
1
)
if
(
deviceIndex
!=
-
1
)
throw
OpenMMException
(
"The requested CUDA device could not be loaded"
);
else
throw
OpenMMException
(
"No compatible CUDA device is available"
);
int
major
,
minor
;
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
device
));
// This is a workaround to support GTX 980 with CUDA 6.5. It reports its compute capability
// as 5.2, but the compiler doesn't support anything beyond 5.0. We can remove this once
// CUDA 7.0 is released.
if
(
major
==
5
)
minor
=
0
;
#if __CUDA_API_VERSION < 7000
// This is a workaround to support GTX 980 with CUDA 6.5. It reports
// its compute capability as 5.2, but the compiler doesn't support
// anything beyond 5.0.
if
(
major
==
5
)
minor
=
0
;
#endif
gpuArchitecture
=
intToString
(
major
)
+
intToString
(
minor
);
computeCapability
=
major
+
0.1
*
minor
;
if
((
useDoublePrecision
||
useMixedPrecision
)
&&
computeCapability
<
1.3
)
throw
OpenMMException
(
"This device does not support double precision"
);
defaultOptimizationOptions
=
"--use_fast_math"
;
unsigned
int
flags
=
CU_CTX_MAP_HOST
;
if
(
useBlockingSync
)
flags
+=
CU_CTX_SCHED_BLOCKING_SYNC
;
else
flags
+=
CU_CTX_SCHED_SPIN
;
CHECK_RESULT
(
cuCtxCreate
(
&
context
,
flags
,
device
));
contextIsValid
=
true
;
CHECK_RESULT
(
cuCtxSetCacheConfig
(
CU_FUNC_CACHE_PREFER_SHARED
));
if
(
contextIndex
>
0
)
{
...
...
@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines
[
"ATAN"
]
=
useDoublePrecision
?
"atan"
:
"atanf"
;
compilationDefines
[
"ERF"
]
=
useDoublePrecision
?
"erf"
:
"erff"
;
compilationDefines
[
"ERFC"
]
=
useDoublePrecision
?
"erfc"
:
"erfcf"
;
// Set defines for applying periodic boundary conditions.
Vec3
boxVectors
[
3
];
system
.
getDefaultPeriodicBoxVectors
(
boxVectors
[
0
],
boxVectors
[
1
],
boxVectors
[
2
]);
boxIsTriclinic
=
(
boxVectors
[
0
][
1
]
!=
0.0
||
boxVectors
[
0
][
2
]
!=
0.0
||
...
...
@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
}
// Create the work thread used for parallelization when running on multiple devices.
thread
=
new
WorkThread
();
// Create utilities objects.
bonded
=
new
CudaBondedUtilities
(
*
this
);
nonbonded
=
new
CudaNonbondedUtilities
(
*
this
);
integration
=
new
CudaIntegrationUtilities
(
*
this
,
system
);
...
...
@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri
if
(
index
!=
result
.
npos
)
{
if
((
index
==
0
||
symbolChars
.
find
(
result
[
index
-
1
])
==
symbolChars
.
end
())
&&
(
index
==
result
.
size
()
-
size
||
symbolChars
.
find
(
result
[
index
+
size
])
==
symbolChars
.
end
()))
{
// We have found a complete symbol, not part of a longer symbol.
result
.
replace
(
index
,
size
,
iter
->
second
);
index
+=
iter
->
second
.
size
();
}
...
...
@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) {
return
-
1
;
}
WaitForSingleObject
(
pi
.
hProcess
,
INFINITE
);
DWORD
exitCode
=
-
1
;
DWORD
exitCode
=
-
1
;
if
(
!
GetExitCodeProcess
(
pi
.
hProcess
,
&
exitCode
))
{
throw
(
OpenMMException
(
"Could not get nvcc.exe's exit code
\n
"
));
}
else
{
if
(
exitCode
==
0
)
if
(
exitCode
==
0
)
return
0
;
else
return
-
1
;
...
...
@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
if
(
!
defines
.
empty
())
src
<<
endl
;
src
<<
source
<<
endl
;
// See whether we already have PTX for this kernel cached.
CSHA1
sha1
;
sha1
.
Update
((
const
UINT_8
*
)
src
.
str
().
c_str
(),
src
.
str
().
size
());
sha1
.
Final
();
...
...
@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
CUmodule
module
;
if
(
cuModuleLoad
(
&
module
,
cacheFile
.
str
().
c_str
())
==
CUDA_SUCCESS
)
return
module
;
// Select names for the various temporary files.
stringstream
tempFileName
;
tempFileName
<<
"openmmTempKernel"
<<
this
;
// Include a pointer to this context as part of the filename to avoid collisions.
#ifdef WIN32
...
...
@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
int
res
=
0
;
// If the runtime compiler plugin is available, use it.
if
(
hasCompilerKernel
)
{
string
ptx
=
compilerKernel
.
getAs
<
CudaCompilerKernel
>
().
createModule
(
src
.
str
(),
"-arch=compute_"
+
gpuArchitecture
+
" "
+
options
,
*
this
);
// If possible, write the PTX out to a temporary file so we can cache it for later use.
bool
wroteCache
=
false
;
try
{
ofstream
out
(
outputFile
.
c_str
());
...
...
@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
}
if
(
!
wroteCache
)
{
// An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly.
CHECK_RESULT2
(
cuModuleLoadDataEx
(
&
module
,
&
ptx
[
0
],
0
,
NULL
,
NULL
),
"Error loading CUDA module"
);
return
module
;
}
...
...
@@ -883,7 +882,7 @@ private:
void
CudaContext
::
findMoleculeGroups
()
{
// The first time this is called, we need to identify all the molecules in the system.
if
(
moleculeGroups
.
size
()
==
0
)
{
// Add a ForceInfo that makes sure reordering doesn't break virtual sites.
...
...
@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() {
if
(
!
forces
[
k
]
->
areParticlesIdentical
(
mol
.
atoms
[
i
],
mol2
.
atoms
[
i
]))
identical
=
false
;
}
// See if the constraints are identical.
for
(
int
i
=
0
;
i
<
(
int
)
mol
.
constraints
.
size
()
&&
identical
;
i
++
)
{
...
...
@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() {
}
if
(
valid
)
return
;
// The list of which molecules are identical is no longer valid. We need to restore the
// atoms to their original order, rebuild the list of identical molecules, and sort them
// again.
vector
<
int4
>
newCellOffsets
(
numAtoms
);
if
(
useDoublePrecision
)
{
vector
<
double4
>
oldPosq
(
paddedNumAtoms
);
...
...
@@ -1393,3 +1392,41 @@ void CudaContext::WorkThread::flush() {
pthread_cond_wait
(
&
queueEmptyCondition
,
&
queueLock
);
pthread_mutex_unlock
(
&
queueLock
);
}
vector
<
int
>
CudaContext
::
getDevicePrecedence
()
{
int
numDevices
;
CUdevice
thisDevice
;
string
errorMessage
=
"Error initializing Context"
;
vector
<
pair
<
pair
<
int
,
int
>
,
int
>
>
devices
;
CHECK_RESULT
(
cuDeviceGetCount
(
&
numDevices
));
for
(
int
i
=
0
;
i
<
numDevices
;
i
++
)
{
CHECK_RESULT
(
cuDeviceGet
(
&
thisDevice
,
i
));
int
major
,
minor
,
clock
,
multiprocessors
,
speed
;
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
thisDevice
));
if
(
major
==
1
&&
minor
<
2
)
continue
;
if
((
useDoublePrecision
||
useMixedPrecision
)
&&
(
major
+
0.1
*
minor
<
1.3
))
continue
;
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
clock
,
CU_DEVICE_ATTRIBUTE_CLOCK_RATE
,
thisDevice
));
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
thisDevice
));
speed
=
clock
*
multiprocessors
;
pair
<
int
,
int
>
deviceProperties
=
std
::
make_pair
(
major
,
speed
);
devices
.
push_back
(
std
::
make_pair
(
deviceProperties
,
-
i
));
}
// sort first by compute capability (higher is better), then speed
// (higher is better), and finally device index (lower is better)
std
::
sort
(
devices
.
begin
(),
devices
.
end
());
std
::
reverse
(
devices
.
begin
(),
devices
.
end
());
vector
<
int
>
precedence
;
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
devices
.
size
());
i
++
)
{
precedence
.
push_back
(
-
devices
[
i
].
second
);
}
return
precedence
;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment