Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
f32c804b
Commit
f32c804b
authored
Nov 04, 2014
by
peastman
Browse files
Improvements to multi-GPU performance
parent
5b591ab0
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
16 additions
and
8 deletions
+16
-8
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+2
-0
platforms/cuda/src/CudaParallelKernels.cpp
platforms/cuda/src/CudaParallelKernels.cpp
+12
-8
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+2
-0
No files found.
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
f32c804b
...
@@ -342,6 +342,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -342,6 +342,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
void
CudaNonbondedUtilities
::
prepareInteractions
()
{
void
CudaNonbondedUtilities
::
prepareInteractions
()
{
if
(
!
useCutoff
)
if
(
!
useCutoff
)
return
;
return
;
if
(
numTiles
==
0
)
return
;
if
(
usePeriodic
)
{
if
(
usePeriodic
)
{
double4
box
=
context
.
getPeriodicBoxSize
();
double4
box
=
context
.
getPeriodicBoxSize
();
double
minAllowedSize
=
1.999999
*
cutoff
;
double
minAllowedSize
=
1.999999
*
cutoff
;
...
...
platforms/cuda/src/CudaParallelKernels.cpp
View file @
f32c804b
...
@@ -71,12 +71,7 @@ public:
...
@@ -71,12 +71,7 @@ public:
cu
.
setAsCurrent
();
cu
.
setAsCurrent
();
if
(
cu
.
getContextIndex
()
>
0
)
{
if
(
cu
.
getContextIndex
()
>
0
)
{
if
(
cu
.
getPlatformData
().
peerAccessSupported
&&
false
)
{
// Why is the peer-to-peer copy slower???
if
(
!
cu
.
getPlatformData
().
peerAccessSupported
)
{
CudaContext
&
context0
=
*
cu
.
getPlatformData
().
contexts
[
0
];
int
numBytes
=
cu
.
getPosq
().
getSize
()
*
cu
.
getPosq
().
getElementSize
();
CHECK_RESULT
(
cuMemcpyAsync
(
cu
.
getPosq
().
getDevicePointer
(),
context0
.
getPosq
().
getDevicePointer
(),
numBytes
,
0
),
"Error copying positions"
);
}
else
{
cuStreamWaitEvent
(
cu
.
getCurrentStream
(),
event
,
0
);
cuStreamWaitEvent
(
cu
.
getCurrentStream
(),
event
,
0
);
cu
.
getPosq
().
upload
(
pinnedMemory
,
false
);
cu
.
getPosq
().
upload
(
pinnedMemory
,
false
);
}
}
...
@@ -117,7 +112,8 @@ public:
...
@@ -117,7 +112,8 @@ public:
cu
.
getForce
().
download
(
&
pinnedMemory
[(
cu
.
getContextIndex
()
-
1
)
*
numAtoms
*
3
]);
cu
.
getForce
().
download
(
&
pinnedMemory
[(
cu
.
getContextIndex
()
-
1
)
*
numAtoms
*
3
]);
}
}
else
{
else
{
CHECK_RESULT
(
cuCtxSynchronize
(),
"Error synchronizing CUDA context"
);
// In principle this should make the load balancing more accurate, but in practice it just seems to make things slower.
//CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
}
}
}
}
completionTime
=
getTime
();
completionTime
=
getTime
();
...
@@ -175,10 +171,18 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
...
@@ -175,10 +171,18 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
// Copy coordinates over to each device and execute the kernel.
// Copy coordinates over to each device and execute the kernel.
if
(
!
(
cu
.
getPlatformData
().
peerAccessSupported
&&
false
))
{
// Why is this faster than a peer-to-peer copy???
if
(
!
cu
.
getPlatformData
().
peerAccessSupported
)
{
cu
.
getPosq
().
download
(
pinnedPositionBuffer
,
false
);
cu
.
getPosq
().
download
(
pinnedPositionBuffer
,
false
);
cuEventRecord
(
event
,
cu
.
getCurrentStream
());
cuEventRecord
(
event
,
cu
.
getCurrentStream
());
}
}
else
{
int
numBytes
=
cu
.
getPosq
().
getSize
()
*
cu
.
getPosq
().
getElementSize
();
for
(
int
i
=
1
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
data
.
contexts
[
i
]
->
setAsCurrent
();
CHECK_RESULT
(
cuMemcpyAsync
(
data
.
contexts
[
i
]
->
getPosq
().
getDevicePointer
(),
cu
.
getPosq
().
getDevicePointer
(),
numBytes
,
0
),
"Error copying positions"
);
}
cu
.
setAsCurrent
();
}
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
data
.
contextEnergy
[
i
]
=
0.0
;
data
.
contextEnergy
[
i
]
=
0.0
;
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
f32c804b
...
@@ -386,6 +386,8 @@ static void setInvPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int
...
@@ -386,6 +386,8 @@ static void setInvPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int
void
OpenCLNonbondedUtilities
::
prepareInteractions
()
{
void
OpenCLNonbondedUtilities
::
prepareInteractions
()
{
if
(
!
useCutoff
)
if
(
!
useCutoff
)
return
;
return
;
if
(
numTiles
==
0
)
return
;
if
(
usePeriodic
)
{
if
(
usePeriodic
)
{
mm_float4
box
=
context
.
getPeriodicBoxSize
();
mm_float4
box
=
context
.
getPeriodicBoxSize
();
double
minAllowedSize
=
1.999999
*
cutoff
;
double
minAllowedSize
=
1.999999
*
cutoff
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment