Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
dbf25377
Unverified
Commit
dbf25377
authored
Mar 30, 2023
by
Peter Eastman
Committed by
GitHub
Mar 30, 2023
Browse files
Improved load balancing between GPUs (#4013)
parent
ae42c911
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
22 additions
and
14 deletions
+22
-14
platforms/cuda/include/CudaParallelKernels.h
platforms/cuda/include/CudaParallelKernels.h
+2
-1
platforms/cuda/src/CudaParallelKernels.cpp
platforms/cuda/src/CudaParallelKernels.cpp
+13
-9
platforms/opencl/src/OpenCLParallelKernels.cpp
platforms/opencl/src/OpenCLParallelKernels.cpp
+7
-4
No files found.
platforms/cuda/include/CudaParallelKernels.h
View file @
dbf25377
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-20
19
Stanford University and the Authors. *
* Portions copyright (c) 2011-20
23
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -84,6 +84,7 @@ private:
std
::
vector
<
Kernel
>
kernels
;
std
::
vector
<
long
long
>
completionTimes
;
std
::
vector
<
double
>
contextNonbondedFractions
;
bool
loadBalance
;
int2
*
interactionCounts
;
CudaArray
contextForces
;
void
*
pinnedPositionBuffer
;
...
...
platforms/cuda/src/CudaParallelKernels.cpp
View file @
dbf25377
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-202
1
Stanford University and the Authors. *
* Portions copyright (c) 2011-202
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -95,17 +95,17 @@ class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public Cuda
public:
FinishComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
double
&
energy
,
long
long
&
completionTime
,
long
long
*
pinnedMemory
,
CudaArray
&
contextForces
,
bool
&
valid
,
int2
&
interactionCount
,
CUstream
stream
,
CUevent
event
,
CUevent
localEvent
)
:
bool
&
valid
,
int2
&
interactionCount
,
CUstream
stream
,
CUevent
event
,
CUevent
localEvent
,
bool
loadBalance
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
energy
(
energy
),
completionTime
(
completionTime
),
pinnedMemory
(
pinnedMemory
),
contextForces
(
contextForces
),
valid
(
valid
),
interactionCount
(
interactionCount
),
stream
(
stream
),
event
(
event
),
localEvent
(
localEvent
)
{
stream
(
stream
),
event
(
event
),
localEvent
(
localEvent
)
,
loadBalance
(
loadBalance
)
{
}
void
execute
()
{
// Execute the kernel, then download forces.
ContextSelector
selector
(
cu
);
energy
+=
kernel
.
finishComputation
(
context
,
includeForce
,
includeEnergy
,
groups
,
valid
);
if
(
cu
.
getComputeForceCount
()
<
200
)
{
if
(
loadBalance
)
{
// Record timing information for load balancing. Since this takes time, only do it at the start of the simulation.
CHECK_RESULT
(
cuCtxSynchronize
(),
"Error synchronizing CUDA context"
);
...
...
@@ -137,7 +137,7 @@ private:
ContextImpl
&
context
;
CudaContext
&
cu
;
CudaCalcForcesAndEnergyKernel
&
kernel
;
bool
includeForce
,
includeEnergy
;
bool
includeForce
,
includeEnergy
,
loadBalance
;
int
groups
;
double
&
energy
;
long
long
&
completionTime
;
...
...
@@ -182,8 +182,11 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
int
numContexts
=
data
.
contexts
.
size
();
for
(
int
i
=
0
;
i
<
numContexts
;
i
++
)
getKernel
(
i
).
initialize
(
system
);
for
(
int
i
=
0
;
i
<
numContexts
;
i
++
)
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
numContexts
;
for
(
int
i
=
0
;
i
<
contextNonbondedFractions
.
size
();
i
++
)
{
double
x0
=
i
/
(
double
)
contextNonbondedFractions
.
size
();
double
x1
=
(
i
+
1
)
/
(
double
)
contextNonbondedFractions
.
size
();
contextNonbondedFractions
[
i
]
=
x1
*
x1
-
x0
*
x0
;
}
CHECK_RESULT
(
cuEventCreate
(
&
event
,
cu
.
getEventFlags
()),
"Error creating event"
);
peerCopyEvent
.
resize
(
numContexts
);
peerCopyEventLocal
.
resize
(
numContexts
);
...
...
@@ -208,6 +211,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
pinnedForceBuffer
,
3
*
(
data
.
contexts
.
size
()
-
1
)
*
cu
.
getPaddedNumAtoms
()
*
sizeof
(
long
long
),
CU_MEMHOSTALLOC_PORTABLE
),
"Error allocating pinned memory"
);
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedPositionBuffer
,
cu
.
getPaddedNumAtoms
()
*
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double4
)
:
sizeof
(
float4
)),
CU_MEMHOSTALLOC_PORTABLE
),
"Error allocating pinned memory"
);
}
loadBalance
=
(
cu
.
getComputeForceCount
()
<
200
||
cu
.
getComputeForceCount
()
%
30
==
0
);
// Copy coordinates over to each device and execute the kernel.
...
...
@@ -239,7 +243,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
ComputeContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
data
.
contextEnergy
[
i
],
completionTimes
[
i
],
pinnedForceBuffer
,
contextForces
,
valid
,
interactionCounts
[
i
],
peerCopyStream
[
i
],
peerCopyEvent
[
i
],
peerCopyEventLocal
[
i
]));
pinnedForceBuffer
,
contextForces
,
valid
,
interactionCounts
[
i
],
peerCopyStream
[
i
],
peerCopyEvent
[
i
],
peerCopyEventLocal
[
i
]
,
loadBalance
));
}
data
.
syncContexts
();
CudaContext
&
cu
=
*
data
.
contexts
[
0
];
...
...
@@ -263,7 +267,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
// Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first.
if
(
cu
.
getComputeForceCount
()
<
200
)
{
if
(
loadBalance
)
{
int
firstIndex
=
0
,
lastIndex
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
...
...
platforms/opencl/src/OpenCLParallelKernels.cpp
View file @
dbf25377
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-20
19
Stanford University and the Authors. *
* Portions copyright (c) 2011-20
23
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -133,8 +133,11 @@ OpenCLParallelCalcForcesAndEnergyKernel::~OpenCLParallelCalcForcesAndEnergyKerne
void
OpenCLParallelCalcForcesAndEnergyKernel
::
initialize
(
const
System
&
system
)
{
for
(
int
i
=
0
;
i
<
(
int
)
kernels
.
size
();
i
++
)
getKernel
(
i
).
initialize
(
system
);
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
contextNonbondedFractions
.
size
();
for
(
int
i
=
0
;
i
<
contextNonbondedFractions
.
size
();
i
++
)
{
double
x0
=
i
/
(
double
)
contextNonbondedFractions
.
size
();
double
x1
=
(
i
+
1
)
/
(
double
)
contextNonbondedFractions
.
size
();
contextNonbondedFractions
[
i
]
=
x1
*
x1
-
x0
*
x0
;
}
}
void
OpenCLParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
...
...
@@ -184,7 +187,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
// Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first.
if
(
cl
.
getComputeForceCount
()
<
200
)
{
if
(
cl
.
getComputeForceCount
()
<
200
||
cl
.
getComputeForceCount
()
%
30
==
0
)
{
int
firstIndex
=
0
,
lastIndex
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment