Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
dbf25377
Unverified
Commit
dbf25377
authored
Mar 30, 2023
by
Peter Eastman
Committed by
GitHub
Mar 30, 2023
Browse files
Improved load balancing between GPUs (#4013)
parent
ae42c911
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
22 additions
and
14 deletions
+22
-14
platforms/cuda/include/CudaParallelKernels.h
platforms/cuda/include/CudaParallelKernels.h
+2
-1
platforms/cuda/src/CudaParallelKernels.cpp
platforms/cuda/src/CudaParallelKernels.cpp
+13
-9
platforms/opencl/src/OpenCLParallelKernels.cpp
platforms/opencl/src/OpenCLParallelKernels.cpp
+7
-4
No files found.
platforms/cuda/include/CudaParallelKernels.h
View file @
dbf25377
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2011-2019 Stanford University and the Authors. *
* Portions copyright (c) 2011-2023 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -84,6 +84,7 @@ private:
...
@@ -84,6 +84,7 @@ private:
std
::
vector
<
Kernel
>
kernels
;
std
::
vector
<
Kernel
>
kernels
;
std
::
vector
<
long
long
>
completionTimes
;
std
::
vector
<
long
long
>
completionTimes
;
std
::
vector
<
double
>
contextNonbondedFractions
;
std
::
vector
<
double
>
contextNonbondedFractions
;
bool
loadBalance
;
int2
*
interactionCounts
;
int2
*
interactionCounts
;
CudaArray
contextForces
;
CudaArray
contextForces
;
void
*
pinnedPositionBuffer
;
void
*
pinnedPositionBuffer
;
...
...
platforms/cuda/src/CudaParallelKernels.cpp
View file @
dbf25377
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2011-2021 Stanford University and the Authors. *
* Portions copyright (c) 2011-2023 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -95,17 +95,17 @@ class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public Cuda
...
@@ -95,17 +95,17 @@ class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public Cuda
public:
public:
FinishComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
FinishComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
double
&
energy
,
long
long
&
completionTime
,
long
long
*
pinnedMemory
,
CudaArray
&
contextForces
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
double
&
energy
,
long
long
&
completionTime
,
long
long
*
pinnedMemory
,
CudaArray
&
contextForces
,
bool
&
valid
,
int2
&
interactionCount
,
CUstream
stream
,
CUevent
event
,
CUevent
localEvent
)
:
bool
&
valid
,
int2
&
interactionCount
,
CUstream
stream
,
CUevent
event
,
CUevent
localEvent
,
bool
loadBalance
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
energy
(
energy
),
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
energy
(
energy
),
completionTime
(
completionTime
),
pinnedMemory
(
pinnedMemory
),
contextForces
(
contextForces
),
valid
(
valid
),
interactionCount
(
interactionCount
),
completionTime
(
completionTime
),
pinnedMemory
(
pinnedMemory
),
contextForces
(
contextForces
),
valid
(
valid
),
interactionCount
(
interactionCount
),
stream
(
stream
),
event
(
event
),
localEvent
(
localEvent
)
{
stream
(
stream
),
event
(
event
),
localEvent
(
localEvent
)
,
loadBalance
(
loadBalance
)
{
}
}
void
execute
()
{
void
execute
()
{
// Execute the kernel, then download forces.
// Execute the kernel, then download forces.
ContextSelector
selector
(
cu
);
ContextSelector
selector
(
cu
);
energy
+=
kernel
.
finishComputation
(
context
,
includeForce
,
includeEnergy
,
groups
,
valid
);
energy
+=
kernel
.
finishComputation
(
context
,
includeForce
,
includeEnergy
,
groups
,
valid
);
if
(
cu
.
getComputeForceCount
()
<
200
)
{
if
(
loadBalance
)
{
// Record timing information for load balancing. Since this takes time, only do it at the start of the simulation.
// Record timing information for load balancing. Since this takes time, only do it during the first 200 force computations and on every 30th one after that.
CHECK_RESULT
(
cuCtxSynchronize
(),
"Error synchronizing CUDA context"
);
CHECK_RESULT
(
cuCtxSynchronize
(),
"Error synchronizing CUDA context"
);
...
@@ -137,7 +137,7 @@ private:
...
@@ -137,7 +137,7 @@ private:
ContextImpl
&
context
;
ContextImpl
&
context
;
CudaContext
&
cu
;
CudaContext
&
cu
;
CudaCalcForcesAndEnergyKernel
&
kernel
;
CudaCalcForcesAndEnergyKernel
&
kernel
;
bool
includeForce
,
includeEnergy
;
bool
includeForce
,
includeEnergy
,
loadBalance
;
int
groups
;
int
groups
;
double
&
energy
;
double
&
energy
;
long
long
&
completionTime
;
long
long
&
completionTime
;
...
@@ -182,8 +182,11 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
...
@@ -182,8 +182,11 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
int
numContexts
=
data
.
contexts
.
size
();
int
numContexts
=
data
.
contexts
.
size
();
for
(
int
i
=
0
;
i
<
numContexts
;
i
++
)
for
(
int
i
=
0
;
i
<
numContexts
;
i
++
)
getKernel
(
i
).
initialize
(
system
);
getKernel
(
i
).
initialize
(
system
);
for
(
int
i
=
0
;
i
<
numContexts
;
i
++
)
for
(
int
i
=
0
;
i
<
contextNonbondedFractions
.
size
();
i
++
)
{
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
numContexts
;
double
x0
=
i
/
(
double
)
contextNonbondedFractions
.
size
();
double
x1
=
(
i
+
1
)
/
(
double
)
contextNonbondedFractions
.
size
();
contextNonbondedFractions
[
i
]
=
x1
*
x1
-
x0
*
x0
;
}
CHECK_RESULT
(
cuEventCreate
(
&
event
,
cu
.
getEventFlags
()),
"Error creating event"
);
CHECK_RESULT
(
cuEventCreate
(
&
event
,
cu
.
getEventFlags
()),
"Error creating event"
);
peerCopyEvent
.
resize
(
numContexts
);
peerCopyEvent
.
resize
(
numContexts
);
peerCopyEventLocal
.
resize
(
numContexts
);
peerCopyEventLocal
.
resize
(
numContexts
);
...
@@ -208,6 +211,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
...
@@ -208,6 +211,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
pinnedForceBuffer
,
3
*
(
data
.
contexts
.
size
()
-
1
)
*
cu
.
getPaddedNumAtoms
()
*
sizeof
(
long
long
),
CU_MEMHOSTALLOC_PORTABLE
),
"Error allocating pinned memory"
);
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
pinnedForceBuffer
,
3
*
(
data
.
contexts
.
size
()
-
1
)
*
cu
.
getPaddedNumAtoms
()
*
sizeof
(
long
long
),
CU_MEMHOSTALLOC_PORTABLE
),
"Error allocating pinned memory"
);
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedPositionBuffer
,
cu
.
getPaddedNumAtoms
()
*
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double4
)
:
sizeof
(
float4
)),
CU_MEMHOSTALLOC_PORTABLE
),
"Error allocating pinned memory"
);
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedPositionBuffer
,
cu
.
getPaddedNumAtoms
()
*
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double4
)
:
sizeof
(
float4
)),
CU_MEMHOSTALLOC_PORTABLE
),
"Error allocating pinned memory"
);
}
}
loadBalance
=
(
cu
.
getComputeForceCount
()
<
200
||
cu
.
getComputeForceCount
()
%
30
==
0
);
// Copy coordinates over to each device and execute the kernel.
// Copy coordinates over to each device and execute the kernel.
...
@@ -239,7 +243,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
...
@@ -239,7 +243,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
ComputeContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
ComputeContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
data
.
contextEnergy
[
i
],
completionTimes
[
i
],
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
data
.
contextEnergy
[
i
],
completionTimes
[
i
],
pinnedForceBuffer
,
contextForces
,
valid
,
interactionCounts
[
i
],
peerCopyStream
[
i
],
peerCopyEvent
[
i
],
peerCopyEventLocal
[
i
]));
pinnedForceBuffer
,
contextForces
,
valid
,
interactionCounts
[
i
],
peerCopyStream
[
i
],
peerCopyEvent
[
i
],
peerCopyEventLocal
[
i
]
,
loadBalance
));
}
}
data
.
syncContexts
();
data
.
syncContexts
();
CudaContext
&
cu
=
*
data
.
contexts
[
0
];
CudaContext
&
cu
=
*
data
.
contexts
[
0
];
...
@@ -263,7 +267,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
...
@@ -263,7 +267,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
// Balance work between the contexts by transferring a little nonbonded work from the context that
// Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first.
// finished last to the one that finished first.
if
(
cu
.
getComputeForceCount
()
<
200
)
{
if
(
loadBalance
)
{
int
firstIndex
=
0
,
lastIndex
=
0
;
int
firstIndex
=
0
,
lastIndex
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
...
...
platforms/opencl/src/OpenCLParallelKernels.cpp
View file @
dbf25377
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2011-2019 Stanford University and the Authors. *
* Portions copyright (c) 2011-2023 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -133,8 +133,11 @@ OpenCLParallelCalcForcesAndEnergyKernel::~OpenCLParallelCalcForcesAndEnergyKerne
...
@@ -133,8 +133,11 @@ OpenCLParallelCalcForcesAndEnergyKernel::~OpenCLParallelCalcForcesAndEnergyKerne
void
OpenCLParallelCalcForcesAndEnergyKernel
::
initialize
(
const
System
&
system
)
{
void
OpenCLParallelCalcForcesAndEnergyKernel
::
initialize
(
const
System
&
system
)
{
for
(
int
i
=
0
;
i
<
(
int
)
kernels
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
kernels
.
size
();
i
++
)
getKernel
(
i
).
initialize
(
system
);
getKernel
(
i
).
initialize
(
system
);
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
contextNonbondedFractions
.
size
();
i
++
)
{
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
contextNonbondedFractions
.
size
();
double
x0
=
i
/
(
double
)
contextNonbondedFractions
.
size
();
double
x1
=
(
i
+
1
)
/
(
double
)
contextNonbondedFractions
.
size
();
contextNonbondedFractions
[
i
]
=
x1
*
x1
-
x0
*
x0
;
}
}
}
void
OpenCLParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
void
OpenCLParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
...
@@ -184,7 +187,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
...
@@ -184,7 +187,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
// Balance work between the contexts by transferring a little nonbonded work from the context that
// Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first.
// finished last to the one that finished first.
if
(
cl
.
getComputeForceCount
()
<
200
)
{
if
(
cl
.
getComputeForceCount
()
<
200
||
cl
.
getComputeForceCount
()
%
30
==
0
)
{
int
firstIndex
=
0
,
lastIndex
=
0
;
int
firstIndex
=
0
,
lastIndex
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment