Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
f0e65455
Unverified
Commit
f0e65455
authored
Apr 25, 2022
by
HELSON
Committed by
GitHub
Apr 25, 2022
Browse files
[gemini] polish code (#855)
parent
29159d9b
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
6 additions
and
8 deletions
+6
-8
colossalai/gemini/stateful_tensor_mgr.py
colossalai/gemini/stateful_tensor_mgr.py
+2
-3
colossalai/gemini/tensor_placement_policy.py
colossalai/gemini/tensor_placement_policy.py
+4
-5
No files found.
colossalai/gemini/stateful_tensor_mgr.py
View file @
f0e65455
...
...
@@ -42,7 +42,7 @@ class StatefulTensorMgr(object):
by mem_stats_collector, which should belongs to a Sharded Model.
"""
# find stateful tensor in state COMPUTE
cuda_demand
=
0
cuda_demand
=
StatefulTensor
.
GST_MGR
.
state_mem
[
'cpu'
][
TensorState
.
COMPUTE
]
move_to_cuda_tensor_list
=
[]
hold_cuda_tensor_list
=
[]
for
tensor
in
self
.
_stateful_tensor_list
:
...
...
@@ -55,7 +55,6 @@ class StatefulTensorMgr(object):
elif
tensor
.
device
.
type
==
'cpu'
:
if
tensor
.
state
==
TensorState
.
COMPUTE
:
move_to_cuda_tensor_list
.
append
(
tensor
)
cuda_demand
+=
colo_tensor_mem_usage
(
tensor
.
payload
)[
1
]
else
:
raise
RuntimeError
self
.
_cpu_gpu_move_volume
+=
self
.
_tensor_placement_policy
.
evict_tensors
(
hold_cuda_tensor_list
,
...
...
@@ -66,7 +65,7 @@ class StatefulTensorMgr(object):
# move COMPUTE tensors to CUDA
for
t
in
move_to_cuda_tensor_list
:
colo_model_data_tensor_move_inline
(
t
,
get_current_device
())
self
.
_cpu_gpu_move_volume
+=
t
.
payload
.
numel
()
*
t
.
payload
.
element
_size
()
self
.
_cpu_gpu_move_volume
+=
t
.
payload_size
@
property
def
cpu_gpu_move_volume
(
self
):
...
...
colossalai/gemini/tensor_placement_policy.py
View file @
f0e65455
...
...
@@ -76,7 +76,6 @@ class AutoTensorPlacementPolicy(TensorPlacementPolicy):
Returns:
int: the volume of memory that is evicted
"""
volume
=
0
cuda_capacity
=
colo_device_memory_capacity
(
get_current_device
())
used_cuda_model_data
=
StatefulTensor
.
GST_MGR
.
total_mem
[
'cuda'
]
if
warmup
:
...
...
@@ -88,11 +87,12 @@ class AutoTensorPlacementPolicy(TensorPlacementPolicy):
cuda_capacity
*=
self
.
_steady_cuda_cap_ratio
total_cuda_model_data
=
cuda_capacity
-
max_cuda_non_model_data_per_period
avail_cuda_model_data
=
total_cuda_model_data
-
used_cuda_model_data
freed_cuda_model_data
=
0
if
avail_cuda_model_data
<
cuda_demand
:
# Move cuda_demand - avail_cuda_model_data volume of tensors
# to_free_cuda_model_data = cuda_demand - avail_cuda_model_data
to_free_cuda_model_data
=
cuda_demand
-
avail_cuda_model_data
freed_cuda_model_data
=
0
to_free_tensor_list
=
hold_cuda_tensor_list
if
not
warmup
:
next_compute_idx
=
{
t
:
len
(
compute_list
)
for
t
in
hold_cuda_tensor_list
}
...
...
@@ -104,15 +104,14 @@ class AutoTensorPlacementPolicy(TensorPlacementPolicy):
for
t
in
to_free_tensor_list
:
if
freed_cuda_model_data
>=
to_free_cuda_model_data
:
break
freed_cuda_model_data
+=
colo_tensor_mem_usage
(
t
)[
0
]
freed_cuda_model_data
+=
t
.
payload_size
colo_model_data_tensor_move_inline
(
t
,
torch
.
device
(
'cpu'
))
volume
+=
t
.
payload
.
numel
()
*
t
.
payload
.
element_size
()
if
freed_cuda_model_data
<
to_free_cuda_model_data
:
raise
RuntimeError
(
f
"Adjust layout failed! No enough CUDA memory! Need
{
to_free_cuda_model_data
}
, freed
{
freed_cuda_model_data
}
"
)
return
volume
return
freed_cuda_model_data
class
TensorPlacementPolicyFactory
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment