Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
d7e0303d
Unverified
Commit
d7e0303d
authored
Apr 24, 2022
by
ver217
Committed by
GitHub
Apr 24, 2022
Browse files
[zero] use GeminiMemoryManager when sampling model data (#850)
parent
232142f4
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
5 additions
and
31 deletions
+5
-31
colossalai/gemini/memory_tracer/memstats_collector.py
colossalai/gemini/memory_tracer/memstats_collector.py
+3
-20
colossalai/gemini/tensor_placement_policy.py
colossalai/gemini/tensor_placement_policy.py
+1
-2
colossalai/zero/sharded_model/sharded_model_v2.py
colossalai/zero/sharded_model/sharded_model_v2.py
+0
-3
colossalai/zero/sharded_optim/sharded_optim_v2.py
colossalai/zero/sharded_optim/sharded_optim_v2.py
+1
-6
No files found.
colossalai/gemini/memory_tracer/memstats_collector.py
View file @
d7e0303d
from
colossalai.gemini.memory_tracer
import
GLOBAL_MODEL_DATA_TRACER
from
colossalai.gemini.memory_tracer
import
SyncCudaMemoryMonitor
from
colossalai.utils.memory
import
colo_device_memory_used
from
colossalai.gemini.stateful_tensor
import
StatefulTensor
import
torch
import
time
...
...
@@ -92,7 +92,8 @@ class MemStatsCollector:
"""Sampling model data statistics.
"""
if
self
.
_start_flag
:
cuda_mem
,
cpu_mem
=
GLOBAL_MODEL_DATA_TRACER
.
both_mem_usage
cuda_mem
=
StatefulTensor
.
GST_MGR
.
total_mem
[
'cuda'
]
cpu_mem
=
StatefulTensor
.
GST_MGR
.
total_mem
[
'cpu'
]
self
.
_model_data_cuda_list
.
append
(
cuda_mem
)
self
.
_model_data_cpu_list
.
append
(
cpu_mem
)
...
...
@@ -114,24 +115,6 @@ class MemStatsCollector:
self
.
_sampling_time
.
append
(
time
.
time
())
self
.
_mem_monitor
.
start
()
def
sample_memstats
(
self
)
->
None
:
"""
Sampling memory statistics.
Record the current model data CUDA memory usage as well as system CUDA memory usage.
Advance the sampling counter.
"""
if
self
.
_start_flag
:
self
.
_model_data_cuda_list
.
append
(
GLOBAL_MODEL_DATA_TRACER
.
cuda_usage
)
self
.
_overall_cuda_list
.
append
(
self
.
_mem_monitor
.
finish
())
self
.
_non_model_data_cuda_list
.
append
(
self
.
_overall_cuda_list
[
-
1
]
-
self
.
_model_data_cuda_list
[
-
1
])
self
.
_model_data_cpu_list
.
append
(
GLOBAL_MODEL_DATA_TRACER
.
cpu_usage
)
# FIXME(jiaruifang) cpu sys used should also return from self._mem_monitor()
self
.
_overall_cpu_list
.
append
(
colo_device_memory_used
(
torch
.
device
(
f
'cpu'
)))
self
.
_non_model_data_cpu_list
.
append
(
self
.
_overall_cpu_list
[
-
1
]
-
self
.
_model_data_cpu_list
[
-
1
])
self
.
_sampling_time
.
append
(
time
.
time
())
self
.
_mem_monitor
.
start
()
def
clear
(
self
)
->
None
:
self
.
_model_data_cuda_list
=
[]
self
.
_overall_cuda_list
=
[]
...
...
colossalai/gemini/tensor_placement_policy.py
View file @
d7e0303d
...
...
@@ -7,7 +7,6 @@ from colossalai.utils.memory import colo_device_memory_capacity
from
colossalai.gemini.tensor_utils
import
colo_model_data_tensor_move_inline
,
colo_tensor_mem_usage
from
colossalai.gemini.stateful_tensor
import
StatefulTensor
from
colossalai.gemini.memory_tracer
import
MemStatsCollector
from
colossalai.gemini.memory_tracer
import
GLOBAL_MODEL_DATA_TRACER
from
typing
import
Type
...
...
@@ -79,7 +78,7 @@ class AutoTensorPlacementPolicy(TensorPlacementPolicy):
"""
volume
=
0
cuda_capacity
=
colo_device_memory_capacity
(
get_current_device
())
used_cuda_model_data
=
GLOBAL_MODEL_DATA_TRACER
.
cuda_usage
used_cuda_model_data
=
StatefulTensor
.
GST_MGR
.
total_mem
[
'cuda'
]
if
warmup
:
# We designate a part of CUDA memory for model data in warmup iterations.
max_cuda_non_model_data_per_period
=
cuda_capacity
*
self
.
_warmup_non_model_data_ratio
...
...
colossalai/zero/sharded_model/sharded_model_v2.py
View file @
d7e0303d
...
...
@@ -13,8 +13,6 @@ from colossalai.engine.paramhooks import BaseParamHookMgr
from
colossalai.logging
import
get_dist_logger
from
colossalai.utils
import
get_current_device
,
disposable
from
colossalai.gemini.memory_tracer.memstats_collector
import
MemStatsCollector
from
colossalai.gemini.memory_tracer.model_data_memtracer
import
\
GLOBAL_MODEL_DATA_TRACER
from
colossalai.utils.memory
import
colo_device_memory_capacity
from
colossalai.zero.shard_utils
import
BaseShardStrategy
from
colossalai.zero.sharded_model.reduce_scatter
import
ReduceScatterBucketer
...
...
@@ -106,7 +104,6 @@ class ShardedModelV2(nn.Module):
self
.
_use_memory_tracer
=
tensor_placement_policy
==
'auto'
if
self
.
_use_memory_tracer
:
GLOBAL_MODEL_DATA_TRACER
.
register_model
(
self
)
self
.
_memstats_collector
=
MemStatsCollector
()
self
.
_start_collect_memstats
=
disposable
(
self
.
_memstats_collector
.
start_collection
)
self
.
_finish_collect_memstats
=
disposable
(
self
.
_memstats_collector
.
finish_collection
)
...
...
colossalai/zero/sharded_optim/sharded_optim_v2.py
View file @
d7e0303d
...
...
@@ -10,10 +10,7 @@ from colossalai.context.parallel_mode import ParallelMode
from
colossalai.core
import
global_context
as
gpc
from
colossalai.logging
import
get_dist_logger
from
colossalai.nn.optimizer
import
ColossalaiOptimizer
from
colossalai.gemini.memory_tracer.model_data_memtracer
import
\
GLOBAL_MODEL_DATA_TRACER
from
colossalai.gemini.tensor_utils
import
(
colo_model_data_tensor_move_inline
,
colo_model_tensor_clone
,
colo_tensor_mem_usage
)
from
colossalai.gemini.tensor_utils
import
(
colo_model_data_tensor_move_inline
,
colo_tensor_mem_usage
)
from
colossalai.zero.sharded_model
import
ShardedModelV2
from
colossalai.zero.sharded_model._utils
import
cast_tensor_to_fp32
from
torch
import
Tensor
...
...
@@ -130,8 +127,6 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
f
"After init ShardedOptimizerV2 consumes
{
self
.
get_memory_usage
()[
0
]
/
1e6
}
MB CUDA Memory!"
,
ranks
=
[
0
])
self
.
_use_memory_tracer
=
self
.
model
.
use_memory_tracer
if
self
.
_use_memory_tracer
:
GLOBAL_MODEL_DATA_TRACER
.
register_optimizer
(
self
)
@
property
def
loss_scale
(
self
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment