OpenDAS / ColossalAI · Commits · e956d93a

Unverified commit e956d93a, authored Apr 01, 2022 by Jiarui Fang, committed by GitHub on Apr 01, 2022.

[refactor] memory utils (#577)

Parent: 104cbbb3

Showing 15 changed files with 265 additions and 206 deletions (+265 −206).
colossalai/engine/gradient_handler/_moe_gradient_handler.py    +1   -0
colossalai/engine/ophooks/zero_hook.py                         +1   -2
colossalai/utils/memory_tracer/async_memtracer.py              +2   -2
colossalai/utils/memory_tracer/memstats_collector.py           +68  -46
colossalai/utils/memory_tracer/test_memstats_collector.py      +1   -4
colossalai/utils/memory_utils/memory_monitor.py                +0   -23
colossalai/utils/memory_utils/utils.py                         +55  -113
colossalai/zero/shard_utils/tensor_shard_strategy.py           +1   -1
colossalai/zero/shard_utils/tensor_utils.py (new file)         +117 -0
colossalai/zero/sharded_model/sharded_model_v2.py              +10  -6
colossalai/zero/sharded_optim/sharded_optim_v2.py              +3   -2
colossalai/zero/sharded_param/sharded_param.py                 +1   -2
tests/test_utils/test_commons.py                               +1   -1
tests/test_utils/test_tensor_move.py                           +1   -1
tests/test_zero_data_parallel/test_init_context.py             +3   -3
colossalai/engine/gradient_handler/_moe_gradient_handler.py

```diff
@@ -29,6 +29,7 @@ class MoeGradientHandler(BaseGradientHandler):
         if global_data > 1:
             epsize_param_dict = get_moe_epsize_param_dict(self._model)

+            # epsize is 1, indicating the params are replicated among processes in data parallelism
             # use the ParallelMode.DATA to get data parallel group
             # reduce gradients for all parameters in data parallelism
```
colossalai/engine/ophooks/zero_hook.py

```diff
@@ -10,8 +10,7 @@ from colossalai.zero.sharded_param.tensorful_state import TensorState

 from ._base_ophook import BaseOpHook
-from colossalai.utils.memory_utils.utils import \
-    colo_model_data_tensor_move_inline
+from colossalai.zero.shard_utils.tensor_utils import colo_model_data_tensor_move_inline


 @OPHOOKS.register_module
```
colossalai/utils/memory_tracer/async_memtracer.py

```diff
@@ -4,8 +4,8 @@ import pickle

 import torch

+from colossalai.utils.memory_utils.utils import colo_device_memory_used
 from colossalai.utils import get_current_device
-from colossalai.utils.memory_utils.memory_monitor import colo_cuda_memory_used


 class AsyncMemoryMonitor:
@@ -82,7 +82,7 @@ class AsyncMemoryMonitor:
         while self.keep_measuring:
             max_usage = max(
                 max_usage,
-                colo_cuda_memory_used(),
+                colo_device_memory_used(get_current_device()),
             )
             sleep(self.interval)
         return max_usage
```
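The sampling loop above is the only behavioral change in this file: the monitor now samples through the device-agnostic `colo_device_memory_used`. For readers unfamiliar with the surrounding class, here is a minimal standalone sketch of the same background-polling pattern, with plain `torch.cuda.memory_allocated` standing in for the project's sampling function (an assumption; the real class delegates to `colo_device_memory_used`):

```python
from concurrent.futures import ThreadPoolExecutor
from time import sleep

import torch


class TinyMemoryMonitor:
    """Minimal sketch of AsyncMemoryMonitor's polling pattern (assumes a CUDA device)."""

    def __init__(self, interval: float = 0.01) -> None:
        self.interval = interval          # seconds between samples
        self.keep_measuring = False
        self._executor = ThreadPoolExecutor(max_workers=1)
        self._future = None

    def start(self) -> None:
        # kick off the sampling loop in a worker thread
        self.keep_measuring = True
        self._future = self._executor.submit(self._measure)

    def finish(self) -> int:
        # stop the loop and return the peak bytes observed while it ran
        self.keep_measuring = False
        return self._future.result()

    def _measure(self) -> int:
        max_usage = 0
        while self.keep_measuring:
            # stand-in for colo_device_memory_used(get_current_device())
            max_usage = max(max_usage, torch.cuda.memory_allocated())
            sleep(self.interval)
        return max_usage
```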
colossalai/utils/memory_tracer/memstats_collector.py

```diff
 from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DATA_TRACER
-from colossalai.utils.memory_utils.memory_monitor import colo_cuda_memory_used
+from colossalai.utils.memory_utils.utils import colo_device_memory_used
 from colossalai.utils import get_current_device

 import torch

 from typing import Tuple
+from typing import List


 class SamplingCounter:
@@ -23,45 +23,71 @@ class SamplingCounter:

 class MemStatsCollector:
+    """
+    A Memory statistic collector.
+    It works in two phases.
+    Phase 1. Collection Phase: collect memory usage statistics of CPU and GPU.
+             The first iteration of DNN training.
+    Phase 2. Runtime Phase: use the read-only collected stats.
+             The rest iterations of DNN training.
+
+    It has a Sampling counter, which is reset after each DNN training iteration.
+    """

     def __init__(self) -> None:
-        """
-        Collecting Memory Statistics.
-        It has two phases.
-        1. Collection Phase: collect memory usage statistics
-        2. Runtime Phase: do not collect statistics.
-        """
         self._sampling_cnter = SamplingCounter()
-        self._model_data_cuda = []
-        self._overall_cuda = []
+        self._model_data_cuda_list = []
+        self._overall_cuda_list = []

-        # TODO(jiaruifang) Now no cpu mem stats collecting
-        self._model_data_cpu = []
-        self._overall_cpu = []
+        self._model_data_cpu_list = []
+        self._overall_cpu_list = []

         self._start_flag = False

-    @property
-    def overall_cuda(self):
-        return self._overall_cuda
-
-    @property
-    def model_data_cuda_GB(self):
-        return [elem / 1e9 for elem in self._model_data_cuda]
-
-    @property
-    def model_data_cuda(self):
-        return self._model_data_cuda
-
-    @property
-    def non_model_data_cuda_GB(self):
-        return [elem / 1e9 for elem in self.non_model_data_cuda]
-
-    @property
-    def non_model_data_cuda(self):
-        """Non model data stats
-        """
-        return [(v1 - v2) for v1, v2 in zip(self._overall_cuda, self._model_data_cuda)]
+    def overall_mem_stats(self, device_type: str):
+        if device_type == 'cuda':
+            return self._overall_cuda_list
+        elif device_type == 'cpu':
+            return self._overall_cpu_list
+        else:
+            raise TypeError
+
+    def model_data_cuda_list(self, device_type: str, unit: str = 'B') -> List[int]:
+        scale = 1
+        if unit == 'GB':
+            scale = 1e9
+        elif unit == 'MB':
+            scale = 1e6
+        elif unit == 'KB':
+            scale = 1e3
+        else:
+            raise TypeError
+
+        if device_type == 'cuda':
+            return [elem / scale for elem in self._model_data_cuda_list]
+        elif device_type == 'cpu':
+            return [elem / scale for elem in self._model_data_cpu_list]
+        else:
+            raise TypeError
+
+    def non_model_data_cuda_list(self, device_type: str, unit: str = 'B') -> List[int]:
+        """Non model data stats
+        """
+        scale = 1
+        if unit == 'GB':
+            scale = 1e9
+        elif unit == 'MB':
+            scale = 1e6
+        elif unit == 'KB':
+            scale = 1e3
+
+        if device_type == 'cuda':
+            return [(v1 - v2) / scale for v1, v2 in zip(self._overall_cuda_list, self._model_data_cuda_list)]
+        elif device_type == 'cpu':
+            return [(v1 - v2) / scale for v1, v2 in zip(self._overall_cpu_list, self._model_data_cpu_list)]
+        else:
+            raise TypeError

     def start_collection(self):
         self._start_flag = True
@@ -73,32 +99,28 @@ class MemStatsCollector:
         """
         Sampling memory statistics.
         Record the current model data CUDA memory usage as well as system CUDA memory usage.
+        Advance the sampling counter.
         """
         if self._start_flag:
             sampling_cnt = self._sampling_cnter.sampling_cnt
-            assert sampling_cnt == len(self._overall_cuda)
-            self._model_data_cuda.append(GLOBAL_MODEL_DATA_TRACER.cuda_usage)
-            self._overall_cuda.append(colo_cuda_memory_used(torch.device(f'cuda:{get_current_device()}')))
-        self._sampling_cnter.advance()
+            assert sampling_cnt == len(self._overall_cuda_list)

-    def fetch_memstats(self) -> Tuple[int, int]:
-        """
-        returns cuda usage of model data and overall cuda usage.
-        """
-        sampling_cnt = self._sampling_cnter.sampling_cnt
-        if len(self._model_data_cuda) < sampling_cnt:
-            raise RuntimeError
-        return (self._model_data_cuda[sampling_cnt], self._overall_cuda[sampling_cnt])
+            self._model_data_cuda_list.append(GLOBAL_MODEL_DATA_TRACER.cuda_usage)
+            self._overall_cuda_list.append(colo_device_memory_used(get_current_device()))
+
+            self._model_data_cpu_list.append(GLOBAL_MODEL_DATA_TRACER.cpu_usage)
+            self._overall_cpu_list.append(colo_device_memory_used(torch.device('cpu')))
+        self._sampling_cnter.advance()

     def reset_sampling_cnter(self) -> None:
         self._sampling_cnter.reset()

     def clear(self) -> None:
-        self._model_data_cuda = []
-        self._overall_cuda = []
+        self._model_data_cuda_list = []
+        self._overall_cuda_list = []

-        self._model_data_cpu = []
-        self._overall_cpu = []
+        self._model_data_cpu_list = []
+        self._overall_cpu_list = []

         self._start_flag = False
         self._sampling_cnter.reset()
```
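The net effect of this refactor is an API change: the `overall_cuda` / `model_data_cuda_GB` properties and `fetch_memstats` give way to accessors parameterized by device type and unit. A hedged usage sketch of the new interface, using only the methods visible in the diff above (the sampling count is illustrative):

```python
collector = MemStatsCollector()

# Phase 1: the first training iteration runs with collection enabled.
collector.start_collection()
for _ in range(4):    # stand-in for the op hooks that trigger sampling
    collector.sample_memstats()

# Phase 2: later iterations read the collected series.
overall_bytes = collector.overall_mem_stats('cuda')                     # raw bytes
model_gb = collector.model_data_cuda_list('cuda', unit='GB')            # scaled to GB
non_model_gb = collector.non_model_data_cuda_list('cuda', unit='GB')    # overall - model

collector.reset_sampling_cnter()    # rewind the counter between iterations
collector.clear()                   # drop all series when done
```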
colossalai/utils/memory_tracer/test_memstats_collector.py

```diff
@@ -30,10 +30,7 @@ def test_mem_collector():
     collector.sample_memstats()
     collector.sample_memstats()

-    cuda_use, overall_use = collector.fetch_memstats()
-    print(cuda_use, overall_use)
-
-    print(collector.overall_cuda)
+    print(collector.overall_mem_stats('cuda'))


 if __name__ == '__main__':
```
colossalai/utils/memory_utils/memory_monitor.py

```diff
@@ -9,29 +9,6 @@ import torch
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
-from colossalai.utils.cuda import get_current_device
-from typing import Optional
-
-
-def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
-    """Get the memory usage of the device.
-
-    Args:
-        device (Optional[``torch.device``]): a torch device instance or None. Defaults to None.
-
-    Returns:
-        int: current memory usage, sized in Bytes.
-    """
-    if device:
-        assert device.type == 'cuda'
-    else:
-        device = torch.device(f'cuda:{get_current_device()}')
-
-    ret: int = torch.cuda.memory_allocated(device)
-    # get the peak memory to report correct data, so reset the counter for the next call
-    if hasattr(torch.cuda, "reset_peak_memory_stats"):    # pytorch 1.4+
-        torch.cuda.reset_peak_memory_stats(device)
-    return ret


 def bytes_to_GB(val, decimal=2):
```
colossalai/utils/memory_utils/utils.py

The five tensor helpers deleted from this file (`colo_tensor_mem_usage` and the four move/clone functions) reappear unchanged in the new `colossalai/zero/shard_utils/tensor_utils.py` below, so their bodies are elided here.

```diff
 import torch
+from colossalai.context.parallel_mode import ParallelMode
 from colossalai.utils import get_current_device
-from colossalai.zero.sharded_param.tensorful_state import StatefulTensor
-from typing import Tuple, Union
+from collections import namedtuple
+import psutil
+from colossalai.core import global_context as gpc

 _GLOBAL_CUDA_MEM_FRACTION = 1.0


-def colo_tensor_mem_usage(tensor: Union[torch.Tensor, StatefulTensor]) -> Tuple[int, int]:
-    ...    # body moved to colossalai/zero/shard_utils/tensor_utils.py
-
-
+# copy from PatrickStar
+def _get_cpu_memory_info():
+    ps_mem_info = namedtuple("ps_mem_info", ["total", "free", "cached", "buffers", "used"])
+    try:
+        # psutil reads the memory info from /proc/memory_info,
+        # which results in returning the host memory instead of
+        # that of container.
+        # Here we try to read the container memory with method in:
+        # https://stackoverflow.com/a/46213331/5163915
+        mems = {}
+        with open("/sys/fs/cgroup/memory/memory.meminfo", "rb") as f:
+            for line in f:
+                fields = line.split()
+                mems[fields[0]] = int(fields[1]) * 1024
+        total = mems[b"MemTotal:"]
+        free = mems[b"MemFree:"]
+        cached = mems[b"Cached:"]
+        buffers = mems[b"Buffers:"]
+        used = total - free - cached - buffers
+        if used < 0:
+            used = total - free
+        mem_info = ps_mem_info(total=total, free=free, cached=cached, buffers=buffers, used=used)
+    except FileNotFoundError:
+        mems = psutil.virtual_memory()
+        mem_info = ps_mem_info(
+            total=mems.total,
+            free=mems.free,
+            cached=mems.cached,
+            buffers=mems.buffers,
+            used=mems.used,
+        )
+    return mem_info
+
+
+def colo_device_memory_used(device) -> int:
+    if not isinstance(device, torch.device):
+        device = torch.device(f"cuda:{device}")
+    if device.type == 'cpu':
+        mem_info = _get_cpu_memory_info()
+        # FIXME(jiaruifang) only work for 1-CPU multi-GPU
+        # CPU memory is sharded with all processes
+        # Not support multi-GPU multi-CPU
+        # We need a local_world_size here
+        ret = mem_info.used / gpc.get_world_size(ParallelMode.DATA)
+        return ret
+    elif device.type == 'cuda':
+        ret: int = torch.cuda.memory_allocated(device)
+        # get the peak memory to report correct data, so reset the counter for the next call
+        if hasattr(torch.cuda, "reset_peak_memory_stats"):    # pytorch 1.4+
+            torch.cuda.reset_peak_memory_stats(device)
+        return ret
+
+
 def colo_set_process_memory_fraction(ratio: float) -> None:
@@ -44,97 +80,3 @@ def colo_cuda_memory_capacity() -> float:
     Get cuda memory capacity of the current cuda.
     """
     return torch.cuda.get_device_properties(get_current_device()).total_memory * _GLOBAL_CUDA_MEM_FRACTION
-
-
-def colo_model_data_tensor_move(src_t: Union[StatefulTensor, torch.Tensor],
-                                tgt_t: Union[StatefulTensor, torch.Tensor]) -> None:
-    ...    # body moved to tensor_utils.py
-
-
-def colo_model_data_tensor_move_inline(t: Union[StatefulTensor, torch.Tensor],
-                                       target_device: Union[torch.device, int]) -> None:
-    ...    # body moved to tensor_utils.py
-
-
-def colo_model_data_move_to_cpu(t: Union[StatefulTensor, torch.Tensor]) -> None:
-    ...    # body moved to tensor_utils.py
-
-
-def colo_model_tensor_clone(t: Union[StatefulTensor, torch.Tensor], target_device: torch.device) -> torch.Tensor:
-    ...    # body moved to tensor_utils.py
```
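With this change, `colo_device_memory_used` becomes the single entry point for both host and device usage, dispatching on `device.type`. A hedged usage sketch (the CPU branch divides machine-wide usage by the data-parallel world size, so it presupposes an initialized Colossal-AI distributed context):

```python
import torch
from colossalai.utils import get_current_device
from colossalai.utils.memory_utils.utils import colo_device_memory_used

# CUDA memory currently allocated on the local device, in bytes.
cuda_bytes = colo_device_memory_used(get_current_device())

# Host memory attributed to this process (machine-wide usage / data-parallel world size).
cpu_bytes = colo_device_memory_used(torch.device('cpu'))

print(f'cuda: {cuda_bytes / 1e6:.1f} MB, cpu: {cpu_bytes / 1e6:.1f} MB')
```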
colossalai/zero/shard_utils/tensor_shard_strategy.py

```diff
@@ -3,7 +3,7 @@ from typing import List, Optional
 import torch
 import torch.distributed as dist
 from colossalai.utils import get_current_device
-from colossalai.utils.memory_utils.utils import colo_model_data_tensor_move_inline
+from colossalai.zero.shard_utils.tensor_utils import colo_model_data_tensor_move_inline
 from colossalai.zero.shard_utils import BaseShardStrategy
 from colossalai.zero.shard_utils.commons import get_shard
 from colossalai.zero.sharded_param.sharded_tensor import ShardedTensor
```
colossalai/zero/shard_utils/tensor_utils.py — new file (0 → 100644); the entire content below is added:

```python
import torch
from colossalai.zero.sharded_param.tensorful_state import StatefulTensor
from typing import Union, Tuple


def colo_tensor_mem_usage(tensor: Union[torch.Tensor, StatefulTensor]) -> Tuple[int, int]:
    if issubclass(type(tensor), StatefulTensor):
        t = tensor.payload
    elif isinstance(tensor, torch.Tensor):
        t = tensor
    else:
        return 0, 0

    cuda_use, cpu_use = 0, 0

    mem_use = t.numel() * t.element_size()
    if t.device.type == 'cuda':
        cuda_use += mem_use
    elif t.device.type == 'cpu':
        cpu_use += mem_use

    return cuda_use, cpu_use


def colo_model_data_tensor_move(src_t: Union[StatefulTensor, torch.Tensor],
                                tgt_t: Union[StatefulTensor, torch.Tensor]) -> None:
    """
    A colossal API for model data tensor move.
    The src and target tensors could be resident on both CPU and GPU.

    NOTE() The source tensor payload will be removed after this function.

    The function will record the communication volume between CPU and GPU.
    Args:
        src_t (Union[StatefulTensor, torch.Tensor]): source tensor
        tgt_t (Union[StatefulTensor, torch.Tensor]): target tensor
    """
    if issubclass(type(src_t), StatefulTensor):
        src_t_payload = src_t.payload
    else:
        src_t_payload = src_t.data
    src_dev = src_t_payload.device
    if issubclass(type(tgt_t), StatefulTensor):
        tgt_t_payload = tgt_t.payload
    else:
        tgt_t_payload = tgt_t.data

    tgt_t_payload.copy_(src_t_payload)

    # remove the payload of src_t
    if issubclass(type(src_t), StatefulTensor):
        src_t.reset_payload(torch.tensor([], device=src_dev, dtype=src_t_payload.dtype))
    else:
        src_t.data = torch.tensor([], device=src_dev, dtype=src_t_payload.dtype)


def colo_model_data_tensor_move_inline(t: Union[StatefulTensor, torch.Tensor],
                                       target_device: Union[torch.device, int]) -> None:
    """
    Move a tensor to the target_device.
    Args:
        t (Union[StatefulTensor, torch.Tensor]): the tensor to be moved
        target_device: the target device; if the type is int, it is the index of the cuda card
    """
    if isinstance(t, torch.Tensor):
        t_payload = t
    elif issubclass(type(t), StatefulTensor):
        t_payload = t.payload
    else:
        raise TypeError(f'colo_model_data_tensor_move_inline does not accept type {type(t)}')

    if not isinstance(target_device, torch.device):
        target_device = torch.device(f'cuda:{target_device}')

    # deal with torch.device('cpu') and torch.device('cpu:0')
    if t_payload.device.type == target_device.type:
        return
    t_payload.data = t_payload.data.to(target_device)


def colo_model_data_move_to_cpu(t: Union[StatefulTensor, torch.Tensor]) -> None:
    """
    Move a model data tensor from gpu to cpu.
    Args:
        t (Union[StatefulTensor, torch.Tensor]): the tensor to be moved
    """
    if issubclass(type(t), StatefulTensor):
        t_payload = t.payload
    elif isinstance(t, torch.Tensor):
        t_payload = t
    else:
        raise TypeError(f'colo_model_data_move_to_cpu does not accept type {type(t)}')

    if t_payload.device.type == 'cpu':
        return

    # TODO() optimize the tensor moving with non-blocking
    t_payload.data = t_payload.data.cpu()


def colo_model_tensor_clone(t: Union[StatefulTensor, torch.Tensor], target_device: torch.device) -> torch.Tensor:
    """
    Clone a model data tensor.
    Args:
        t (Union[StatefulTensor, torch.Tensor]): a model data tensor
        target_device (torch.device): the target device
    Returns:
        torch.Tensor: a cloned torch tensor
    """
    t_payload = t.payload if issubclass(type(t), StatefulTensor) else t
    ret = t_payload.to(target_device)
    return ret
```
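Note that the move helpers mutate their arguments in place: `colo_model_data_tensor_move` empties the source after copying into the target, while the inline variant retargets a single tensor's storage. A small sketch with plain `torch.Tensor` inputs (a `StatefulTensor` behaves the same through its payload; this assumes a CUDA device is available):

```python
import torch
from colossalai.zero.shard_utils.tensor_utils import (colo_model_data_tensor_move,
                                                      colo_model_data_tensor_move_inline,
                                                      colo_tensor_mem_usage)

src = torch.ones(1024)                      # CPU source
tgt = torch.zeros(1024, device='cuda:0')    # CUDA target of matching shape

colo_model_data_tensor_move(src, tgt)       # copies into tgt, then empties src
assert src.numel() == 0 and tgt.mean().item() == 1.0

t = torch.ones(256)
colo_model_data_tensor_move_inline(t, 0)    # an int target means cuda:<index>
cuda_use, cpu_use = colo_tensor_mem_usage(t)
assert cuda_use == 256 * t.element_size() and cpu_use == 0
```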
colossalai/zero/sharded_model/sharded_model_v2.py

```diff
@@ -16,8 +16,9 @@ from colossalai.utils import get_current_device
 from colossalai.utils.memory_tracer.memstats_collector import MemStatsCollector
 from colossalai.utils.memory_tracer.model_data_memtracer import \
     GLOBAL_MODEL_DATA_TRACER
-from colossalai.utils.memory_utils.utils import (colo_cuda_memory_capacity, colo_model_data_move_to_cpu)
+from colossalai.utils.memory_utils.utils import colo_cuda_memory_capacity
 from colossalai.zero.shard_utils import BaseShardStrategy
+from colossalai.zero.shard_utils.tensor_utils import colo_model_data_move_to_cpu
 from colossalai.zero.sharded_model.reduce_scatter import ReduceScatterBucketer
 from colossalai.zero.sharded_param.tensorful_state import TensorState
 from torch.distributed import ProcessGroup
@@ -160,11 +161,13 @@ class ShardedModelV2(nn.Module):
             with open(filename, 'w+') as f:
                 f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
                 f.write(f'cuda max allocated {torch.cuda.max_memory_allocated(get_current_device()) / 1e9} GB\n')
-                f.write('model data\n')
-                f.write(str(self._memstats_collector.model_data_cuda_GB))
+                f.write('CUDA model data (GB)\n')
+                f.write(str(self._memstats_collector.model_data_cuda_list('cuda', 'GB')))
                 f.write('\n')
-                f.write('non model data\n')
-                f.write(str(self._memstats_collector.non_model_data_cuda_GB))
+                f.write('CUDA non model data (GB)\n')
+                f.write(str(self._memstats_collector.non_model_data_cuda_list('cuda', 'GB')))
+                f.write('CPU non model data (GB)\n')
+                f.write(str(self._memstats_collector.non_model_data_cuda_list('cpu', 'GB')))
                 f.write('\n')

     def _pre_forward_operations(self):
@@ -209,7 +212,8 @@ class ShardedModelV2(nn.Module):
             # the way to calculate margin space is based on the assumption that
             # model data is fixed in cuda during training.
             # cuda margin space can be used to store OS.
-            self._cuda_margin_space = colo_cuda_memory_capacity() - max(self._memstats_collector.overall_cuda)
+            self._cuda_margin_space = colo_cuda_memory_capacity() - max(
+                self._memstats_collector.overall_mem_stats('cuda'))
             self._iter_cnter += 1

     @torch.no_grad()
```
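The margin-space computation in the last hunk is plain arithmetic: whatever part of CUDA capacity the measured peak never touched is headroom the sharded optimizer can claim for optimizer states (the "OS" in the comment). A worked sketch with made-up numbers:

```python
# Hypothetical values, in bytes. In the real code, capacity comes from
# colo_cuda_memory_capacity() and overall from overall_mem_stats('cuda').
capacity = 16e9                               # a 16 GB card
overall = [6.2e9, 9.8e9, 7.5e9]               # sampled overall CUDA usage
cuda_margin_space = capacity - max(overall)   # 16e9 - 9.8e9 = 6.2e9 bytes
# ≈ 6.2 GB of headroom available for optimizer states.
```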
colossalai/zero/sharded_optim/sharded_optim_v2.py

```diff
@@ -12,12 +12,13 @@ from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import ColossalaiOptimizer
 from colossalai.utils.memory_tracer.model_data_memtracer import \
     GLOBAL_MODEL_DATA_TRACER
-from colossalai.utils.memory_utils.utils import (colo_model_data_tensor_move_inline, colo_model_tensor_clone,
-                                                 colo_tensor_mem_usage)
+from colossalai.zero.shard_utils.tensor_utils import (colo_model_tensor_clone, colo_tensor_mem_usage)
 from colossalai.zero.sharded_model import ShardedModelV2
 from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32
 from colossalai.zero.sharded_optim._utils import has_inf_or_nan
 from colossalai.zero.sharded_param.tensorful_state import (StatefulTensor, TensorState)
+from colossalai.zero.shard_utils.tensor_utils import colo_model_data_tensor_move_inline
 from torch import Tensor
 from torch.distributed import ProcessGroup
 from torch.nn.parameter import Parameter
```
colossalai/zero/sharded_param/sharded_param.py

```diff
 import torch
 import torch.distributed as dist
 from colossalai.zero.sharded_param import ShardedTensor
 from typing import Optional, Tuple
-from colossalai.utils.memory_utils.utils import colo_tensor_mem_usage
+from colossalai.zero.shard_utils.tensor_utils import colo_tensor_mem_usage
 from .tensorful_state import StatefulTensor, TensorState
```
tests/test_utils/test_commons.py

```diff
-from colossalai.utils.memory_utils.utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
+from colossalai.zero.shard_utils.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
 from colossalai.utils import free_port
 from colossalai.testing import rerun_on_exception
 from colossalai.zero.sharded_param import ShardedTensor
```
tests/test_utils/test_tensor_move.py

```diff
 import pytest

 from colossalai.utils.cuda import get_current_device
-from colossalai.utils.memory_utils.utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
+from colossalai.zero.shard_utils.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline
 from colossalai.utils import free_port
 from colossalai.zero.sharded_param import ShardedTensor
 import colossalai
```
tests/test_zero_data_parallel/test_init_context.py

```diff
@@ -13,7 +13,7 @@ from colossalai.utils import free_port
 from colossalai.utils.cuda import get_current_device
 from colossalai.utils.memory_tracer.model_data_memtracer import \
     colo_model_mem_usage
-from colossalai.utils.memory_utils.memory_monitor import colo_cuda_memory_used
+from colossalai.utils.memory_utils.utils import colo_device_memory_used
 from colossalai.zero.init_ctx import ZeroInitContext
 from colossalai.zero.shard_utils import (BucketTensorShardStrategy, TensorShardStrategy)
 from tests.components_to_test.registry import non_distributed_component_funcs
@@ -51,10 +51,10 @@ def run_model_test(init_device_type, shard_strategy_class):
                 assert param.colo_attr.sharded_data_tensor.payload.device.type == init_device.type, \
                     f'{param.colo_attr.sharded_data_tensor.payload.device.type} vs. {init_device.type}'

-        cuda_mem_use, cpu_mem_use = colo_model_mem_usage(model)
+        cuda_mem_use, _ = colo_model_mem_usage(model)
         model_data_cuda_mem_MB = cuda_mem_use / 1e6
         logger.info(f"Existing ZeRO Context.\nModel Data CUDA Memory {model_data_cuda_mem_MB} MB", ranks=[0])
-        sys_cuda_mem_MB = colo_cuda_memory_used() / 1e6
+        sys_cuda_mem_MB = colo_device_memory_used(get_current_device()) / 1e6
         logger.info(f"System CUDA Memory Usage {sys_cuda_mem_MB} MB", ranks=[0])
         logger.info(f"Model Number Parameter {model_numel_tensor.numpy()[0] / 1e6} M", ranks=[0])
```