Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
0035b7be
"git@developer.sourcefind.cn:OpenDAS/colossalai.git" did not exist on "a5883aa7909070480d218b62ff8f3e987e7eebd8"
Unverified
Commit
0035b7be
authored
Mar 24, 2022
by
Jiarui Fang
Committed by
GitHub
Mar 24, 2022
Browse files
[memory] add model data tensor moving api (#503)
parent
65ad47c3
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
137 additions
and
50 deletions
+137
-50
colossalai/utils/__init__.py
colossalai/utils/__init__.py
+1
-1
colossalai/utils/commons/memory.py
colossalai/utils/commons/memory.py
+0
-9
colossalai/utils/memory_tracer/allocator.py
colossalai/utils/memory_tracer/allocator.py
+0
-19
colossalai/utils/memory_tracer/commons.py
colossalai/utils/memory_tracer/commons.py
+0
-11
colossalai/utils/memory_tracer/model_data_memtracer.py
colossalai/utils/memory_tracer/model_data_memtracer.py
+12
-3
colossalai/utils/memory_utils/__init__.py
colossalai/utils/memory_utils/__init__.py
+0
-0
colossalai/utils/memory_utils/memory_monitor.py
colossalai/utils/memory_utils/memory_monitor.py
+1
-1
colossalai/utils/memory_utils/utils.py
colossalai/utils/memory_utils/utils.py
+59
-0
colossalai/zero/sharded_model/sharded_model_v2.py
colossalai/zero/sharded_model/sharded_model_v2.py
+3
-4
colossalai/zero/sharded_optim/sharded_optim_v2.py
colossalai/zero/sharded_optim/sharded_optim_v2.py
+12
-2
tests/test_utils/test_commons.py
tests/test_utils/test_commons.py
+49
-0
No files found.
colossalai/utils/__init__.py
View file @
0035b7be
...
@@ -8,7 +8,7 @@ from .common import (clip_grad_norm_fp32, conditional_context, copy_tensor_paral
...
@@ -8,7 +8,7 @@ from .common import (clip_grad_norm_fp32, conditional_context, copy_tensor_paral
sync_model_param
)
sync_model_param
)
from
.data_sampler
import
DataParallelSampler
,
get_dataloader
from
.data_sampler
import
DataParallelSampler
,
get_dataloader
from
.gradient_accumulation
import
accumulate_gradient
from
.gradient_accumulation
import
accumulate_gradient
from
.memory
import
report_memory_usage
from
.memory
_utils.memory_monitor
import
report_memory_usage
from
.timer
import
MultiTimer
,
Timer
from
.timer
import
MultiTimer
,
Timer
from
.tensor_detector
import
TensorDetector
from
.tensor_detector
import
TensorDetector
...
...
colossalai/utils/commons/memory.py
deleted
100644 → 0
View file @
65ad47c3
import
torch
from
colossalai.utils
import
get_current_device
def
col_cuda_memory_capacity
():
"""
Get cuda memory capacity of the current cuda.
"""
return
torch
.
cuda
.
get_device_properties
(
get_current_device
()).
total_memory
colossalai/utils/memory_tracer/allocator.py
deleted
100644 → 0
View file @
65ad47c3
import
torch
from
colossalai.utils.memory_tracer.model_data_memtracer
import
GLOBAL_MODEL_DATA_TRACER
def
col_move_to_cpu
(
t
:
torch
.
Tensor
):
assert
isinstance
(
t
,
torch
.
Tensor
)
if
t
.
device
.
type
==
'cpu'
:
return
GLOBAL_MODEL_DATA_TRACER
.
delete_tensor
(
t
)
t
.
data
=
t
.
data
.
cpu
()
def
col_modeldata_allocate
(
device
:
torch
.
device
)
->
torch
.
Tensor
:
pass
def
col_modeldata_release
(
t
:
torch
.
Tensor
):
pass
colossalai/utils/memory_tracer/commons.py
deleted
100644 → 0
View file @
65ad47c3
from
colossalai.zero.sharded_param
import
ShardedTensor
from
typing
import
Union
import
torch
def
col_tensor_mem_usage
(
t
:
Union
[
torch
.
Tensor
,
ShardedTensor
])
->
int
:
if
isinstance
(
t
,
ShardedTensor
):
target
=
t
.
payload
else
:
target
=
t
return
target
.
numel
()
*
target
.
element_size
()
colossalai/utils/memory_tracer/model_data_memtracer.py
View file @
0035b7be
from
colossalai.context.singleton_meta
import
SingletonMeta
from
colossalai.context.singleton_meta
import
SingletonMeta
from
colossalai.
utils.memory_tracer.commons
import
col_tensor_mem_usage
from
colossalai.
zero.sharded_param.sharded_tensor
import
ShardedTensor
import
torch
import
torch
from
typing
import
Union
def
_col_tensor_mem_usage
(
t
:
Union
[
torch
.
Tensor
,
ShardedTensor
])
->
int
:
if
isinstance
(
t
,
ShardedTensor
):
target
=
t
.
payload
else
:
target
=
t
return
target
.
numel
()
*
target
.
element_size
()
class
ModelDataTracer
(
metaclass
=
SingletonMeta
):
class
ModelDataTracer
(
metaclass
=
SingletonMeta
):
...
@@ -16,12 +25,12 @@ class ModelDataTracer(metaclass=SingletonMeta):
...
@@ -16,12 +25,12 @@ class ModelDataTracer(metaclass=SingletonMeta):
def
add_tensor
(
self
,
t
:
torch
.
Tensor
):
def
add_tensor
(
self
,
t
:
torch
.
Tensor
):
assert
isinstance
(
t
,
torch
.
Tensor
),
f
"ModelDataTracer add_tensor() should accept a torch.Tensor"
assert
isinstance
(
t
,
torch
.
Tensor
),
f
"ModelDataTracer add_tensor() should accept a torch.Tensor"
mem_use
=
col_tensor_mem_usage
(
t
)
mem_use
=
_
col_tensor_mem_usage
(
t
)
self
.
_cuda_usage
+=
mem_use
self
.
_cuda_usage
+=
mem_use
def
delete_tensor
(
self
,
t
:
torch
.
Tensor
):
def
delete_tensor
(
self
,
t
:
torch
.
Tensor
):
assert
isinstance
(
t
,
torch
.
Tensor
),
f
"ModelDataTracer delete_tensor() should accept a torch.Tensor"
assert
isinstance
(
t
,
torch
.
Tensor
),
f
"ModelDataTracer delete_tensor() should accept a torch.Tensor"
mem_use
=
col_tensor_mem_usage
(
t
)
mem_use
=
_
col_tensor_mem_usage
(
t
)
self
.
_cuda_usage
-=
mem_use
self
.
_cuda_usage
-=
mem_use
@
property
@
property
...
...
colossalai/utils/memory_utils/__init__.py
0 → 100644
View file @
0035b7be
colossalai/utils/memory.py
→
colossalai/utils/memory
_utils/memory_monitor
.py
View file @
0035b7be
...
@@ -63,5 +63,5 @@ def report_memory_usage(message, logger=None, report_cpu=False):
...
@@ -63,5 +63,5 @@ def report_memory_usage(message, logger=None, report_cpu=False):
logger
.
info
(
full_log
)
logger
.
info
(
full_log
)
# get the peak memory to report correct data, so reset the counter for the next call
# get the peak memory to report correct data, so reset the counter for the next call
if
hasattr
(
torch
.
cuda
,
"reset_peak_memory_stats"
):
# pytorch 1.4+
if
hasattr
(
torch
.
cuda
,
"reset_peak_memory_stats"
):
# pytorch 1.4+
torch
.
cuda
.
reset_peak_memory_stats
()
torch
.
cuda
.
reset_peak_memory_stats
()
colossalai/utils/memory_utils/utils.py
0 → 100644
View file @
0035b7be
import
torch
from
colossalai.utils
import
get_current_device
from
colossalai.zero.sharded_param.sharded_tensor
import
ShardedTensor
from
colossalai.utils.memory_tracer.model_data_memtracer
import
GLOBAL_MODEL_DATA_TRACER
from
typing
import
Union
def
colo_cuda_memory_capacity
():
"""
Get cuda memory capacity of the current cuda.
"""
return
torch
.
cuda
.
get_device_properties
(
get_current_device
()).
total_memory
def
colo_model_data_tensor_move
(
src_t
:
Union
[
ShardedTensor
,
torch
.
Tensor
],
tgt_t
:
Union
[
ShardedTensor
,
torch
.
Tensor
])
->
None
:
"""
A colossal API for model data tensor move.
The src and target tensors could be resident on both CPU and GPU.
NOTE() The source tensor payload will be removed after this function.
The function will record the communication volume between CPU and GPU.
Args:
t_src (Union[ShardedTensor, torch.Tensor]): source tensor
tgt_t (Union[ShardedTensor, torch.Tensor]): target tensor
"""
if
isinstance
(
src_t
,
ShardedTensor
):
src_t_payload
=
src_t
.
payload
else
:
src_t_payload
=
src_t
.
data
src_dev
=
src_t_payload
.
device
if
isinstance
(
tgt_t
,
ShardedTensor
):
tgt_t_payload
=
tgt_t
.
payload
else
:
tgt_t_payload
=
tgt_t
.
data
tgt_dev
=
tgt_t_payload
.
device
if
src_dev
.
type
==
'cuda'
and
tgt_dev
.
type
==
'cpu'
:
GLOBAL_MODEL_DATA_TRACER
.
delete_tensor
(
src_t_payload
)
elif
src_dev
.
type
==
'cpu'
and
tgt_dev
.
type
==
'cuda'
:
GLOBAL_MODEL_DATA_TRACER
.
add_tensor
(
tgt_t_payload
)
tgt_t_payload
.
copy_
(
src_t_payload
)
# remove payload of src_t
if
isinstance
(
src_t
,
ShardedTensor
):
src_t
.
reset_payload
(
torch
.
tensor
([],
device
=
src_dev
,
dtype
=
src_t_payload
.
dtype
))
else
:
src_t
.
data
=
torch
.
tensor
([],
device
=
src_dev
,
dtype
=
src_t_payload
.
dtype
)
def
colo_model_data_move_to_cpu
(
t
:
torch
.
Tensor
):
assert
isinstance
(
t
,
torch
.
Tensor
)
if
t
.
device
.
type
==
'cpu'
:
return
GLOBAL_MODEL_DATA_TRACER
.
delete_tensor
(
t
)
t
.
data
=
t
.
data
.
cpu
()
colossalai/zero/sharded_model/sharded_model_v2.py
View file @
0035b7be
...
@@ -11,8 +11,7 @@ from colossalai.engine.ophooks import register_ophooks_recursively
...
@@ -11,8 +11,7 @@ from colossalai.engine.ophooks import register_ophooks_recursively
from
colossalai.engine.ophooks.zero_hook
import
ZeroHook
from
colossalai.engine.ophooks.zero_hook
import
ZeroHook
from
colossalai.engine.paramhooks
import
BaseParamHookMgr
from
colossalai.engine.paramhooks
import
BaseParamHookMgr
from
colossalai.logging
import
get_dist_logger
from
colossalai.logging
import
get_dist_logger
from
colossalai.utils.commons.memory
import
col_cuda_memory_capacity
from
colossalai.utils.memory_utils.utils
import
colo_model_data_move_to_cpu
,
colo_cuda_memory_capacity
from
colossalai.utils.memory_tracer.allocator
import
col_move_to_cpu
from
colossalai.utils.memory_tracer.memstats_collector
import
MemStatsCollector
from
colossalai.utils.memory_tracer.memstats_collector
import
MemStatsCollector
from
colossalai.zero.shard_utils
import
BaseShardStrategy
from
colossalai.zero.shard_utils
import
BaseShardStrategy
from
colossalai.zero.sharded_model.reduce_scatter
import
ReduceScatterBucketer
from
colossalai.zero.sharded_model.reduce_scatter
import
ReduceScatterBucketer
...
@@ -152,7 +151,7 @@ class ShardedModelV2(nn.Module):
...
@@ -152,7 +151,7 @@ class ShardedModelV2(nn.Module):
# the way to calculate margin space is based on the assumption that
# the way to calculate margin space is based on the assumption that
# model data is fixed in cuda during training.
# model data is fixed in cuda during training.
# cuda margin space can be used to store OS.
# cuda margin space can be used to store OS.
self
.
_cuda_margin_space
=
col_cuda_memory_capacity
()
-
max
(
self
.
_memstats_collector
.
_overall_cuda
)
self
.
_cuda_margin_space
=
col
o
_cuda_memory_capacity
()
-
max
(
self
.
_memstats_collector
.
_overall_cuda
)
self
.
_iter_cnter
+=
1
self
.
_iter_cnter
+=
1
...
@@ -201,7 +200,7 @@ class ShardedModelV2(nn.Module):
...
@@ -201,7 +200,7 @@ class ShardedModelV2(nn.Module):
else
:
else
:
grad
=
cast_tensor_to_fp32
(
p
.
col_attr
.
fp16_grad
)
grad
=
cast_tensor_to_fp32
(
p
.
col_attr
.
fp16_grad
)
if
p
.
col_attr
.
offload_grad
:
if
p
.
col_attr
.
offload_grad
:
col_move_to_cpu
(
grad
)
col
o_model_data
_move_to_cpu
(
grad
)
if
p
.
col_attr
.
fp32_grad
is
not
None
:
if
p
.
col_attr
.
fp32_grad
is
not
None
:
assert
not
self
.
reuse_fp16_shard
,
'Gradien accumulation is not supported when reuse_fp16_shard=True'
assert
not
self
.
reuse_fp16_shard
,
'Gradien accumulation is not supported when reuse_fp16_shard=True'
p
.
col_attr
.
fp32_grad
.
add_
(
grad
.
view_as
(
p
.
col_attr
.
fp32_grad
))
p
.
col_attr
.
fp32_grad
.
add_
(
grad
.
view_as
(
p
.
col_attr
.
fp32_grad
))
...
...
colossalai/zero/sharded_optim/sharded_optim_v2.py
View file @
0035b7be
...
@@ -25,8 +25,18 @@ class OptimState(Enum):
...
@@ -25,8 +25,18 @@ class OptimState(Enum):
class
ShardedOptimizerV2
(
ColossalaiOptimizer
):
class
ShardedOptimizerV2
(
ColossalaiOptimizer
):
"""A wrapper for optimizer. `ShardedOptimizerV2` and `ShardedModelV2` implement Zero Redundancy Optimizer (ZeRO) stage 3.
"""A wrapper for optimizer. `ShardedOptimizerV2` and `ShardedModelV2` implement Zero Redundancy Optimizer (ZeRO).
You must use `ShardedOptimizerV2` with `ShardedModelV2`.
By default the ZeRO optimizer stage 3 offload Optimizer States on CPU.
We apply the Device-aware Operator Placement technique for OS placement from the following paper.
PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management
https://arxiv.org/abs/2108.05818
GPU margin space is the remaining space after removing peak non-model data from the overall GPU memory,
which is detected by a runtime memory tracer.
We place as many OS chunks in the margin space as possible.
The size of margin space can be controlled by `gpu_margin_mem_ratio`
If it is set as 0.0, it is the same as classical ZeRO optimizer.
NOTE() You must use `ShardedOptimizerV2` with `ShardedModelV2`.
Args:
Args:
sharded_model (ShardedModelV2): A sharded model initialized by class ShardedModelV2. The optimizer will use the
sharded_model (ShardedModelV2): A sharded model initialized by class ShardedModelV2. The optimizer will use the
...
...
tests/test_utils/test_commons.py
0 → 100644
View file @
0035b7be
from
colossalai.utils.memory_tracer.model_data_memtracer
import
GLOBAL_MODEL_DATA_TRACER
from
colossalai.utils.memory_utils.utils
import
colo_model_data_tensor_move
from
colossalai.utils
import
free_port
from
colossalai.zero.sharded_param
import
ShardedTensor
import
colossalai
import
torch
from
functools
import
partial
import
torch.multiprocessing
as
mp
import
pytest
def
run_tensor_move
(
rank
):
colossalai
.
launch
(
config
=
{},
rank
=
0
,
world_size
=
1
,
host
=
'localhost'
,
port
=
free_port
(),
backend
=
'nccl'
)
assert
(
GLOBAL_MODEL_DATA_TRACER
.
cuda_usage
==
0
)
src_t
=
torch
.
ones
(
2
,
3
).
cuda
()
GLOBAL_MODEL_DATA_TRACER
.
add_tensor
(
src_t
)
assert
(
GLOBAL_MODEL_DATA_TRACER
.
cuda_usage
==
24
)
tgt_t
=
torch
.
zeros
(
2
,
3
)
colo_model_data_tensor_move
(
src_t
,
tgt_t
)
assert
(
GLOBAL_MODEL_DATA_TRACER
.
cuda_usage
==
0
)
assert
(
torch
.
sum
(
tgt_t
)
==
6.0
),
f
"
{
torch
.
sum
(
tgt_t
.
payload
)
}
vs. 6.0"
src_t
=
torch
.
ones
(
2
,
3
)
tgt_t
=
torch
.
zeros
(
2
,
3
).
cuda
().
half
()
colo_model_data_tensor_move
(
src_t
,
tgt_t
)
assert
(
GLOBAL_MODEL_DATA_TRACER
.
cuda_usage
==
12
),
f
"cuda usage
{
GLOBAL_MODEL_DATA_TRACER
.
cuda_usage
}
"
# the src_t has been removed
assert
(
src_t
.
numel
()
==
0
)
assert
(
torch
.
sum
(
tgt_t
)
==
6.0
),
f
"
{
torch
.
sum
(
tgt_t
.
payload
)
}
vs. 6.0"
src_t
=
ShardedTensor
(
torch
.
ones
(
2
,
3
))
tgt_t
=
ShardedTensor
(
torch
.
zeros
(
2
,
3
).
cuda
().
half
())
colo_model_data_tensor_move
(
src_t
,
tgt_t
)
assert
(
GLOBAL_MODEL_DATA_TRACER
.
cuda_usage
==
24
),
f
"cuda usage
{
GLOBAL_MODEL_DATA_TRACER
.
cuda_usage
}
"
assert
(
torch
.
sum
(
tgt_t
.
payload
)
==
6.0
),
f
"
{
torch
.
sum
(
tgt_t
.
payload
)
}
vs. 6.0"
def
test_tensor_move
():
mp
.
spawn
(
run_tensor_move
,
nprocs
=
1
)
if
__name__
==
'__main__'
:
test_tensor_move
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment