Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
d1211148
Unverified
Commit
d1211148
authored
Mar 30, 2022
by
Jiarui Fang
Committed by
GitHub
Mar 30, 2022
Browse files
[utils] update colo tensor moving APIs (#553)
parent
c44d7970
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
24 additions
and
26 deletions
+24
-26
colossalai/utils/memory_utils/utils.py
colossalai/utils/memory_utils/utils.py
+24
-26
No files found.
colossalai/utils/memory_utils/utils.py
View file @
d1211148
import
torch
from
colossalai.utils
import
get_current_device
from
colossalai.zero.sharded_param.
sharded_tensor
import
S
harded
Tensor
from
colossalai.zero.sharded_param.
tensorful_state
import
S
tateful
Tensor
from
typing
import
Tuple
,
Union
_GLOBAL_CUDA_MEM_FRACTION
=
1.0
def
colo_tensor_mem_usage
(
tensor
:
Union
[
torch
.
Tensor
,
S
harded
Tensor
])
->
Tuple
[
int
,
int
]:
if
is
instanc
e
(
tensor
,
S
harded
Tensor
):
def
colo_tensor_mem_usage
(
tensor
:
Union
[
torch
.
Tensor
,
S
tateful
Tensor
])
->
Tuple
[
int
,
int
]:
if
is
subclass
(
typ
e
(
tensor
)
,
S
tateful
Tensor
):
t
=
tensor
.
payload
elif
isinstance
(
tensor
,
torch
.
Tensor
):
t
=
tensor
...
...
@@ -46,8 +46,8 @@ def colo_cuda_memory_capacity() -> float:
return
torch
.
cuda
.
get_device_properties
(
get_current_device
()).
total_memory
*
_GLOBAL_CUDA_MEM_FRACTION
def colo_model_data_tensor_move(
    src_t: Union[StatefulTensor, torch.Tensor],
    tgt_t: Union[StatefulTensor, torch.Tensor],
) -> None:
    """Copy the contents of ``src_t`` into ``tgt_t`` and empty ``src_t``.

    The source and target tensors may be resident on either CPU or GPU; the
    data is transferred with an in-place ``copy_`` on the target's storage.

    NOTE: the source tensor's payload is removed by this function. After the
    call the source holds an empty tensor on its original device with its
    original dtype. No communication-volume bookkeeping is performed here.

    Args:
        src_t (Union[StatefulTensor, torch.Tensor]): source tensor; emptied
            once its contents have been copied.
        tgt_t (Union[StatefulTensor, torch.Tensor]): target tensor; its
            existing storage is overwritten in place.
    """
    # A StatefulTensor keeps its data in `.payload`; a plain tensor in `.data`.
    src_is_stateful = isinstance(src_t, StatefulTensor)
    src_t_payload = src_t.payload if src_is_stateful else src_t.data
    tgt_t_payload = tgt_t.payload if isinstance(tgt_t, StatefulTensor) else tgt_t.data

    tgt_t_payload.copy_(src_t_payload)

    # Remove the payload of src_t: swap in an empty tensor that keeps the
    # source's device and dtype so later code still sees consistent metadata.
    empty_payload = torch.tensor([], device=src_t_payload.device, dtype=src_t_payload.dtype)
    if src_is_stateful:
        src_t.reset_payload(empty_payload)
    else:
        src_t.data = empty_payload
def
colo_model_data_tensor_move_inline
(
t
:
Union
[
ShardedTensor
,
torch
.
Tensor
],
target_device
:
torch
.
device
,
use_tracer
:
bool
=
True
)
->
None
:
def
colo_model_data_tensor_move_inline
(
t
:
Union
[
StatefulTensor
,
torch
.
Tensor
],
target_device
:
Union
[
torch
.
device
,
int
])
->
None
:
"""
move a tensor to the target_device
Args:
t (Union[S
harded
Tensor, torch.Tensor]): the tensor be moved
t (Union[StatefulTensor, torch.Tensor]): the tensor to be moved
"""
if
isinstance
(
t
,
ShardedTensor
):
t_payload
=
t
.
payload
elif
isinstance
(
t
,
torch
.
Tensor
):
if
isinstance
(
t
,
torch
.
Tensor
):
t_payload
=
t
elif
issubclass
(
type
(
t
),
StatefulTensor
):
t_payload
=
t
.
payload
else
:
raise
TypeError
(
'colo_model_data_move_to_cpu dose not accept type {type(t)}'
)
assert
isinstance
(
target_device
,
torch
.
device
)
if
isinstance
(
target_device
,
int
):
target_device
=
torch
.
cuda
(
f
'device"
{
target_device
}
'
)
# deal with torch.device('cpu') and torch.device('cpu:0')
if
t_payload
.
device
.
type
==
target_device
.
type
:
...
...
@@ -103,16 +101,16 @@ def colo_model_data_tensor_move_inline(t: Union[ShardedTensor, torch.Tensor],
t_payload
.
data
=
t_payload
.
data
.
to
(
target_device
)
def
colo_model_data_move_to_cpu
(
t
:
Union
[
S
harded
Tensor
,
torch
.
Tensor
])
->
None
:
def
colo_model_data_move_to_cpu
(
t
:
Union
[
S
tateful
Tensor
,
torch
.
Tensor
])
->
None
:
"""colo_model_data_move_to_cpu
move a model data tensor from gpu to cpu
Args:
t (Union[S
harded
Tensor, torch.Tensor]): _description_
t (Union[StatefulTensor, torch.Tensor]): the model data tensor to move from GPU to CPU
"""
if
is
instanc
e
(
t
,
S
harded
Tensor
):
if
is
subclass
(
typ
e
(
t
)
,
S
tateful
Tensor
):
t_payload
=
t
.
payload
elif
isinstance
(
t
,
torch
.
Tensor
):
t_payload
=
t
...
...
@@ -126,17 +124,17 @@ def colo_model_data_move_to_cpu(t: Union[ShardedTensor, torch.Tensor]) -> None:
t_payload
.
data
=
t_payload
.
data
.
cpu
()
def colo_model_tensor_clone(t: Union[StatefulTensor, torch.Tensor], target_device: torch.device) -> torch.Tensor:
    """Return the data of a model data tensor placed on ``target_device``.

    Args:
        t (Union[StatefulTensor, torch.Tensor]): a model data tensor. For a
            StatefulTensor the underlying ``payload`` tensor is used.
        target_device (torch.device): the target device.

    Returns:
        torch.Tensor: the result of ``Tensor.to(target_device)`` on the
        underlying tensor. NOTE: ``Tensor.to`` returns the very same tensor
        (not a copy) when it already lives on ``target_device``, so the
        result is only a fresh copy when a device change actually happens.
    """
    t_payload = t.payload if isinstance(t, StatefulTensor) else t
    return t_payload.to(target_device)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment