OpenDAS / ColossalAI / Commits

Commit de0d7df3 (unverified), authored Jun 08, 2023 by digger yu, committed by GitHub Jun 08, 2023
Parent: a9d1cadc

[nfc] fix typo colossalai/zero (#3923)
Showing 6 changed files with 11 additions and 11 deletions (+11 -11)
  colossalai/initialize.py                                    +1 -1
  colossalai/zero/gemini/memory_tracer/utils.py               +1 -1
  colossalai/zero/legacy/init_ctx/init_context.py             +1 -1
  colossalai/zero/legacy/sharded_model/sharded_model_v2.py    +3 -3
  colossalai/zero/low_level/_utils.py                         +1 -1
  colossalai/zero/low_level/low_level_optim.py                +4 -4
colossalai/initialize.py  (view file @ de0d7df3)

@@ -238,7 +238,7 @@ def initialize(model: nn.Module,
             loaded into gpc.config.

     Args:
-        model (:class:`torch.nn.Module` or Callbale): Your model instance or a function to build the model.
+        model (:class:`torch.nn.Module` or Callable): Your model instance or a function to build the model.
         optimizer (:class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`):
             Your optimizer instance.
         criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
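For reference, the docstring above belongs to ColossalAI's engine-construction entry point. A minimal, hedged usage sketch follows; the launch call, config path, and unpacked return values reflect the ColossalAI API of this era but are not shown in this hunk:

import colossalai
import torch
import torch.nn as nn

# assumes the process group was started with torchrun and a config file exists
colossalai.launch_from_torch(config='./config.py')

model = nn.Linear(128, 10)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

# initialize() wraps the components into an engine; dataloaders are optional here
engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
)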
colossalai/zero/gemini/memory_tracer/utils.py  (view file @ de0d7df3)

@@ -7,7 +7,7 @@ def colo_model_optimizer_usage(optim) -> Tuple[int, int]:
     """Trace the optimizer memory usage

     Args:
-        optim (ShardedOptimV2): an instance of ShardedOptimver
+        optim (ShardedOptimV2): an instance of ShardedOptimizer

     Returns:
         Tuple[int, int]: cuda/cpu memory usage in Byte
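A hedged usage sketch of the helper documented above; the import path mirrors the file location, and sharded_optim is a placeholder for an already constructed ShardedOptimV2 instance:

from colossalai.zero.gemini.memory_tracer.utils import colo_model_optimizer_usage

# returns (cuda_bytes, cpu_bytes) occupied by the optimizer's states
cuda_bytes, cpu_bytes = colo_model_optimizer_usage(sharded_optim)
print(f'optimizer states: {cuda_bytes / 1e9:.2f} GB CUDA, {cpu_bytes / 1e9:.2f} GB CPU')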
colossalai/zero/legacy/init_ctx/init_context.py  (view file @ de0d7df3)

@@ -46,7 +46,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
     """A context to initialize model.

     1. Convert the model to fp16.
-    2. The paramaters of the module are adapted to type ShardedParameter.
+    2. The parameters of the module are adapted to type ShardedParameter.
     3. Shard the param and grad according to flags.

     Args:
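To illustrate the three steps listed in the docstring, here is a hedged sketch of how the legacy context manager is typically used; the constructor arguments and import paths follow the legacy API but are assumptions, since the Args section is not shown in this hunk:

import torch
from colossalai.utils import get_current_device
from colossalai.zero.legacy.init_ctx import ZeroInitContext
from colossalai.zero.legacy.shard_utils import TensorShardStrategy

shard_strategy = TensorShardStrategy()
# parameters created inside the context are cast to fp16, wrapped as
# ShardedParameter, and sharded across ranks according to the flags
with ZeroInitContext(target_device=get_current_device(),
                     shard_strategy=shard_strategy,
                     shard_param=True):
    model = torch.nn.Linear(1024, 1024)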
colossalai/zero/legacy/sharded_model/sharded_model_v2.py  (view file @ de0d7df3)

@@ -69,7 +69,7 @@ class ShardedModelV2(nn.Module):
             If it's 'auto', they are moving dynamically based on CPU and CUDA memory usage. It will utilize heterogeneous memory space evenly and well.
             Note that 'auto' policy can only work well when no other processes use CUDA during your training.
             Defaults to 'cuda'.
-        gradient_predivide_factor (Optional[float], optional): Gradient is divived by this value before reduce-scatter. Defaults to 1.0.
+        gradient_predivide_factor (Optional[float], optional): Gradient is divided by this value before reduce-scatter. Defaults to 1.0.
         reuse_fp16_shard (bool, optional): Whether to reuse fp16 shard for param and grad.
             Enabling this can reduce GPU memory usage, but you have to make sure you disable it when using gradient accumulation.
             In this mode, grad will be fp16. Make sure your optimizer supports mixed precision (fp32 param and fp16 grad).

@@ -205,7 +205,7 @@ class ShardedModelV2(nn.Module):
             exit(0)
         """
         if self._use_memory_tracer:
-            self.logger.error(f'dump memort tracer collected information to a {filename}', ranks=[0])
+            self.logger.error(f'dump memory tracer collected information to a {filename}', ranks=[0])
             if gpc.get_global_rank() == 0:
                 with open(filename, 'w+') as f:
                     f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')

@@ -385,7 +385,7 @@ class ShardedModelV2(nn.Module):
                 # make parameters point to gradient
                 assert param.colo_attr.saved_grad.is_null(
-                ), 'Gradien accumulation is not supported when reuse_fp16_shard=True'
+                ), 'Gradient accumulation is not supported when reuse_fp16_shard=True'
                 param.colo_attr.grad_payload_reset(grad.data)
                 # release the memory of param
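The first hunk above documents gradient_predivide_factor: gradients are divided by it before reduce-scatter. Below is a generic, hedged illustration of that pre-divide/post-divide pattern; it is not this repository's implementation, and the function name and arguments are made up for the example:

import torch
import torch.distributed as dist

def predivide_reduce_scatter(grad_chunks, gradient_predivide_factor=1.0):
    """grad_chunks: world_size equally sized gradient chunks living on this rank."""
    if gradient_predivide_factor > 1.0:
        for chunk in grad_chunks:
            # pre-divide keeps fp16 sums from overflowing during the reduction
            chunk.div_(gradient_predivide_factor)
    out = torch.empty_like(grad_chunks[dist.get_rank()])
    dist.reduce_scatter(out, grad_chunks)  # each rank receives the sum of its own chunk
    # post-divide completes the averaging: the total divisor equals world_size
    out.div_(dist.get_world_size() / gradient_predivide_factor)
    return out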
colossalai/zero/low_level/_utils.py  (view file @ de0d7df3)

@@ -261,7 +261,7 @@ def sync_param(flat_tensor, tensor_list):
     share the same memory space. This function will update the tensor list so that
     they point to the same value.

-    :param flat_tensor: A flat tensor obtained by calling `torch._utils._unflatten_dense_tensors` on a tensor lsit
+    :param flat_tensor: A flat tensor obtained by calling `torch._utils._unflatten_dense_tensors` on a tensor list
     :param tensor_list: A list of tensors corresponding to the flattened tensor
     :type flat_tensor: torch.Tensor
     :type tensor_list: List[torch.Tensor]
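For context, the docstring above is about re-pointing a list of tensors at a single flattened buffer. A hedged sketch of that flatten/unflatten pattern in standalone PyTorch, not the repository's sync_param itself:

import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

tensor_list = [torch.randn(4), torch.randn(8)]
flat_tensor = _flatten_dense_tensors(tensor_list)            # one contiguous buffer
views = _unflatten_dense_tensors(flat_tensor, tensor_list)   # views into that buffer

# sync_param-style update: make each original tensor share storage with its view,
# so writes through flat_tensor are visible through tensor_list and vice versa
for t, v in zip(tensor_list, views):
    t.data = v.data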
colossalai/zero/low_level/low_level_optim.py  (view file @ de0d7df3)
@@ -207,8 +207,8 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
             for param in self._working_param_groups[group_id]:
                 self._param_store.set_param_reduction_state(param, False)

-        # intialize communication stream for
-        # communication-compuation overlapping
+        # initialize communication stream for
+        # communication-computation overlapping
         if self._overlap_communication:
             self._comm_stream = torch.cuda.Stream()
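The comments fixed in this hunk describe why a dedicated CUDA stream is created: gradient communication can run concurrently with backward computation. A hedged, generic illustration of that pattern, not the optimizer's actual reduction path:

import torch
import torch.distributed as dist

comm_stream = torch.cuda.Stream()

def reduce_grad_overlapped(grad: torch.Tensor):
    # make the communication stream wait for the kernel that produced the gradient
    comm_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(comm_stream):
        dist.all_reduce(grad)            # overlaps with ongoing backward kernels
    # tell the caching allocator the tensor is still in use on comm_stream
    grad.record_stream(comm_stream)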
@@ -269,7 +269,7 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
         params_per_rank = [[] for _ in range(self._world_size)]
         numel_per_rank = [0 for _ in range(self._world_size)]

-        # partititon the parameters in a greedy fashion
+        # partition the parameters in a greedy fashion
         sorted_params = sorted(param_list, key=lambda x: x.numel(), reverse=True)
         for param in sorted_params:
             # allocate this parameter to the rank with
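The hunk above touches the greedy parameter partitioner: parameters are sorted by element count and each one goes to the rank that currently holds the fewest elements. A hedged, self-contained sketch of that strategy; the function and variable names are illustrative, not the class's private API:

import torch

def greedy_partition(param_list, world_size):
    params_per_rank = [[] for _ in range(world_size)]
    numel_per_rank = [0 for _ in range(world_size)]

    # largest parameters first, each assigned to the currently lightest rank
    for param in sorted(param_list, key=lambda x: x.numel(), reverse=True):
        rank = numel_per_rank.index(min(numel_per_rank))
        params_per_rank[rank].append(param)
        numel_per_rank[rank] += param.numel()
    return params_per_rank, numel_per_rank

params = [torch.nn.Parameter(torch.empty(n)) for n in (7, 3, 5, 9, 2)]
per_rank, numels = greedy_partition(params, world_size=2)
print(numels)  # roughly balanced, e.g. [14, 12]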
@@ -297,7 +297,7 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
             param_group = self._working_param_groups[group_id]
             for param in param_group:
                 if param.requires_grad:
-                    # determines the reduction destionation rank
+                    # determines the reduction destination rank
                     # this is only valid for stage 2
                     # dst_rank = None means using all-reduce
                     # else using reduce
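The comments in the last hunk encode a small convention: a dst_rank of None means the gradient is all-reduced, otherwise it is reduced to the rank that owns the shard (ZeRO stage 2). A hedged illustration of that convention with plain torch.distributed calls:

import torch
import torch.distributed as dist

def reduce_grad(grad: torch.Tensor, dst_rank=None):
    if dst_rank is None:
        dist.all_reduce(grad)             # every rank ends up with the reduced gradient
    else:
        dist.reduce(grad, dst=dst_rank)   # only the rank that owns this shard needs it
    return grad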