OpenDAS / ColossalAI · Commits

Commit 62c38e33 (Unverified)
Authored Jan 03, 2023 by HELSON, committed by GitHub on Jan 03, 2023
[zero] polish low level zero optimizer (#2275)
parent ac863a01
Showing 1 changed file with 9 additions and 24 deletions

colossalai/zero/sharded_optim/low_level_optim.py  +9  -24
@@ -68,9 +68,8 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
         # 2. contiguous gradients
         # 3. cpu offload
         # 4. support when some parameters requires_grad = False
-        self._optimizer = optimizer
-        self._dtype = self._optimizer.param_groups[0]['params'][0].dtype
+        super(LowLevelZeroOptimizer, self).__init__(optim=optimizer)
+        self._dtype = self.optim.param_groups[0]['params'][0].dtype
         self._logger = get_dist_logger()
         self._verbose = verbose
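The rewritten __init__ leans on the base class instead of keeping a private handle: super(LowLevelZeroOptimizer, self).__init__(optim=optimizer) hands the wrapped optimizer to ColossalaiOptimizer, which exposes it as self.optim, so the duplicate self._optimizer attribute can be dropped. A minimal sketch of that wrapper pattern, assuming only that the base class stores the inner optimizer as self.optim (the real ColossalaiOptimizer also delegates calls such as step and zero_grad); WrapperSketch and the toy model are illustrative, not repository code:

import torch
from torch.optim import Optimizer

class WrapperSketch:
    # Stand-in for ColossalaiOptimizer: keep a single reference to the inner optimizer.
    def __init__(self, optim: Optimizer):
        self.optim = optim

class LowLevelSketch(WrapperSketch):
    # Mirrors the changed lines: let the base class hold the optimizer, then read
    # the working dtype through self.optim instead of a private self._optimizer.
    def __init__(self, optimizer: Optimizer):
        super(LowLevelSketch, self).__init__(optim=optimizer)
        self._dtype = self.optim.param_groups[0]['params'][0].dtype

model = torch.nn.Linear(4, 4).half()
sketch = LowLevelSketch(torch.optim.SGD(model.parameters(), lr=0.1))
print(sketch._dtype)    # torch.float16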
@@ -116,7 +115,7 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
         self._clip_grad_norm = clip_grad_norm

         if forced_dtype:
-            for group in self._optimizer.param_groups:
+            for group in self.optim.param_groups:
                 group_params = group['params']
                 for param in group_params:
                     param.data = param.data.to(forced_dtype)
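Apart from the attribute rename, the forced_dtype branch is unchanged: it casts every tracked parameter to one dtype in place so the optimizer always sees a uniform precision. A standalone sketch of that casting pattern against a plain torch.optim.SGD; the toy model and the choice of torch.float16 are assumptions for illustration:

import torch

model = torch.nn.Linear(4, 4)                        # fp32 parameters
optim = torch.optim.SGD(model.parameters(), lr=0.1)
forced_dtype = torch.float16                         # assumed: caller forces fp16

# same loop shape as the diff: walk every param group and rebind param.data
for group in optim.param_groups:
    for param in group['params']:
        param.data = param.data.to(forced_dtype)

assert all(p.dtype == torch.float16 for p in model.parameters())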
@@ -134,7 +133,7 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
         # iterate over the param group in the optimizer
         # partition these param groups for data parallel training
         # and add buffers to parameter store for future access
-        for group_id, param_group in enumerate(self._optimizer.param_groups):
+        for group_id, param_group in enumerate(self.optim.param_groups):
             group_params = param_group['params']

             # add the fp16 params to fp16_param_groups for bookkeeping
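This loop feeds each parameter group into the rank partitioning logic, which is not shown in this hunk. As a rough illustration of what "partition these param groups for data parallel training" usually involves, here is a greedy split that balances element counts across data-parallel ranks; partition_by_numel is a hypothetical helper sketched for this page, not the repository's own partitioning code:

import torch

def partition_by_numel(params, world_size):
    # Hypothetical greedy partition: always give the next tensor to the rank
    # that currently holds the fewest elements, so ranks stay roughly balanced.
    params_per_rank = [[] for _ in range(world_size)]
    numel_per_rank = [0] * world_size
    for p in sorted(params, key=lambda t: t.numel(), reverse=True):
        rank = numel_per_rank.index(min(numel_per_rank))
        params_per_rank[rank].append(p)
        numel_per_rank[rank] += p.numel()
    return params_per_rank

params = [torch.nn.Parameter(torch.randn(n)) for n in (100, 60, 40, 30)]
print([sum(p.numel() for p in chunk) for chunk in partition_by_numel(params, world_size=2)])
# e.g. [130, 100]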
@@ -198,7 +197,9 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
         if self._overlap_communication or self._partition_grads:
             self._attach_reduction_hook()
-        self._initialize_optimizer_states()
+
+    @property
+    def dtype(self):
+        return self._dtype

     @property
     def loss_scale(self):
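The added dtype property gives callers the working precision as a read-only attribute instead of making them reach into self.optim.param_groups. A small sketch of the same property shape on a toy class, showing that reads work while assignment fails because no setter is defined; PrecisionAware is illustrative only:

import torch

class PrecisionAware:
    def __init__(self, dtype: torch.dtype):
        self._dtype = dtype

    @property
    def dtype(self) -> torch.dtype:
        # read-only view over the cached dtype, same shape as the property in the diff
        return self._dtype

opt_like = PrecisionAware(torch.float16)
print(opt_like.dtype)                 # torch.float16
try:
    opt_like.dtype = torch.float32    # no setter defined, so this raises
except AttributeError as err:
    print(err)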
@@ -227,25 +228,9 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
                               parallel_mode=self._dp_parallel_mode)
         return params_per_rank

-    def _initialize_optimizer_states(self):
-        # create a dummy zero tensor which has the same shape as that of the param
-        # set this dummpy zero tensor as grad
-        for group_id in range(len(self._fp32_flat_param_groups_of_current_rank)):
-            fp32_partition_param = self._fp32_flat_param_groups_of_current_rank[group_id]
-            fp32_partition_grad = torch.zeros_like(fp32_partition_param)
-            fp32_partition_param.grad = fp32_partition_grad
-
-        # we do not need log information for optimizer, so comment them
-        # update the parameter with zero gradients for initialization of optimizer states
-        # self._optimizer.step()
-
-        # remove the grad of the paramter to save memory
-        # for group_id, fp32_flat_tensor in self._fp32_flat_param_groups_of_current_rank.items():
-        #     fp32_flat_tensor.grad = None
-
     def _sanity_checks(self):
         assert torch.cuda.is_available(), 'CUDA is required'
-        for param_group in self._optimizer.param_groups:
+        for param_group in self.optim.param_groups:
             group_params = param_group['params']
             for param in group_params:
                 assert param.dtype == self._dtype, \
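For context, the deleted _initialize_optimizer_states attached dummy zero gradients to the fp32 flat partitions (its self._optimizer.step() call was already commented out), and this commit drops the eager warm-up entirely, leaving state creation to the first real step(). A standalone sketch of that zero-gradient warm-up pattern on a plain torch.optim.Adam, shown only to illustrate what the removed helper was aiming at; the toy parameter is not from the repository:

import torch

param = torch.nn.Parameter(torch.randn(8, 8))
opt = torch.optim.Adam([param], lr=1e-3)

# eager warm-up pattern: a zero gradient plus one step allocates the state buffers
# (exp_avg, exp_avg_sq) while leaving the weights untouched, since the update is zero
param.grad = torch.zeros_like(param)
opt.step()
param.grad = None                       # drop the dummy gradient to save memory

print(sorted(opt.state[param].keys()))  # ['exp_avg', 'exp_avg_sq', 'step']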
@@ -484,7 +469,7 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
         self._unscale_and_clip_grads(single_grad_partition_groups, global_norm)

         # update the parameters
-        self._optimizer.step()
+        self.optim.step()

         # release the fp32 grad
         release_param_grad(self._fp32_flat_param_groups_of_current_rank.values())
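The final hunk sits in the step path: gradients are unscaled and clipped against the global norm, the inner optimizer (now reached through self.optim) steps on the fp32 flat partitions, and the fp32 gradients are released afterwards. A compressed sketch of that unscale, clip, step, release sequence in plain PyTorch; the loss scale value, the toy model, and the manual clipping stand in for the repository's _unscale_and_clip_grads and are assumptions:

import torch

model = torch.nn.Linear(16, 16)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
loss_scale, max_norm = 1024.0, 1.0           # assumed scaling / clipping settings

loss = model(torch.randn(4, 16)).sum() * loss_scale
loss.backward()

# 1) unscale: divide the scaled gradients back down
for p in model.parameters():
    p.grad.div_(loss_scale)
# 2) clip against the global norm, 3) inner optimizer step, 4) release gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
opt.step()
opt.zero_grad(set_to_none=True)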