OpenDAS / Megatron-LM

Commit 160ba680, authored Jan 04, 2021 by mohammad
added reload model params for finetuning
Parent: 43529f78
Showing 2 changed files with 30 additions and 4 deletions

    megatron/optimizer/optimizer.py    +29  -2
    tasks/finetune_utils.py             +1  -2
megatron/optimizer/optimizer.py
@@ -76,6 +76,10 @@ class MegatronOptimizer(ABC):
     def step(self):
         pass
 
+    @abstractmethod
+    def reload_model_params(self):
+        pass
+
     @abstractmethod
     def state_dict(self):
         pass

@@ -243,8 +247,7 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
         return found_inf_flag
 
 
-    def _copy_master_params_to_model_params(self):
-        # Only needed for the fp16 params.
+    def _get_model_and_master_params_data_fp16(self):
         model_data = []
         master_data = []
         for model_group, master_group in zip(self.fp16_groups,

@@ -252,6 +255,12 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
             for model_param, master_param in zip(model_group, master_group):
                 model_data.append(model_param.data)
                 master_data.append(master_param.data)
+        return model_data, master_data
+
+
+    def _copy_master_params_to_model_params(self):
+        # Only needed for the fp16 params.
+        model_data, master_data = self._get_model_and_master_params_data_fp16()
         self._dummy_overflow_buf.fill_(0)
         # Scaling with factor `1.0` is equivalent to copy.
         multi_tensor_applier(amp_C.multi_tensor_scale,

@@ -259,6 +268,20 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
                              [master_data, model_data],
                              1.0)
 
+
+    def _copy_model_params_to_master_params(self):
+        # Only needed for the fp16 params.
+        model_data, master_data = self._get_model_and_master_params_data_fp16()
+        self._dummy_overflow_buf.fill_(0)
+        # Scaling with factor `1.0` is equivalent to copy.
+        multi_tensor_applier(amp_C.multi_tensor_scale,
+                             self._dummy_overflow_buf,
+                             [model_data, master_data],
+                             1.0)
+
+
+    def reload_model_params(self):
+        self._copy_model_params_to_master_params()
 
     @torch.no_grad()
     def step(self):

@@ -388,6 +411,10 @@ class FP32Optimizer(MegatronOptimizer):
         return True
 
 
+    def reload_model_params(self):
+        pass
+
+
     def state_dict(self):
         return self.optimizer.state_dict()
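For readers without apex at hand: `multi_tensor_applier(amp_C.multi_tensor_scale, ..., 1.0)` invokes a fused kernel, and scaling by 1.0 into the destination list is just an element-wise copy, as the diff's own comment notes. The sketch below illustrates the same model-to-master synchronization that the new `reload_model_params()` performs, but with plain `Tensor.copy_` calls; the class name `SimpleFp16Sync` and the toy usage at the end are invented for illustration and are not Megatron-LM code.

# Illustration only: what reload_model_params() achieves on the fp16 optimizer,
# expressed with plain PyTorch copies instead of apex's fused
# multi_tensor_scale kernel. SimpleFp16Sync is a made-up stand-in.
import torch


class SimpleFp16Sync:
    def __init__(self, fp16_groups, fp16_groups_master):
        # fp16_groups: lists of half-precision model parameters
        # fp16_groups_master: matching lists of their fp32 master copies
        self.fp16_groups = fp16_groups
        self.fp16_groups_master = fp16_groups_master

    def _copy_model_params_to_master_params(self):
        # Scale-by-1.0 into the destination is just a copy, so a plain
        # element-wise copy_ reproduces the fused kernel's effect here.
        for model_group, master_group in zip(self.fp16_groups,
                                             self.fp16_groups_master):
            for model_param, master_param in zip(model_group, master_group):
                master_param.data.copy_(model_param.data)

    def reload_model_params(self):
        # Called after new weights were loaded into the model so the
        # optimizer's fp32 master copies match them again.
        self._copy_model_params_to_master_params()


# Toy usage: one fp16 parameter and its fp32 master copy.
model_param = torch.nn.Parameter(torch.randn(4).half())
master_param = model_param.detach().float().clone()
sync = SimpleFp16Sync([[model_param]], [[master_param]])
model_param.data.fill_(2.0)   # pretend a checkpoint just overwrote the model
sync.reload_model_params()    # the fp32 master copy now holds 2.0 as well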
tasks/finetune_utils.py
@@ -256,8 +256,7 @@ def finetune(train_valid_datasets_provider, model_provider,
         args.load = original_load
         # This is critical when only model is loaded. We should make sure
         # master parameters are also updated.
-        if args.fp16:
-            optimizer._model_params_to_master_params()
+        optimizer.reload_model_params()
     timers('pretrained checkpoint').stop()
 
     # Print setup timing.
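Because `reload_model_params()` is now declared on the `MegatronOptimizer` base class (a real copy on the fp16 optimizer, a no-op on `FP32Optimizer`), the finetune path no longer needs the `args.fp16` special case. Below is a hedged sketch of the call-site pattern only: the checkpoint-loading lines outside the shown diff context are assumptions based on the surrounding code in `tasks/finetune_utils.py`, and the helper name `_load_pretrained_weights` is invented.

# Sketch of the calling pattern after this commit (assumed scaffolding, not a
# verbatim copy of finetune()). load_checkpoint, model, optimizer and args
# stand for the objects the real function already has in scope.
def _load_pretrained_weights(model, optimizer, args, load_checkpoint):
    # Temporarily point args.load at the pretrained checkpoint and load only
    # the model weights (no optimizer or learning-rate-scheduler state).
    original_load = args.load
    args.load = args.pretrained_checkpoint
    load_checkpoint(model, None, None)
    args.load = original_load

    # Critical when only the model is loaded: make sure the optimizer's
    # master parameters are updated too. One call now covers both fp16 and
    # fp32 runs, since every MegatronOptimizer implements reload_model_params().
    optimizer.reload_model_params()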