Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
977efdfb
Commit
977efdfb
authored
Jun 08, 2022
by
Lawrence McAfee
Browse files
added backwards compatibility to checkpointing.py.
parent
ef63acd2
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
12 additions
and
8 deletions
+12
-8
megatron/checkpointing.py
megatron/checkpointing.py
+12
-8
No files found.
megatron/checkpointing.py
View file @
977efdfb
...
@@ -91,19 +91,23 @@ def get_checkpoint_names(checkpoints_path, iteration, use_distributed_optimizer,
...
@@ -91,19 +91,23 @@ def get_checkpoint_names(checkpoints_path, iteration, use_distributed_optimizer,
# Use both the tensor and pipeline MP rank. If using the distributed
# Use both the tensor and pipeline MP rank. If using the distributed
# optimizer, then the optimizer's path must additionally include the
# optimizer, then the optimizer's path must additionally include the
# data parallel rank.
# data parallel rank.
common_path
=
os
.
path
.
join
(
if
mpu
.
get_pipeline_model_parallel_world_size
()
==
1
:
checkpoints_path
,
common_path
=
os
.
path
.
join
(
checkpoints_path
,
directory
,
directory
,
'mp_rank_{:02d}'
.
format
(
"mp_rank_%02d_%03d"
%
(
mpu
.
get_tensor_model_parallel_rank
()))
else
:
common_path
=
os
.
path
.
join
(
checkpoints_path
,
directory
,
'mp_rank_{:02d}_{:03d}'
.
format
(
mpu
.
get_tensor_model_parallel_rank
(),
mpu
.
get_tensor_model_parallel_rank
(),
mpu
.
get_pipeline_model_parallel_rank
()))
mpu
.
get_pipeline_model_parallel_rank
()))
model_name
=
os
.
path
.
join
(
common_path
,
"model_rng.pt"
)
if
use_distributed_optimizer
:
if
use_distributed_optimizer
:
model_name
=
os
.
path
.
join
(
common_path
,
"model_rng.pt"
)
optim_name
=
os
.
path
.
join
(
optim_name
=
os
.
path
.
join
(
common_path
+
"_%03d"
%
mpu
.
get_data_parallel_rank
(),
common_path
+
"_%03d"
%
mpu
.
get_data_parallel_rank
(),
"optim.pt"
)
"optim.pt"
)
else
:
else
:
optim_name
=
os
.
path
.
join
(
common_path
,
"optim.pt"
)
model_name
=
optim_name
=
os
.
path
.
join
(
common_path
,
"
model_
optim
_rng
.pt"
)
return
model_name
,
optim_name
return
model_name
,
optim_name
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment