Unverified Commit 881c0df9 authored by Xiaoyang Sun, committed by GitHub

Fix bug in saving distributed optimizer state when using data parallel (#24108)

Update checkpoint_reshaping_and_interoperability.py
parent ee88ae59
@@ -690,7 +690,7 @@ def convert_checkpoint_from_transformers_to_megatron(args):
         for j in range(args.target_tensor_model_parallel_size):
             for k in range(args.target_data_parallel_size):
                 if args.target_pipeline_model_parallel_size == 1:
-                    checkpoint_dir = f"mp_rank_{j:02d}_{i:03d}"
+                    checkpoint_dir = f"mp_rank_{j:02d}_{k:03d}"
                 else:
                     checkpoint_dir = f"mp_rank_{j:02d}_{i:03d}_{k:03d}"
                 checkpoint_dir = os.path.join(release_dir, checkpoint_dir)
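
Context for the one-character fix: when target_pipeline_model_parallel_size == 1, the directory name was built from the pipeline-parallel rank i, which is always 0 in that case, so every data-parallel rank resolved to the same mp_rank_XX_000 directory and the distributed optimizer shards overwrote one another. Substituting the data-parallel rank k gives each rank its own directory. Below is a minimal sketch (not the repository's code) of the corrected naming logic; the helper checkpoint_dir_name and the release_dir value are illustrative, with i/j/k named as in the diff.

import os

def checkpoint_dir_name(i, j, k, pp_size):
    # i = pipeline-parallel rank, j = tensor-parallel rank, k = data-parallel rank
    if pp_size == 1:
        # Before the fix this used `i`, which is always 0 when pp_size == 1,
        # so all data-parallel ranks collided on "mp_rank_XX_000".
        return f"mp_rank_{j:02d}_{k:03d}"
    return f"mp_rank_{j:02d}_{i:03d}_{k:03d}"

release_dir = "release"  # placeholder path for illustration
for k in range(2):
    # Each data-parallel rank now gets a distinct directory:
    # release/mp_rank_00_000, release/mp_rank_00_001
    print(os.path.join(release_dir, checkpoint_dir_name(i=0, j=0, k=k, pp_size=1)))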