"tests/vscode:/vscode.git/clone" did not exist on "68ae3be7f514b0e0411664e2a2cdbc6264fd3a0b"
Unverified Commit 881c0df9 authored by Xiaoyang Sun's avatar Xiaoyang Sun Committed by GitHub
Browse files

Fix error when saving distributed optimizer state when using data parallelism (#24108)

Update checkpoint_reshaping_and_interoperability.py
parent ee88ae59
......@@ -690,7 +690,7 @@ def convert_checkpoint_from_transformers_to_megatron(args):
for j in range(args.target_tensor_model_parallel_size):
for k in range(args.target_data_parallel_size):
if args.target_pipeline_model_parallel_size == 1:
- checkpoint_dir = f"mp_rank_{j:02d}_{i:03d}"
+ checkpoint_dir = f"mp_rank_{j:02d}_{k:03d}"
else:
checkpoint_dir = f"mp_rank_{j:02d}_{i:03d}_{k:03d}"
checkpoint_dir = os.path.join(release_dir, checkpoint_dir)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment