Commit 6b68bb8a authored by Raul Puri

Merge branch 'load_checkpoint_fix' into 'master'

Fix deserialization issue when loading old checkpoints

See merge request ADLR/megatron-lm!5
parents 9993ea25 ee38e7f9
@@ -338,7 +338,19 @@ def load_checkpoint(model, optimizer, lr_scheduler, args):
                 torch.distributed.get_rank(), checkpoint_name))
 
     # Load the checkpoint.
-    sd = torch.load(checkpoint_name, map_location='cpu')
+    try:
+        sd = torch.load(checkpoint_name, map_location='cpu')
+    except ModuleNotFoundError:
+        # For backward compatibility.
+        print_rank_0(' > deserializing using the old code structure ...')
+        import sys
+        sys.modules['fp16.loss_scaler'] = sys.modules[
+            'megatron.fp16.loss_scaler']
+        sd = torch.load(checkpoint_name, map_location='cpu')
+        sys.modules.pop('fp16.loss_scaler', None)
+    except:
+        print_rank_0('could not load the checkpoint')
+        exit()
 
     # Iterations.
     if args.finetune or release:
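For readers unfamiliar with the trick in the new except branch, the sketch below shows it in isolation. It is a minimal, hedged example rather than the repository's code: the checkpoint path is hypothetical, and it assumes the megatron package (including megatron.fp16.loss_scaler) is importable. torch.load unpickles objects using the module paths recorded at save time, so checkpoints written before the fp16 code moved under the megatron package still reference 'fp16.loss_scaler' and raise ModuleNotFoundError; registering the old name as an alias in sys.modules lets the unpickler resolve those stored references.

# Minimal sketch of the backward-compatibility shim above (assumptions:
# the megatron package is importable; the checkpoint path is hypothetical).
import sys

import torch

import megatron.fp16.loss_scaler  # current location of the moved module

checkpoint_name = '/path/to/old_checkpoint.pt'  # hypothetical path

try:
    sd = torch.load(checkpoint_name, map_location='cpu')
except ModuleNotFoundError:
    # Old checkpoints were pickled when the loss scaler lived at
    # 'fp16.loss_scaler'; alias that name to the module's new home so the
    # unpickler can resolve the references stored in the pickle.
    sys.modules['fp16.loss_scaler'] = sys.modules['megatron.fp16.loss_scaler']
    sd = torch.load(checkpoint_name, map_location='cpu')
    # Drop the alias so the temporary mapping does not leak elsewhere.
    sys.modules.pop('fp16.loss_scaler', None)

Keeping the alias scoped to the except branch means checkpoints saved after the rename, which already record the megatron.fp16.loss_scaler path, load on the first attempt and never touch the shim.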