"git@developer.sourcefind.cn:change/sglang.git" did not exist on "1a6e97577acb017fa9c25daf8a533969e941aa09"
Commit d80ad54f authored by Sujit Verma, committed by Facebook GitHub Bot

Added option to save checkpoints using Path Manager.

Summary: Added option to save checkpoints using Path Manager.

Reviewed By: hudeven

Differential Revision: D17392754

fbshipit-source-id: 4b8e556ef8455a1548e5a083d779ed809cd785be
parent 02b74c58
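The hunks below all apply the same fallback pattern: try to route checkpoint I/O through the internal Path Manager (fairseq.fb_pathmgr) and fall back to plain local-file I/O when that module is not importable, e.g. in the open-source build. A minimal sketch of the pattern, not part of the commit; the helper name load_state_on_cpu is hypothetical:

import torch
from torch.serialization import default_restore_location


def load_state_on_cpu(path):
    # Prefer the Facebook-internal Path Manager when it is available.
    try:
        from fairseq.fb_pathmgr import fb_pathmgr
        with fb_pathmgr.open(path, "rb") as f:
            # Path Manager resolves the path (local or non-local backend)
            # and hands torch.load an open file object.
            return torch.load(
                f, map_location=lambda s, l: default_restore_location(s, "cpu")
            )
    except (ModuleNotFoundError, ImportError):
        # Module not present (open-source build): read the local file directly.
        return torch.load(
            path, map_location=lambda s, l: default_restore_location(s, "cpu")
        )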
@@ -65,6 +65,10 @@ def save_checkpoint(args, trainer, epoch_itr, val_loss):
     if len(checkpoints) > 0:
         trainer.save_checkpoint(checkpoints[0], extra_state)
         for cp in checkpoints[1:]:
-            shutil.copyfile(checkpoints[0], cp)
+            try:
+                from fairseq.fb_pathmgr import fb_pathmgr
+                fb_pathmgr.copy(checkpoints[0], cp, True)
+            except (ModuleNotFoundError, ImportError):
+                shutil.copyfile(checkpoints[0], cp)
 
         write_timer.stop()
@@ -132,6 +136,14 @@ def load_checkpoint(args, trainer, data_selector=None):
 
 def load_checkpoint_to_cpu(path, arg_overrides=None):
     """Loads a checkpoint to CPU (with upgrading for backward compatibility)."""
-    state = torch.load(
-        path, map_location=lambda s, l: default_restore_location(s, 'cpu'),
-    )
+    try:
+        from fairseq.fb_pathmgr import fb_pathmgr
+        with fb_pathmgr.open(path, "rb") as f:
+            state = torch.load(
+                f, map_location=lambda s, l: default_restore_location(s, 'cpu'),
+            )
+    except (ModuleNotFoundError, ImportError):
+        # if path manager not found, continue with local file.
+        state = torch.load(
+            path, map_location=lambda s, l: default_restore_location(s, 'cpu'),
+        )
@@ -244,6 +256,13 @@ def save_state(
         state_dict['criterion'] = criterion.state_dict()
     if not args.no_save_optimizer_state:
         state_dict['last_optimizer_state'] = convert_state_dict_type(optimizer.state_dict())
-    torch_persistent_save(state_dict, filename)
+
+    try:
+        from fairseq.fb_pathmgr import fb_pathmgr
+        with fb_pathmgr.open(filename, "wb") as f:
+            torch_persistent_save(state_dict, f)
+    except (ModuleNotFoundError, ImportError):
+        # if path manager not found, continue with local file.
+        torch_persistent_save(state_dict, filename)
......
@@ -170,7 +170,13 @@ class Trainer(object):
         """Load all training state from a checkpoint file."""
         extra_state, self._optim_history, last_optim_state = None, [], None
-        if os.path.exists(filename):
+        try:
+            from fairseq.fb_pathmgr import fb_pathmgr
+            bexists = fb_pathmgr.isfile(filename)
+        except Exception:
+            bexists = os.path.exists(filename)
+
+        if bexists:
             state = checkpoint_utils.load_checkpoint_to_cpu(filename)
             # load model parameters
......
@@ -19,10 +19,21 @@ from fairseq.data import iterators
 from fairseq.trainer import Trainer
 from fairseq.meters import AverageMeter, StopwatchMeter
 
+fb_pathmgr_registerd = False
+
 
 def main(args, init_distributed=False):
     utils.import_user_module(args)
 
+    try:
+        from fairseq.fb_pathmgr import fb_pathmgr
+        global fb_pathmgr_registerd
+        if not fb_pathmgr_registerd:
+            fb_pathmgr.register()
+            fb_pathmgr_registerd = True
+    except (ModuleNotFoundError, ImportError):
+        pass
+
     assert args.max_tokens is not None or args.max_sentences is not None, \
         'Must specify batch size either with --max-tokens or --max-sentences'
......
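Two notes on the pattern: the try/except import guard is repeated at each call site so the open-source build, which does not ship fairseq.fb_pathmgr, keeps working unchanged while internal jobs get Path Manager routing; and fb_pathmgr.register() in train.py is guarded by the module-level fb_pathmgr_registerd flag, so registration runs at most once per process even if main() is entered more than once. Callers are unaffected either way; a hypothetical usage sketch (path and keys are illustrative only):

from fairseq import checkpoint_utils

# The caller still passes a plain path string; whether the bytes come from the
# local filesystem or a Path Manager backend is decided inside the helper.
state = checkpoint_utils.load_checkpoint_to_cpu("checkpoints/checkpoint_last.pt")
print(sorted(state.keys()))  # e.g. 'args', 'extra_state', 'model', 'optimizer_history'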