add DDP HPO support for sigopt (#18931)

Only the main process (process index 0) runs the SigOpt HPO; it broadcasts its training arguments to the other processes. Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

add DDP HPO support for sigopt (#18931)
Only the main process (process index 0) runs the SigOpt HPO; it broadcasts its training arguments to the other processes. Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
a86acb75 · Wang, Yi · GitHub · 9faa9f9d · a86acb75
Unverified Commit a86acb75 authored Sep 12, 2022 by Wang, Yi Committed by GitHub Sep 12, 2022
Show whitespace changes
Inline Side-by-side

Showing with 79 additions and 48 deletions

src/transformers/integrations.py src/transformers/integrations.py +79 -48

No files found.
--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -19,6 +19,7 @@ import importlib.util
 import json
 import numbers
 import os
+import pickle
 import shutil
 import sys
 import tempfile
@@ -28,11 +29,13 @@ from typing import TYPE_CHECKING, Dict, Optional
 import numpy as np
 from . import __version__ as version
-from .utils import flatten_dict, is_datasets_available, logging
+from .utils import flatten_dict, is_datasets_available, is_torch_available, logging
 logger = logging.get_logger(__name__)
+if is_torch_available():
+    import torch
 # comet_ml requires to be imported before any ML frameworks
 _has_comet = importlib.util.find_spec("comet_ml") is not None and os.getenv("COMET_MODE", "").upper() != "DISABLED"
@@ -55,6 +58,7 @@ if TYPE_CHECKING and _has_neptune:
 from .trainer_callback import ProgressCallback, TrainerCallback  # noqa: E402
 from .trainer_utils import PREFIX_CHECKPOINT_DIR, BestRun, IntervalStrategy  # noqa: E402
+from .training_args import ParallelMode  # noqa: E402
 from .utils import ENV_VARS_TRUE_VALUES, is_torch_tpu_available  # noqa: E402
@@ -317,6 +321,7 @@ def run_hp_search_sigopt(trainer, n_trials: int, direction: str, **kwargs) -> Be
    import sigopt
    from transformers.utils.versions import importlib_metadata
+    if trainer.args.process_index == 0:
        if importlib_metadata.version("sigopt") >= "8.0.0":
            sigopt.set_project("huggingface")
@@ -334,7 +339,12 @@ def run_hp_search_sigopt(trainer, n_trials: int, direction: str, **kwargs) -> Be
            for run in experiment.loop():
                with run:
                    trainer.objective = None
-                trainer.train(resume_from_checkpoint=None, trial=run.run)
+                    trainer._hp_search_setup(run.run)
+                    if trainer.args.world_size > 1:
+                        if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
+                            raise RuntimeError("only support DDP Sigopt HPO for ParallelMode.DISTRIBUTED currently.")
+                        torch.distributed.broadcast_object_list(pickle.dumps(trainer.args), src=0)
+                    trainer.train(resume_from_checkpoint=None)
                    # If there hasn't been any evaluation during the training loop.
                    if getattr(trainer, "objective", None) is None:
                        metrics = trainer.evaluate()
@@ -364,7 +374,12 @@ def run_hp_search_sigopt(trainer, n_trials: int, direction: str, **kwargs) -> Be
            while experiment.progress.observation_count < experiment.observation_budget:
                suggestion = conn.experiments(experiment.id).suggestions().create()
                trainer.objective = None
-            trainer.train(resume_from_checkpoint=None, trial=suggestion)
+                trainer._hp_search_setup(suggestion)
+                if trainer.args.world_size > 1:
+                    if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
+                        raise RuntimeError("only support DDP Sigopt HPO for ParallelMode.DISTRIBUTED currently.")
+                    torch.distributed.broadcast_object_list(pickle.dumps(trainer.args), src=0)
+                trainer.train(resume_from_checkpoint=None)
                # If there hasn't been any evaluation during the training loop.
                if getattr(trainer, "objective", None) is None:
                    metrics = trainer.evaluate()
@@ -378,6 +393,22 @@ def run_hp_search_sigopt(trainer, n_trials: int, direction: str, **kwargs) -> Be
            best = list(conn.experiments(experiment.id).best_assignments().fetch().iterate_pages())[0]
            best_run = BestRun(best.id, best.value, best.assignments)
        return best_run
+    else:
+        for i in range(n_trials):
+            trainer.objective = None
+            args_main_rank = list(pickle.dumps(trainer.args))
+            if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
+                raise RuntimeError("only support DDP Sigopt HPO for ParallelMode.DISTRIBUTED currently.")
+            torch.distributed.broadcast_object_list(args_main_rank, src=0)
+            local_rank = trainer.args.local_rank  # backup the local_rank info
+            trainer.args = pickle.loads(bytes(args_main_rank))
+            trainer.args.local_rank = local_rank
+            trainer.train(resume_from_checkpoint=None)
+            # If there hasn't been any evaluation during the training loop.
+            if getattr(trainer, "objective", None) is None:
+                metrics = trainer.evaluate()
+                trainer.objective = trainer.compute_objective(metrics)
+        return None
 def run_hp_search_wandb(trainer, n_trials: int, direction: str, **kwargs) -> BestRun: