"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "963db81a5afc5c31edafa0c9fb0390956a78cf2a"
Unverified commit d14af22c authored by Wang, Yi, committed by GitHub

add DDP HPO support for optuna (#19002)



Only the main process runs the Optuna HPO loop; it passes the sampled training arguments to the other processes (a usage sketch follows below).
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
parent 00fc9217
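For context, a minimal usage sketch (not part of this commit): every rank runs the same script and calls trainer.hyperparameter_search; with this change, rank 0 drives the Optuna study while the other ranks loop over the trials and receive the sampled TrainingArguments. The toy dataset, model checkpoint name, output directory, and the torchrun launch command mentioned in the comments are illustrative assumptions, not taken from the commit.

# Sketch only: run with e.g. `torchrun --nproc_per_node=2 hpo_sketch.py` (assumed launcher).
import torch
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments


class ToyDataset(Dataset):
    """Tiny random dataset so the sketch is self-contained (placeholder data)."""

    def __len__(self):
        return 64

    def __getitem__(self, idx):
        return {
            "input_ids": torch.randint(0, 1000, (16,)),
            "attention_mask": torch.ones(16, dtype=torch.long),
            "labels": torch.tensor(idx % 2),
        }


def model_init():
    # A fresh model is built for every trial.
    return AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")


def hp_space(trial):
    # Search space; only the main process samples from Optuna.
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical(
            "per_device_train_batch_size", [8, 16]
        ),
    }


trainer = Trainer(
    model_init=model_init,
    args=TrainingArguments(output_dir="hpo_out", num_train_epochs=1),
    train_dataset=ToyDataset(),
    eval_dataset=ToyDataset(),
)

# Every rank calls hyperparameter_search. With this patch, rank 0 runs the Optuna
# study and broadcasts the sampled TrainingArguments; the other ranks loop over
# n_trials, receive the arguments, and just train. Only rank 0 gets a BestRun back.
best_run = trainer.hyperparameter_search(
    direction="minimize",
    backend="optuna",
    hp_space=hp_space,
    n_trials=4,
)
if trainer.args.process_index == 0:
    print(best_run)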
@@ -159,26 +159,49 @@ def default_hp_search_backend():
 def run_hp_search_optuna(trainer, n_trials: int, direction: str, **kwargs) -> BestRun:
     import optuna
 
-    def _objective(trial, checkpoint_dir=None):
-        checkpoint = None
-        if checkpoint_dir:
-            for subdir in os.listdir(checkpoint_dir):
-                if subdir.startswith(PREFIX_CHECKPOINT_DIR):
-                    checkpoint = os.path.join(checkpoint_dir, subdir)
-        trainer.objective = None
-        trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
-        # If there hasn't been any evaluation during the training loop.
-        if getattr(trainer, "objective", None) is None:
-            metrics = trainer.evaluate()
-            trainer.objective = trainer.compute_objective(metrics)
-        return trainer.objective
-
-    timeout = kwargs.pop("timeout", None)
-    n_jobs = kwargs.pop("n_jobs", 1)
-    study = optuna.create_study(direction=direction, **kwargs)
-    study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs)
-    best_trial = study.best_trial
-    return BestRun(str(best_trial.number), best_trial.value, best_trial.params)
+    if trainer.args.process_index == 0:
+
+        def _objective(trial, checkpoint_dir=None):
+            checkpoint = None
+            if checkpoint_dir:
+                for subdir in os.listdir(checkpoint_dir):
+                    if subdir.startswith(PREFIX_CHECKPOINT_DIR):
+                        checkpoint = os.path.join(checkpoint_dir, subdir)
+            trainer.objective = None
+            trainer._hp_search_setup(trial)
+            if trainer.args.world_size > 1:
+                if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
+                    raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.")
+                torch.distributed.broadcast_object_list(pickle.dumps(trainer.args), src=0)
+            trainer.train(resume_from_checkpoint=checkpoint)
+            # If there hasn't been any evaluation during the training loop.
+            if getattr(trainer, "objective", None) is None:
+                metrics = trainer.evaluate()
+                trainer.objective = trainer.compute_objective(metrics)
+            return trainer.objective
+
+        timeout = kwargs.pop("timeout", None)
+        n_jobs = kwargs.pop("n_jobs", 1)
+        study = optuna.create_study(direction=direction, **kwargs)
+        study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs)
+        best_trial = study.best_trial
+        return BestRun(str(best_trial.number), best_trial.value, best_trial.params)
+    else:
+        for i in range(n_trials):
+            trainer.objective = None
+            args_main_rank = list(pickle.dumps(trainer.args))
+            if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
+                raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.")
+            torch.distributed.broadcast_object_list(args_main_rank, src=0)
+            local_rank = trainer.args.local_rank  # backup the local_rank info
+            trainer.args = pickle.loads(bytes(args_main_rank))
+            trainer.args.local_rank = local_rank
+            trainer.train(resume_from_checkpoint=None)
+            # If there hasn't been any evaluation during the training loop.
+            if getattr(trainer, "objective", None) is None:
+                metrics = trainer.evaluate()
+                trainer.objective = trainer.compute_objective(metrics)
+        return None
 
 
 def run_hp_search_ray(trainer, n_trials: int, direction: str, **kwargs) -> BestRun:
...
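The cross-rank handoff in the hunk above relies on torch.distributed.broadcast_object_list: rank 0 broadcasts the pickled TrainingArguments, and the other ranks rebuild them with pickle.loads. The standalone sketch below shows the same primitive in isolation; it simplifies to a single-element object list, a hypothetical payload, and the gloo backend, so it illustrates the pattern rather than reproducing the Trainer code.

# Sketch only: run with e.g. `torchrun --nproc_per_node=2 broadcast_sketch.py` (assumed launcher).
import pickle

import torch.distributed as dist


def main():
    dist.init_process_group(backend="gloo")  # gloo keeps the sketch CPU-only
    rank = dist.get_rank()

    payload = {"learning_rate": 3e-5} if rank == 0 else None  # hypothetical "args"
    obj_list = [pickle.dumps(payload) if rank == 0 else None]

    # broadcast_object_list overwrites obj_list in place on every non-source rank.
    dist.broadcast_object_list(obj_list, src=0)

    if rank != 0:
        payload = pickle.loads(obj_list[0])
    print(f"rank {rank} received {payload}")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()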
@@ -1210,7 +1210,7 @@ class Trainer:
                 value = type(old_attr)(value)
             setattr(self.args, key, value)
         if self.hp_search_backend == HPSearchBackend.OPTUNA:
-            logger.info("Trial:", trial.params)
+            logger.info(f"Trial: {trial.params}")
         if self.hp_search_backend == HPSearchBackend.SIGOPT:
             logger.info(f"SigOpt Assignments: {trial.assignments}")
         if self.hp_search_backend == HPSearchBackend.WANDB:
...