Unverified Commit 6227078d authored by Wang, Yi, committed by GitHub

HPO: keep the original logic if there's only one process, pass the trial to trainer (#19096)



Open questions that still need a solution:
    * If we need to use the trial in model_init, how do we do it on non-main ranks? Sync the model with rank 0 in the application?
    * How do we use the Optuna pruning feature with DDP? If pruning is decided on rank 0, how do the other ranks learn about it? (One option is sketched below.)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
parent 3b0cecb6
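Both open questions reduce to sharing rank-0 state with the other ranks, which is what the patch already does for the training arguments by broadcasting a pickled `trainer.args`. As a minimal sketch of the same idea applied to trial state, the hypothetical helper below (not part of this commit) broadcasts any picklable object, such as the sampled hyperparameters or a prune decision, from rank 0 using the same `torch.distributed` primitive the patch relies on:

```python
# Hypothetical helper, not part of this commit: share rank-0 trial state
# (sampled hyperparameters, a prune decision, ...) with the other DDP ranks.
# Assumes torch.distributed.init_process_group() has already been called.
from typing import Any

import torch.distributed as dist


def broadcast_from_rank0(obj: Any) -> Any:
    # broadcast_object_list requires a list of the same length on every
    # rank; entries on non-source ranks are overwritten in place.
    holder = [obj if dist.get_rank() == 0 else None]
    dist.broadcast_object_list(holder, src=0)
    return holder[0]
```

For pruning, rank 0 could call `trial.should_prune()`, broadcast the boolean with this helper, and have every rank raise together, so the ranks never diverge.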
@@ -168,12 +168,14 @@ def run_hp_search_optuna(trainer, n_trials: int, direction: str, **kwargs) -> Be
             if subdir.startswith(PREFIX_CHECKPOINT_DIR):
                 checkpoint = os.path.join(checkpoint_dir, subdir)
         trainer.objective = None
-        trainer._hp_search_setup(trial)
         if trainer.args.world_size > 1:
             if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
                 raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.")
+            trainer._hp_search_setup(trial)
             torch.distributed.broadcast_object_list(pickle.dumps(trainer.args), src=0)
             trainer.train(resume_from_checkpoint=checkpoint)
+        else:
+            trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
         # If there hasn't been any evaluation during the training loop.
         if getattr(trainer, "objective", None) is None:
             metrics = trainer.evaluate()
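For context on the new single-process `else` branch: passing `trial` into `trainer.train()` is what lets the `Trainer` report intermediate metrics back to Optuna so pruning can fire. The toy study below shows the plain Optuna report/prune cycle that the trial object enables; it is illustrative only and does not use the `Trainer` integration:

```python
# Toy single-process Optuna study showing the report/prune cycle.
# Values are illustrative; this is not the Trainer integration itself.
import optuna


def objective(trial):
    lr = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    loss = float("inf")
    for step in range(100):
        loss = lr * 100.0 / (step + 1)  # stand-in for an eval metric
        trial.report(loss, step)
        if trial.should_prune():  # the per-rank question raised above
            raise optuna.TrialPruned()
    return loss


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)
```

Under DDP only rank 0 runs this loop in the current design, which is exactly why the commit message asks how the other ranks would learn about a prune.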
@@ -362,12 +364,14 @@ def run_hp_search_sigopt(trainer, n_trials: int, direction: str, **kwargs) -> Be
             for run in experiment.loop():
                 with run:
                     trainer.objective = None
-                    trainer._hp_search_setup(run.run)
                     if trainer.args.world_size > 1:
                         if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
                             raise RuntimeError("only support DDP Sigopt HPO for ParallelMode.DISTRIBUTED currently.")
+                        trainer._hp_search_setup(run.run)
                         torch.distributed.broadcast_object_list(pickle.dumps(trainer.args), src=0)
                         trainer.train(resume_from_checkpoint=None)
+                    else:
+                        trainer.train(resume_from_checkpoint=None, trial=run.run)
                     # If there hasn't been any evaluation during the training loop.
                     if getattr(trainer, "objective", None) is None:
                         metrics = trainer.evaluate()
@@ -397,12 +401,14 @@ def run_hp_search_sigopt(trainer, n_trials: int, direction: str, **kwargs) -> Be
             while experiment.progress.observation_count < experiment.observation_budget:
                 suggestion = conn.experiments(experiment.id).suggestions().create()
                 trainer.objective = None
-                trainer._hp_search_setup(suggestion)
                 if trainer.args.world_size > 1:
                     if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
                         raise RuntimeError("only support DDP Sigopt HPO for ParallelMode.DISTRIBUTED currently.")
+                    trainer._hp_search_setup(suggestion)
                     torch.distributed.broadcast_object_list(pickle.dumps(trainer.args), src=0)
                     trainer.train(resume_from_checkpoint=None)
+                else:
+                    trainer.train(resume_from_checkpoint=None, trial=suggestion)
                 # If there hasn't been any evaluation during the training loop.
                 if getattr(trainer, "objective", None) is None:
                     metrics = trainer.evaluate()
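Both the Optuna and SigOpt paths above are reached through the same public entry point, `Trainer.hyperparameter_search`. A minimal sketch of driving the Optuna backend follows; the search space and trial count are illustrative, and `trainer` is assumed to be an existing `transformers.Trainer` built with `model_init` so each trial starts from a fresh model:

```python
# Illustrative call into the entry point that dispatches to
# run_hp_search_optuna. `trainer` is assumed to already exist and to have
# been constructed with model_init=..., so each trial rebuilds the model.
def hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical(
            "per_device_train_batch_size", [8, 16, 32]
        ),
    }


best_run = trainer.hyperparameter_search(
    hp_space=hp_space,
    backend="optuna",
    n_trials=10,
    direction="minimize",
)
print(best_run.hyperparameters)
```

With a single process this takes the new `else` branches above; with `world_size > 1` it takes the broadcast path.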