Unverified Commit 49629e7b authored by Wang, Yi, committed by GitHub

fix HPO DDP GPU problem (#19168)


Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
parent 8d59385f
@@ -23,6 +23,7 @@ import pickle
 import shutil
 import sys
 import tempfile
+from dataclasses import asdict
 from pathlib import Path
 from typing import TYPE_CHECKING, Dict, Optional
@@ -195,9 +196,10 @@ def run_hp_search_optuna(trainer, n_trials: int, direction: str, **kwargs) -> BestRun:
             if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
                 raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.")
             torch.distributed.broadcast_object_list(args_main_rank, src=0)
-            local_rank = trainer.args.local_rank  # backup the local_rank info
-            trainer.args = pickle.loads(bytes(args_main_rank))
-            trainer.args.local_rank = local_rank
+            args = pickle.loads(bytes(args_main_rank))
+            for key, value in asdict(args).items():
+                if key != "local_rank":
+                    setattr(trainer.args, key, value)
             trainer.train(resume_from_checkpoint=None)
             # If there hasn't been any evaluation during the training loop.
             if getattr(trainer, "objective", None) is None:
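
For context, a minimal standalone sketch of the pattern this hunk implements: rank 0's trial `TrainingArguments` are serialized, broadcast to every worker with `torch.distributed.broadcast_object_list`, and copied field by field onto the worker's existing args, skipping `local_rank` so each process keeps its own GPU binding. The helper name `sync_args_from_rank0` and the single-element-list broadcast are illustrative assumptions; the patch itself broadcasts the byte list `args_main_rank`, which is prepared outside this hunk.

```python
# Illustrative sketch only -- not the exact Transformers code. It mirrors the
# broadcast-then-copy pattern of the hunk above and assumes an already
# initialized torch.distributed process group.
import pickle
from dataclasses import asdict

import torch.distributed as dist


def sync_args_from_rank0(trainer):
    """Hypothetical helper: copy rank 0's trial arguments onto this worker,
    keeping the worker's own local_rank so DDP device placement is unchanged."""
    # broadcast_object_list mutates the list in place and requires the same
    # list length on every rank, so a single-element list is the simplest form.
    payload = [pickle.dumps(trainer.args) if dist.get_rank() == 0 else None]
    dist.broadcast_object_list(payload, src=0)
    args = pickle.loads(payload[0])
    # Overwrite every field except local_rank, exactly as the patch does.
    for key, value in asdict(args).items():
        if key != "local_rank":
            setattr(trainer.args, key, value)
```
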
@@ -429,9 +431,10 @@ def run_hp_search_sigopt(trainer, n_trials: int, direction: str, **kwargs) -> BestRun:
             if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
                 raise RuntimeError("only support DDP Sigopt HPO for ParallelMode.DISTRIBUTED currently.")
             torch.distributed.broadcast_object_list(args_main_rank, src=0)
-            local_rank = trainer.args.local_rank  # backup the local_rank info
-            trainer.args = pickle.loads(bytes(args_main_rank))
-            trainer.args.local_rank = local_rank
+            args = pickle.loads(bytes(args_main_rank))
+            for key, value in asdict(args).items():
+                if key != "local_rank":
+                    setattr(trainer.args, key, value)
             trainer.train(resume_from_checkpoint=None)
             # If there hasn't been any evaluation during the training loop.
             if getattr(trainer, "objective", None) is None:
@@ -470,7 +473,6 @@ def run_hp_search_wandb(trainer, n_trials: int, direction: str, **kwargs) -> BestRun:
     sweep_config["name"] = name

     def _objective():
         run = wandb.run if wandb.run else wandb.init()
         trainer.state.trial_name = run.name
         run.config.update({"assignments": {}, "metric": metric})
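
And a hedged sketch of how a non-zero rank would consume the synced arguments per trial, mirroring the optuna/sigopt hunks above. `run_trial_on_worker` is an assumed name, and the evaluate/compute_objective fallback follows the `objective is None` guard shown in the diff but is otherwise an assumption.

```python
# Illustrative per-trial worker loop (assumed structure, not the library code).
def run_trial_on_worker(trainer):
    sync_args_from_rank0(trainer)               # helper sketched above
    trainer.train(resume_from_checkpoint=None)  # all ranks join the same DDP run
    # If no evaluation happened during training, compute the objective now,
    # mirroring the guard in the hunks above (the evaluate call here is an
    # assumption; the diff is truncated before that point).
    if getattr(trainer, "objective", None) is None:
        metrics = trainer.evaluate()
        trainer.objective = trainer.compute_objective(metrics)
```
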