Unverified Commit 7fc686ef authored by Mansi Mane, committed by GitHub

SageMaker Model Parallel TensorBoard writing fix (#10403)

* Added tb fix

* Removed local rank condition

* Updated reference to args
parent 83d2d55c
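The "Updated reference to args" change moves the flag computation after `super().__init__()` so it reads `self.args` rather than the raw `args` argument. Below is a minimal sketch of that pattern; the `Fake*` classes are hypothetical stand-ins, not library code, and it assumes the base trainer fills in default arguments when `args` is `None`:

```python
from dataclasses import dataclass


@dataclass
class FakeTrainingArguments:
    # Hypothetical stand-in for the SageMaker TrainingArguments.
    mp_parameters: str = ""


class FakeTrainer:
    """Stand-in for Trainer: fills in default args when none are passed."""

    def __init__(self, args=None):
        self.args = args if args is not None else FakeTrainingArguments()


class FakeSageMakerTrainer(FakeTrainer):
    def __init__(self, args=None):
        super().__init__(args=args)
        # Reading self.args after super().__init__() is safe even when args was None.
        self.is_model_parallel_enabled = self.args.mp_parameters != ""


print(FakeSageMakerTrainer().is_model_parallel_enabled)  # False, no AttributeError
print(FakeSageMakerTrainer(FakeTrainingArguments("partitions=2")).is_model_parallel_enabled)  # True
```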
@@ -71,11 +71,21 @@ if is_smdistributed_available():
 class SageMakerTrainer(Trainer):
     def __init__(self, args=None, **kwargs):
-        self.is_model_parallel_enabled = is_smdistributed_available() and args.mp_parameters != ""
         super().__init__(args=args, **kwargs)
+        self.is_model_parallel_enabled = is_smdistributed_available() and self.args.mp_parameters != ""
         if self.is_model_parallel_enabled and self.args.gradient_accumulation_steps != 1:
             raise ValueError("Gradient accumulation is not supported when model parallel is enabled.")

+    def is_world_process_zero(self) -> bool:
+        """
+        Whether or not this process is the global main process (when training in a distributed fashion on several
+        machines, this is only going to be :obj:`True` for one process).
+        """
+        if self.is_model_parallel_enabled:
+            return smp.rank() == 0 and smp.local_rank() == 0 and smp.mp_rank() == 0 and smp.dp_rank() == 0
+        else:
+            return super().is_world_process_zero()
+
     def _get_train_sampler(self):
         if self.is_model_parallel_enabled:
             if self.args.group_by_length:
...
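The TensorBoard fix follows from the new `is_world_process_zero()` override: logging code that gates writer creation on this check now creates a writer on exactly one rank when model parallelism is enabled. A minimal usage sketch of that gating (the helper name and `log_dir` parameter are hypothetical, not the library's callback code):

```python
from torch.utils.tensorboard import SummaryWriter


def maybe_create_tb_writer(trainer, log_dir="runs"):
    # `trainer` is assumed to be a SageMakerTrainer; with the override above,
    # is_world_process_zero() returns True on exactly one rank under SageMaker
    # model parallelism, so only that rank writes TensorBoard event files.
    if trainer.is_world_process_zero():
        return SummaryWriter(log_dir=log_dir)
    return None
```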