Unverified commit 7b4e3b5b authored by Yih-Dar, committed by GitHub

Compute `dropout_probability` only in training mode (SpeechT5) (#24498)



fix
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent c9fd4985
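For context, below is a minimal, self-contained sketch of the LayerDrop pattern this commit switches to. The `LayerDropEncoder` class, the layer sizes, and the omission of the DeepSpeed ZeRO-3 branch are illustrative only, not the actual SpeechT5 code; the point is that `torch.rand` is now called only when `self.training` is True, so inference no longer draws a per-layer random number.

import torch
from torch import nn


class LayerDropEncoder(nn.Module):
    """Illustrative layer stack using LayerDrop (https://arxiv.org/abs/1909.11556)."""

    def __init__(self, num_layers: int = 4, hidden_size: int = 16, layerdrop: float = 0.1):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(hidden_size, hidden_size) for _ in range(num_layers))
        self.layerdrop = layerdrop

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        for layer in self.layers:
            # Draw a random number only while training; in eval mode every layer
            # runs and torch.rand is never called (the pre-fix code sampled
            # unconditionally and discarded the value at inference time).
            skip_the_layer = False
            if self.training:
                dropout_probability = torch.rand([])
                skip_the_layer = dropout_probability < self.layerdrop
            if skip_the_layer:
                continue
            hidden_states = layer(hidden_states)
        return hidden_states


model = LayerDropEncoder()
model.eval()
output = model(torch.randn(2, 16))  # deterministic in eval mode: no layers are skipped

Both hunks below apply the same restructuring; the only difference is the polarity of the `deepspeed_zero3_is_enabled` check in the encoder versus the decoder.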
@@ -1380,9 +1380,11 @@ class SpeechT5Encoder(SpeechT5PreTrainedModel):
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = torch.rand([])
+            skip_the_layer = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                skip_the_layer = dropout_probability < self.layerdrop
 
-            skip_the_layer = self.training and (dropout_probability < self.layerdrop)
             if not skip_the_layer or deepspeed_zero3_is_enabled:
                 # under deepspeed zero3 all gpus must run in sync
                 if self.gradient_checkpointing and self.training:
@@ -1705,9 +1707,10 @@ class SpeechT5Decoder(SpeechT5PreTrainedModel):
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = torch.rand([])
-
-            skip_the_layer = self.training and (dropout_probability < self.layerdrop)
+            skip_the_layer = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                skip_the_layer = dropout_probability < self.layerdrop
 
             if skip_the_layer and not deepspeed_zero3_is_enabled:
                 continue