Unverified commit 7b4e3b5b authored by Yih-Dar, committed by GitHub

Compute `dropout_probability` only in training mode (SpeechT5) (#24498)



fix
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent c9fd4985
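For context, below is a minimal, self-contained sketch of the LayerDrop pattern this commit switches to. The `LayerDropEncoder` class, the layer sizes, and the omission of the DeepSpeed ZeRO-3 branch are illustrative only, not the actual SpeechT5 code; the point is that `torch.rand` is now called only when `self.training` is True, so inference no longer draws a per-layer random number.

import torch
from torch import nn


class LayerDropEncoder(nn.Module):
    """Illustrative layer stack using LayerDrop (https://arxiv.org/abs/1909.11556)."""

    def __init__(self, num_layers: int = 4, hidden_size: int = 16, layerdrop: float = 0.1):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(hidden_size, hidden_size) for _ in range(num_layers))
        self.layerdrop = layerdrop

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        for layer in self.layers:
            # Draw a random number only while training; in eval mode every layer
            # runs and torch.rand is never called (the pre-fix code sampled
            # unconditionally and discarded the value at inference time).
            skip_the_layer = False
            if self.training:
                dropout_probability = torch.rand([])
                skip_the_layer = dropout_probability < self.layerdrop
            if skip_the_layer:
                continue
            hidden_states = layer(hidden_states)
        return hidden_states


model = LayerDropEncoder()
model.eval()
output = model(torch.randn(2, 16))  # deterministic in eval mode: no layers are skipped

Both hunks below apply the same restructuring; the only difference is the polarity of the `deepspeed_zero3_is_enabled` check in the encoder versus the decoder.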
@@ -1380,9 +1380,11 @@ class SpeechT5Encoder(SpeechT5PreTrainedModel):
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = torch.rand([])
+            skip_the_layer = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                skip_the_layer = dropout_probability < self.layerdrop
 
-            skip_the_layer = self.training and (dropout_probability < self.layerdrop)
             if not skip_the_layer or deepspeed_zero3_is_enabled:
                 # under deepspeed zero3 all gpus must run in sync
                 if self.gradient_checkpointing and self.training:
@@ -1705,9 +1707,10 @@ class SpeechT5Decoder(SpeechT5PreTrainedModel):
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = torch.rand([])
-
-            skip_the_layer = self.training and (dropout_probability < self.layerdrop)
+            skip_the_layer = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                skip_the_layer = dropout_probability < self.layerdrop
 
             if skip_the_layer and not deepspeed_zero3_is_enabled:
                 continue