"docs/vscode:/vscode.git/clone" did not exist on "83b38fbea8c2a6fbd9af0d9561db3afde2f4f4e2"
Unverified commit 850cf4af authored by Yih-Dar, committed by GitHub

Compute `dropout_probability` only in training mode (#24486)



* fix

* fix

* fix

* fix

* fix

* fix

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 9895670e
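The same transformation is applied in every modeling file below: the LayerDrop draw moves inside the `self.training` branch, so evaluation and inference no longer call `torch.rand` for a value that would be ignored anyway. A minimal, self-contained sketch of the pattern as it reads after the change (the toy module, layer sizes, and `layerdrop` value are illustrative, not taken from the diff):

```python
import torch
import torch.nn as nn


class ToyDecoderWithLayerDrop(nn.Module):
    """Minimal sketch of the post-commit LayerDrop pattern (decoder-style skip)."""

    def __init__(self, num_layers: int = 4, hidden: int = 8, layerdrop: float = 0.1):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(hidden, hidden) for _ in range(num_layers)])
        self.layerdrop = layerdrop

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        for layer in self.layers:
            # Before this commit: dropout_probability = torch.rand([]) ran
            # unconditionally; the value was only used when self.training was True.
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue  # skip this layer for the current step
            hidden_states = layer(hidden_states)
        return hidden_states


model = ToyDecoderWithLayerDrop()
model.eval()
with torch.no_grad():
    out = model(torch.randn(2, 8))  # eval mode: torch.rand is never called, RNG state untouched
```

Decoders skip a dropped layer with `continue`; the encoders touched by this commit need a flag instead, as shown in the sketch after the first encoder hunk.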
@@ -807,8 +807,13 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
                 layer_outputs = (None, None)
             else:
                 if self.gradient_checkpointing and self.training:
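The encoder hunks cannot simply `continue` past a dropped layer, because each iteration still appends the layer's outputs to the returned attention and hidden-state tuples; instead they set a `to_drop` flag inside the training branch and record `(None, None)` placeholders. A short sketch of that variant, under the same illustrative assumptions (the `run_encoder_layers` helper and its toy layers are invented for the example, not from the diff):

```python
import torch
import torch.nn as nn


def run_encoder_layers(layers, hidden_states, layerdrop, training):
    """Encoder-style LayerDrop: skipped layers still record placeholder outputs."""
    all_attentions = ()
    for layer in layers:
        to_drop = False
        if training:
            # the RNG is touched only on the training path, as in this commit
            if torch.rand([]) < layerdrop:  # skip the layer
                to_drop = True
        if to_drop:
            layer_outputs = (None, None)  # keep tuple positions aligned with layer indices
        else:
            layer_outputs = (layer(hidden_states), None)  # toy layer returns no attention
            hidden_states = layer_outputs[0]
        all_attentions = all_attentions + (layer_outputs[1],)
    return hidden_states, all_attentions


layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(4)])
out, attns = run_encoder_layers(layers, torch.randn(2, 8), layerdrop=0.1, training=True)
```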
@@ -1052,9 +1057,10 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
 
             past_key_value = past_key_values[idx] if past_key_values is not None else None
@@ -661,9 +661,10 @@ class Speech2Text2Decoder(Speech2Text2PreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
 
             past_key_value = past_key_values[idx] if past_key_values is not None else None
@@ -919,8 +919,13 @@ class TableTransformerEncoder(TableTransformerPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
                 layer_outputs = (None, None)
             else:
                 # we add position_embeddings as extra input to the encoder_layer
@@ -1061,9 +1066,10 @@ class TableTransformerDecoder(TableTransformerPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
 
             if self.gradient_checkpointing and self.training:
@@ -936,8 +936,13 @@ class TimeSeriesTransformerEncoder(TimeSeriesTransformerPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
                 layer_outputs = (None, None)
             else:
                 if self.gradient_checkpointing and self.training:
@@ -1150,9 +1155,10 @@ class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
 
             past_key_value = past_key_values[idx] if past_key_values is not None else None
@@ -693,9 +693,10 @@ class TrOCRDecoder(TrOCRPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
 
             past_key_value = past_key_values[idx] if past_key_values is not None else None
@@ -915,8 +915,13 @@ class WhisperEncoder(WhisperPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
                 layer_outputs = (None, None)
             else:
                 if self.gradient_checkpointing and self.training:
@@ -1144,9 +1149,10 @@ class WhisperDecoder(WhisperPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
 
             past_key_value = past_key_values[idx] if past_key_values is not None else None
@@ -667,9 +667,10 @@ class XGLMModel(XGLMPreTrainedModel):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            dropout_probability = torch.rand([])
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
 
             past_key_value = past_key_values[idx] if past_key_values is not None else None
@@ -21,7 +21,7 @@ import unittest
 from huggingface_hub import hf_hub_download
 
 from transformers import is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
+from transformers.testing_utils import is_flaky, require_torch, slow, torch_device
 
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
@@ -380,6 +380,10 @@ class AutoformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
             [self.model_tester.num_attention_heads, encoder_seq_length, dim],
         )
 
+    @is_flaky()
+    def test_retain_grad_hidden_states_attentions(self):
+        super().test_retain_grad_hidden_states_attentions()
+
 
 def prepare_batch(filename="train-batch.pt"):
     file = hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset")
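The test-side change imports `is_flaky` and applies it to Autoformer's `test_retain_grad_hidden_states_attentions`, so occasional failures are retried rather than reported immediately. A small usage sketch, assuming `transformers` is installed (the test class and test body are illustrative, not from the diff):

```python
import random
import unittest

from transformers.testing_utils import is_flaky


class ExampleFlakyTest(unittest.TestCase):
    @is_flaky()  # retried a few times before a failure is reported
    def test_sometimes_unlucky(self):
        # Illustrative only: a check whose outcome depends on a random draw,
        # similar in spirit to tests affected by LayerDrop randomness.
        self.assertLess(random.random(), 0.95)


if __name__ == "__main__":
    unittest.main()
```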